Merge branch 'develop' into nightly.spacy.io

commit 921d188bce
Ines Montani, 2020-09-24 14:16:56 +02:00
82 changed files with 867 additions and 570 deletions

View File

@ -8,12 +8,12 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and vectors, and
currently supports tokenization for **59+ languages**. It features
currently supports tokenization for **60+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready training system and easy model packaging, deployment and
workflow management.
spaCy is commercial open-source software, released under the MIT license.
💫 **Version 2.3 out now!**
💫 **Version 3.0 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license.
## 📖 Documentation
| Documentation | |
| --------------- | -------------------------------------------------------------- |
| [spaCy 101] | New to spaCy? Here's everything you need to know! |
| [Usage Guides] | How to use spaCy and its features. |
| [New in v3.0] | New features, backwards incompatibilities and migration guide. |
| [API Reference] | The detailed reference for spaCy's API. |
| [Models] | Download statistical language models for spaCy. |
| [Universe] | Libraries, extensions, demos, books and courses. |
| [Changelog] | Changes and version history. |
| [Contribute] | How to contribute to the spaCy project and code base. |
| Documentation | |
| ------------------- | -------------------------------------------------------------- |
| [spaCy 101] | New to spaCy? Here's everything you need to know! |
| [Usage Guides] | How to use spaCy and its features. |
| [New in v3.0] | New features, backwards incompatibilities and migration guide. |
| [Project Templates] | End-to-end workflows you can clone, modify and run. |
| [API Reference] | The detailed reference for spaCy's API. |
| [Models] | Download statistical language models for spaCy. |
| [Universe] | Libraries, extensions, demos, books and courses. |
| [Changelog] | Changes and version history. |
| [Contribute] | How to contribute to the spaCy project and code base. |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
@ -69,7 +71,7 @@ it.
## Features
- Support for **59+ languages**
- Support for **60+ languages**
- **Trained pipelines**
- Multi-task learning with pretrained **transformers** like BERT
- Pretrained **word vectors**

View File

@ -20,6 +20,7 @@ pytokenizations
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
typing_extensions>=3.7.4; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5

View File

@ -57,6 +57,7 @@ install_requires =
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
typing_extensions>=3.7.4; python_version < "3.8"
[options.entry_points]
console_scripts =

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a20"
__version__ = "3.0.0a23"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
ret = run_command(cmd, capture=True)
git_repo = _from_http_to_git(repo)
git_repo = _http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
if not missings:
@ -414,7 +414,7 @@ def get_git_version(
return (int(version[0]), int(version[1]))
def _from_http_to_git(repo: str) -> str:
def _http_to_git(repo: str) -> str:
if repo.startswith("http://"):
repo = repo.replace(r"http://", r"https://")
if repo.startswith(r"https://"):

View File

@ -9,7 +9,7 @@ import sys
from ._util import app, Arg, Opt
from ..training import docs_to_json
from ..tokens import DocBin
from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
# Converters are matched by file extension except for ner/iob, which are
@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
# imported from /converters.
CONVERTERS = {
"conllubio": conllu2docs,
"conllu": conllu2docs,
"conll": conllu2docs,
"ner": conll_ner2docs,
"iob": iob2docs,
"json": json2docs,
"conllubio": conllu_to_docs,
"conllu": conllu_to_docs,
"conll": conllu_to_docs,
"ner": conll_ner_to_docs,
"iob": iob_to_docs,
"json": json_to_docs,
}
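For orientation: the renamed converter functions keep their previous call signatures, so programmatic use is a straightforward rename. A rough sketch (the input file name is hypothetical):

    from spacy.training.converters import conllu_to_docs

    # Convert raw CoNLL-U text into Doc objects, grouping 10 sentences per doc
    with open("train.conllu", encoding="utf8") as f:
        docs = conllu_to_docs(f.read(), n_sents=10)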

View File

@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE
from thinc.config import VARIABLE_RE, ConfigValidationError
import typer
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@ -51,7 +51,10 @@ def debug_config(
msg.divider("Config validation")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
nlp, _ = util.load_model_from_config(config)
nlp, resolved = util.load_model_from_config(config)
# Use the resolved config here in case the user has one function returning
# a dict of corpora etc.
check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
msg.good("Config is valid")
if show_vars:
variables = get_variables(config)
@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
value = util.dot_to_object(config, path)
result[variable] = repr(value)
return result
def check_section_refs(config: Config, fields: List[str]) -> None:
"""Validate fields in the config that refer to other sections or values
(e.g. in the corpora) and make sure that those references exist.
"""
errors = []
for field in fields:
# If the field doesn't exist in the config, we ignore it
try:
value = util.dot_to_object(config, field)
except KeyError:
continue
try:
util.dot_to_object(config, value)
except KeyError:
msg = f"not a valid section reference: {value}"
errors.append({"loc": field.split("."), "msg": msg})
if errors:
raise ConfigValidationError(config, errors)
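A minimal sketch of how the new check_section_refs helper behaves, mirroring the unit test added further down in this commit (the section names are made up):

    from thinc.api import Config
    from spacy.cli.debug_config import check_section_refs

    config = Config({"corpora": {"dev": {"path": "dev.spacy"}},
                     "training": {"dev_corpus": "corpora.dev"}})
    # "training.dev_corpus" points at an existing section, so this passes
    check_section_refs(config, ["training.dev_corpus"])
    # a dangling reference would raise ConfigValidationError instead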

View File

@ -128,7 +128,7 @@ def debug_model(
goldY = None
for e in range(3):
if tok2vec:
tok2vec.predict(X)
tok2vec.update([Example.from_dict(x, {}) for x in X])
Y, get_dX = model.begin_update(X)
if goldY is None:
goldY = _simulate_gold(Y)

View File

@ -36,7 +36,7 @@ def init_config_cli(
"""
Generate a starter config.cfg for training. Based on your requirements
specified via the CLI arguments, this command generates a config with the
optimal settings for you use case. This includes the choice of architecture,
optimal settings for your use case. This includes the choice of architecture,
pretrained weights and related hyperparameters.
DOCS: https://nightly.spacy.io/api/cli#init-config
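For context, the command this docstring describes is typically invoked along these lines (exact flags may differ slightly in this nightly):

    python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency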

View File

@ -27,19 +27,32 @@ def project_pull_cli(
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# TODO: We don't have tests for this :(. It would take a bit of mockery to
# set up. I guess see if it breaks first?
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
continue
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
yield url, output_path
commands = list(config.get("commands", []))
# We use a while loop here because we don't know how the commands
# will be ordered. A command might need dependencies from one that's later
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
yield url, output_path
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
if all(loc.exists() for loc in out_locs):
update_lockfile(project_dir, cmd)
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
if all(loc.exists() for loc in out_locs):
update_lockfile(project_dir, cmd)
# We remove the command from the list here, and break, so that
# we iterate over the loop again.
commands.remove(i)
break
else:
# If we didn't break the for loop, break the while loop.
break

View File

@ -59,7 +59,8 @@ factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
@ -79,7 +80,8 @@ factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
@ -93,6 +95,49 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
nO = null
[components.entity_linker.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
[components.entity_linker.model.tok2vec.pooling]
@layers = "reduce_mean.v1"
{% endif -%}
{% if "textcat" in components %}
[components.textcat]
factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
nO = null
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
{%- endif %}
{%- endif %}
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
@ -140,7 +185,8 @@ factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
@ -157,7 +203,8 @@ factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
@ -167,10 +214,50 @@ nO = null
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "entity_linker" in components -%}
[components.entity_linker]
factory = "entity_linker"
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
incl_context = true
incl_prior = true
[components.entity_linker.model]
@architectures = "spacy.EntityLinker.v1"
nO = null
[components.entity_linker.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
{% if "textcat" in components %}
[components.textcat]
factory = "textcat"
{% if optimize == "accuracy" %}
[components.textcat.model]
@architectures = "spacy.TextCatEnsemble.v1"
exclusive_classes = false
width = 64
conv_depth = 2
embed_size = 2000
window_size = 1
ngram_size = 1
nO = null
{% else -%}
[components.textcat.model]
@architectures = "spacy.TextCatBOW.v1"
exclusive_classes = false
ngram_size = 1
no_output_layer = false
{%- endif %}
{%- endif %}
{% endif %}
{% for pipe in components %}
{% if pipe not in ["tagger", "parser", "ner"] %}
{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"
@ -197,7 +284,7 @@ vectors = "{{ word_vectors }}"
{% endif -%}
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
{% endif %}
{% endif -%}
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
@ -230,18 +317,3 @@ start = 100
stop = 1000
compound = 1.001
{% endif %}
[training.score_weights]
{%- if "tagger" in components %}
tag_acc = {{ (1.0 / components|length)|round(2) }}
{%- endif -%}
{%- if "parser" in components %}
dep_uas = 0.0
dep_las = {{ (1.0 / components|length)|round(2) }}
sents_f = 0.0
{%- endif %}
{%- if "ner" in components %}
ents_f = {{ (1.0 / components|length)|round(2) }}
ents_p = 0.0
ents_r = 0.0
{%- endif -%}

View File

@ -152,7 +152,8 @@ def train(
exclude=frozen_components,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
print_row, finalize_logger = train_logger(nlp)
with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
@ -163,7 +164,8 @@ def train(
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
update_meta(T_cfg, nlp, info)
with nlp.select_pipes(disable=frozen_components):
update_meta(T_cfg, nlp, info)
with nlp.use_params(optimizer.averages):
nlp.to_disk(output_path / "model-best")
progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)
@ -207,10 +209,17 @@ def create_train_batches(iterator, batcher, max_epochs: int):
def create_evaluation_callback(
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
weights = {key: value for key, value in weights.items() if value is not None}
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = list(dev_corpus(nlp))
scores = nlp.evaluate(dev_examples)
# Calculate a weighted sum based on score_weights for the main score
# Calculate a weighted sum based on score_weights for the main score.
# We can only consider scores that are ints/floats, not dicts like
# entity scores per type etc.
for key, value in scores.items():
if key in weights and not isinstance(value, (int, float)):
raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
try:
weighted_score = sum(
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
@ -366,7 +375,8 @@ def update_meta(
) -> None:
nlp.meta["performance"] = {}
for metric in training["score_weights"]:
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
if metric is not None:
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

View File

@ -22,6 +22,11 @@ try:
except ImportError:
cupy = None
try: # Python 3.8+
from typing import Literal
except ImportError:
from typing_extensions import Literal # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle

View File

@ -69,7 +69,7 @@ class Warnings:
"in problems with the vocab further on in the pipeline.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use "
"`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
"`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
" to check the alignment. Misaligned entities ('-') will be "
"ignored during training.")
W033 = ("Training a new {model} using a model with no lexeme normalization "
@ -480,6 +480,13 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
"float or int but got: {score_type}. To exclude the score from the "
"final score, set its weight to null in the [training.score_weights] "
"section of your training config.")
E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
E917 = ("Received invalid value {value} for 'state_type' in "
"TransitionBasedParser: only 'parser' or 'ner' are valid options.")
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
"values are an instance of spacy.vocab.Vocab or True to create one"
" (default).")

View File

@ -140,7 +140,6 @@ cdef class KnowledgeBase:
self._entries.push_back(entry)
self._aliases_table.push_back(alias)
cpdef from_disk(self, loc)
cpdef set_entities(self, entity_list, freq_list, vector_list)

View File

@ -9,7 +9,8 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
from os import path
from spacy import util
from .typedefs cimport hash_t
from .errors import Errors, Warnings
@ -319,8 +320,14 @@ cdef class KnowledgeBase:
return 0.0
def to_disk(self, loc):
cdef Writer writer = Writer(loc)
def to_disk(self, path):
path = util.ensure_path(path)
if path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
if not path.parent.exists():
path.parent.mkdir(parents=True)
cdef Writer writer = Writer(path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order
@ -359,7 +366,13 @@ cdef class KnowledgeBase:
writer.close()
cpdef from_disk(self, loc):
def from_disk(self, path):
path = util.ensure_path(path)
if path.is_dir():
raise ValueError(Errors.E928.format(loc=path))
if not path.exists():
raise ValueError(Errors.E929.format(loc=path))
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
@ -369,7 +382,7 @@ cdef class KnowledgeBase:
cdef AliasC alias
cdef float vector_element
cdef Reader reader = Reader(loc)
cdef Reader reader = Reader(path)
# STEP 0: load header and initialize KB
cdef int64_t nr_entities
@ -450,16 +463,13 @@ cdef class KnowledgeBase:
cdef class Writer:
def __init__(self, object loc):
if isinstance(loc, Path):
loc = bytes(loc)
if path.exists(loc):
if path.isdir(loc):
raise ValueError(Errors.E928.format(loc=loc))
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
def __init__(self, path):
assert isinstance(path, Path)
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
raise IOError(Errors.E146.format(path=loc))
raise IOError(Errors.E146.format(path=path))
fseek(self._fp, 0, 0)
def close(self):
@ -496,14 +506,9 @@ cdef class Writer:
cdef class Reader:
def __init__(self, object loc):
if isinstance(loc, Path):
loc = bytes(loc)
if not path.exists(loc):
raise ValueError(Errors.E929.format(loc=loc))
if path.isdir(loc):
raise ValueError(Errors.E928.format(loc=loc))
cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
def __init__(self, path):
content = bytes(path)
cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)

View File

@ -25,7 +25,6 @@ class Bengali(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -30,7 +30,6 @@ class Greek(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -29,7 +29,6 @@ class English(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -28,7 +28,6 @@ class Persian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -33,7 +33,6 @@ class French(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -28,7 +28,6 @@ class Norwegian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -30,7 +30,6 @@ class Dutch(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -35,7 +35,6 @@ class Polish(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -25,7 +25,6 @@ class Russian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -31,7 +31,6 @@ class Swedish(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -25,7 +25,6 @@ class Ukrainian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -248,9 +248,12 @@ class Language:
self._config["nlp"]["pipeline"] = list(self.component_names)
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
if not self._config["training"].get("score_weights"):
combined_score_weights = combine_score_weights(score_weights)
self._config["training"]["score_weights"] = combined_score_weights
# We're merging the existing score weights back into the combined
# weights to make sure we're preserving custom settings in the config
# but also reflect updates (e.g. new components added)
prev_weights = self._config["training"].get("score_weights", {})
combined_score_weights = combine_score_weights(score_weights, prev_weights)
self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
return self._config
@ -412,7 +415,6 @@ class Language:
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
scores: Iterable[str] = SimpleFrozenList(),
default_score_weights: Dict[str, float] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
@ -430,12 +432,11 @@ class Language:
e.g. "token.ent_id". Used for pipeline analyis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
scores (Iterable[str]): All scores set by the component if it's trainable,
e.g. ["ents_f", "ents_r", "ents_p"].
default_score_weights (Dict[str, float]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
will be combined and normalized for the whole pipeline.
will be combined and normalized for the whole pipeline. If None,
the score won't be shown in the logs or be weighted.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://nightly.spacy.io/api/language#factory
@ -475,7 +476,7 @@ class Language:
default_config=default_config,
assigns=validate_attrs(assigns),
requires=validate_attrs(requires),
scores=scores,
scores=list(default_score_weights.keys()),
default_score_weights=default_score_weights,
retokenizes=retokenizes,
)
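A rough sketch of the updated factory semantics, where the reported score names now come straight from default_score_weights and a None weight keeps a score out of the logs and the final weighted score (component and score names are hypothetical):

    from spacy.language import Language

    @Language.factory(
        "my_component",
        default_score_weights={"my_score": 1.0, "my_score_per_type": None},
    )
    def make_my_component(nlp, name):
        def my_component(doc):
            return doc
        return my_component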

View File

@ -2,6 +2,8 @@ from typing import Optional, List
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from thinc.types import Floats2d
from ...errors import Errors
from ...compat import Literal
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
@ -11,7 +13,8 @@ from ...tokens import Doc
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]],
nr_feature_tokens: int,
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
use_upper: bool = True,
@ -40,20 +43,12 @@ def build_tb_parser_model(
tok2vec (Model[List[Doc], List[Floats2d]]):
Subnetwork to map tokens into vector representations.
nr_feature_tokens (int): The number of tokens in the context to use to
construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
feature sets are designed for the NER. The recommended feature sets are
3 for NER, and 8 for the dependency parser.
TODO: This feature should be split into two, state_type: ["deps", "ner"]
and extra_state_features: [True, False]. This would map into:
(deps, False): 8
(deps, True): 13
(ner, False): 3
(ner, True): 6
state_type (str):
String value denoting the type of parser model: "parser" or "ner"
extra_state_tokens (bool): Whether or not to use additional tokens in the context
to construct the state vector. Defaults to `False`, which means 3 and 8
for the NER and parser respectively. When set to `True`, this would become 6
feature sets (for the NER) or 13 (for the parser).
hidden_width (int): The width of the hidden layer.
maxout_pieces (int): How many pieces to use in the state prediction layer.
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
@ -68,8 +63,14 @@ def build_tb_parser_model(
Usually inferred from data at the beginning of training, or loaded from
disk.
"""
if state_type == "parser":
nr_feature_tokens = 13 if extra_state_tokens else 8
elif state_type == "ner":
nr_feature_tokens = 6 if extra_state_tokens else 3
else:
raise ValueError(Errors.E917.format(value=state_type))
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO,

View File

@ -15,7 +15,8 @@ from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
@ -42,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
},
scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"],
default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
default_score_weights={
"dep_uas": 0.5,
"dep_las": 0.5,
"dep_las_per_type": None,
"sents_p": None,
"sents_r": None,
"sents_f": 0.0,
},
)
def make_parser(
nlp: Language,

View File

@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_entity_ruler(
nlp: Language,

View File

@ -21,7 +21,6 @@ from .. import util
"lookups": None,
"overwrite": False,
},
scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL},
scores=["pos_acc", "morph_acc", "morph_per_feat"],
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5},
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,

View File

@ -13,7 +13,8 @@ from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
@ -38,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
def make_ner(

View File

@ -15,7 +15,6 @@ from .. import util
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None},
scores=["sents_p", "sents_r", "sents_f"],
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(

View File

@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL},
scores=["sents_p", "sents_r", "sents_f"],
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model):

View File

@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL},
scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):

View File

@ -62,18 +62,17 @@ subword_features = true
"positive_label": None,
"model": DEFAULT_TEXTCAT_MODEL,
},
scores=[
"cats_score",
"cats_score_desc",
"cats_p",
"cats_r",
"cats_f",
"cats_macro_f",
"cats_macro_auc",
"cats_f_per_type",
"cats_macro_auc_per_type",
],
default_score_weights={"cats_score": 1.0},
default_score_weights={
"cats_score": 1.0,
"cats_score_desc": None,
"cats_p": None,
"cats_r": None,
"cats_f": None,
"cats_macro_f": None,
"cats_macro_auc": None,
"cats_f_per_type": None,
"cats_macro_auc_per_type": None,
},
)
def make_textcat(
nlp: Language,

View File

@ -127,7 +127,7 @@ class Tok2Vec(Pipe):
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners:
listener.receive(batch_id, tokvecs, None)
listener.receive(batch_id, tokvecs, lambda dX: [])
return tokvecs
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:

View File

@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel):
seed: Optional[StrictInt] = Field(..., title="Random seed")
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
optimizer: Optimizer = Field(..., title="The optimizer to use")

View File

@ -240,7 +240,7 @@ class Scorer:
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
per_feat[field].score_set(
pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
result = {k: v.to_dict() for k, v in per_feat.items()}
return {f"{attr}_per_feat": result}
@ -418,9 +418,9 @@ class Scorer:
f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
micro_prf.tp = label_prf.tp
micro_prf.fn = label_prf.fn
micro_prf.fp = label_prf.fp
micro_prf.tp += label_prf.tp
micro_prf.fn += label_prf.fn
micro_prf.fp += label_prf.fp
n_cats = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats

View File

@ -144,6 +144,29 @@ def test_kb_empty(nlp):
entity_linker.begin_training(lambda: [])
def test_kb_serialize(nlp):
"""Test serialization of the KB"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
with make_tempdir() as d:
# normal read-write behaviour
mykb.to_disk(d / "kb")
mykb.from_disk(d / "kb")
mykb.to_disk(d / "kb.file")
mykb.from_disk(d / "kb.file")
mykb.to_disk(d / "new" / "kb")
mykb.from_disk(d / "new" / "kb")
# allow overwriting an existing file
mykb.to_disk(d / "kb.file")
with pytest.raises(ValueError):
# can not write to a directory
mykb.to_disk(d)
with pytest.raises(ValueError):
# can not read from a directory
mykb.from_disk(d)
with pytest.raises(ValueError):
# can not read from an unknown file
mykb.from_disk(d / "unknown" / "kb")
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

View File

@ -359,12 +359,8 @@ def test_language_factories_scores():
func = lambda nlp, name: lambda doc: doc
weights1 = {"a1": 0.5, "a2": 0.5}
weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
Language.factory(
f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
)
Language.factory(
f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
)
Language.factory(f"{name}1", default_score_weights=weights1, func=func)
Language.factory(f"{name}2", default_score_weights=weights2, func=func)
meta1 = Language.get_factory_meta(f"{name}1")
assert meta1.default_score_weights == weights1
meta2 = Language.get_factory_meta(f"{name}2")
@ -376,6 +372,21 @@ def test_language_factories_scores():
cfg = nlp.config["training"]
expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
assert cfg["score_weights"] == expected_weights
# Test with custom defaults
config = nlp.config.copy()
config["training"]["score_weights"]["a1"] = 0.0
config["training"]["score_weights"]["b3"] = 1.0
nlp = English.from_config(config)
score_weights = nlp.config["training"]["score_weights"]
expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
assert score_weights == expected
# Test with null values
config = nlp.config.copy()
config["training"]["score_weights"]["a1"] = None
nlp = English.from_config(config)
score_weights = nlp.config["training"]["score_weights"]
expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
assert score_weights == expected
def test_pipe_factories_from_source():

View File

@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from ..util import make_tempdir
from ...cli.train import verify_textcat_config
@ -224,3 +225,31 @@ def test_positive_class_not_binary():
assert textcat.labels == ("SOME", "THING", "POS")
with pytest.raises(ValueError):
verify_textcat_config(nlp, pipe_config)
def test_textcat_evaluation():
train_examples = []
nlp = English()
ref1 = nlp("one")
ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0}
pred1 = nlp("one")
pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
train_examples.append(Example(pred1, ref1))
ref2 = nlp("two")
ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
pred2 = nlp("two")
pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
train_examples.append(Example(pred2, ref2))
scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
assert scores["cats_f_per_type"]["summer"]["p"] == 0
assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
assert scores["cats_micro_p"] == 4/5
assert scores["cats_micro_r"] == 4/6

View File

@ -169,3 +169,22 @@ def test_tok2vec_listener():
nlp.select_pipes(disable="tok2vec")
assert nlp.pipe_names == ["tagger"]
nlp("Running the pipeline with the Tok2Vec component disabled.")
def test_tok2vec_listener_callback():
orig_config = Config().from_str(cfg_string)
nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
assert nlp.pipe_names == ["tok2vec", "tagger"]
tagger = nlp.get_pipe("tagger")
tok2vec = nlp.get_pipe("tok2vec")
nlp._link_components()
docs = [nlp.make_doc("A random sentence")]
tok2vec.model.initialize(X=docs)
gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
tagger.model.initialize(X=docs, Y=label_sample)
docs = [nlp.make_doc("Another entirely random sentence")]
tok2vec.update([Example.from_dict(x, {}) for x in docs])
Y, get_dX = tagger.model.begin_update(docs)
# make sure the backprop call works (and doesn't hit a 'None' callback)
assert get_dX(Y) is not None

View File

@ -3,7 +3,7 @@ from spacy.pipeline import Pipe
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example, Corpus
from spacy.training.converters import json2docs
from spacy.training.converters import json_to_docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
@ -425,7 +425,7 @@ def test_issue4402():
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
docs = json2docs([json_data])
docs = json_to_docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)

View File

@ -1,7 +1,7 @@
import pytest
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example
from spacy.training.converters.conllu2docs import conllu2docs
from spacy.training.converters.conllu_to_docs import conllu_to_docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr():
def test_issue4665():
"""
conllu2json should not raise an exception if the HEAD column contains an
conllu_to_docs should not raise an exception if the HEAD column contains an
underscore
"""
input_data = """
@ -105,7 +105,7 @@ def test_issue4665():
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
conllu2docs(input_data)
conllu_to_docs(input_data)
def test_issue4674():

View File

@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
parser_config_string = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 99
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
@ -95,7 +96,11 @@ def my_parser():
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
)
parser = build_tb_parser_model(
tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
tok2vec=tok2vec,
state_type="parser",
extra_state_tokens=True,
hidden_width=65,
maxout_pieces=5,
)
return parser
@ -340,3 +345,13 @@ def test_config_auto_fill_extra_fields():
assert "extra" not in nlp.config["training"]
# Make sure the config generated is valid
load_model_from_config(nlp.config)
def test_config_validate_literal():
nlp = English()
config = Config().from_str(parser_config_string)
config["model"]["state_type"] = "nonsense"
with pytest.raises(ConfigValidationError):
nlp.add_pipe("parser", config=config)
config["model"]["state_type"] = "ner"
nlp.add_pipe("parser", config=config)

View File

@ -1,20 +1,21 @@
import pytest
from click import NoSuchOption
from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from thinc.config import ConfigValidationError
from spacy.cli.debug_config import check_section_refs
from thinc.config import ConfigValidationError, Config
import srsly
import os
from .util import make_tempdir
def test_cli_converters_conllu2json():
def test_cli_converters_conllu_to_docs():
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
@ -23,7 +24,7 @@ def test_cli_converters_conllu2json():
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
]
input_data = "\n".join(lines)
converted_docs = conllu2docs(input_data, n_sents=1)
converted_docs = conllu_to_docs(input_data, n_sents=1)
assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0
@ -39,7 +40,7 @@ def test_cli_converters_conllu2json():
ent_offsets = [
(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
@ -62,9 +63,9 @@ def test_cli_converters_conllu2json():
),
],
)
def test_cli_converters_conllu2json_name_ner_map(lines):
def test_cli_converters_conllu_to_docs_name_ner_map(lines):
input_data = "\n".join(lines)
converted_docs = conllu2docs(
converted_docs = conllu_to_docs(
input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
)
assert len(converted_docs) == 1
@ -83,11 +84,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
ent_offsets = [
(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu2json_subtokens():
def test_cli_converters_conllu_to_docs_subtokens():
# https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
lines = [
"1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
@ -98,7 +99,7 @@ def test_cli_converters_conllu2json_subtokens():
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
]
input_data = "\n".join(lines)
converted_docs = conllu2docs(
converted_docs = conllu_to_docs(
input_data, n_sents=1, merge_subtokens=True, append_morphology=True
)
assert len(converted_docs) == 1
@ -132,11 +133,11 @@ def test_cli_converters_conllu2json_subtokens():
ent_offsets = [
(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "U-PER", "O", "O"]
def test_cli_converters_iob2json():
def test_cli_converters_iob_to_docs():
lines = [
"I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
"I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@ -144,7 +145,7 @@ def test_cli_converters_iob2json():
"I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
]
input_data = "\n".join(lines)
converted_docs = iob2docs(input_data, n_sents=10)
converted_docs = iob_to_docs(input_data, n_sents=10)
assert len(converted_docs) == 1
converted = docs_to_json(converted_docs)
assert converted["id"] == 0
@ -161,7 +162,7 @@ def test_cli_converters_iob2json():
assert ent.text in ["New York City", "London"]
def test_cli_converters_conll_ner2json():
def test_cli_converters_conll_ner_to_docs():
lines = [
"-DOCSTART- -X- O O",
"",
@ -211,7 +212,7 @@ def test_cli_converters_conll_ner2json():
".\t.\t_\tO",
]
input_data = "\n".join(lines)
converted_docs = conll_ner2docs(input_data, n_sents=10)
converted_docs = conll_ner_to_docs(input_data, n_sents=10)
assert len(converted_docs) == 1
converted = docs_to_json(converted_docs)
assert converted["id"] == 0
@ -413,3 +414,15 @@ def test_string_to_list(value):
def test_string_to_list_intify(value):
assert string_to_list(value, intify=False) == ["1", "2", "3"]
assert string_to_list(value, intify=True) == [1, 2, 3]
def test_check_section_refs():
config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
config = Config(config)
# Valid section reference
check_section_refs(config, ["a.b.c"])
# Section that doesn't exist in this config
check_section_refs(config, ["x.y.z"])
# Invalid section reference
with pytest.raises(ConfigValidationError):
check_section_refs(config, ["a.b.c", "f.g"])

View File

@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.training import Example
from spacy.training.iob_utils import biluo_tags_from_offsets
from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from spacy.lang.en import English
@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab):
words=input_.split(" "),
ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
)
entities = biluo_tags_from_offsets(doc, annot["entities"])
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False
@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab):
words=input_.split(" "),
ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
)
entities = biluo_tags_from_offsets(doc, annot["entities"])
entities = offsets_to_biluo_tags(doc, annot["entities"])
example = Example.from_dict(doc, {"entities": entities})
# a hack for sentence boundaries
example.predicted[1].is_sent_start = False

View File

@ -1,9 +1,9 @@
import numpy
from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.training import spans_from_biluo_tags, iob_to_biluo
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
from spacy.training import biluo_tags_to_spans, iob_to_biluo
from spacy.training import Corpus, docs_to_json
from spacy.training.example import Example
from spacy.training.converters import json2docs
from spacy.training.converters import json_to_docs
from spacy.training.augment import make_orth_variants_example
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab):
spaces = [True, True, True, False, True]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to London"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
tags = offsets_to_biluo_tags(doc, entities)
assert tags == ["O", "O", "O", "U-LOC", "O"]
@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab):
spaces = [True, True, True, True, False, True]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
tags = offsets_to_biluo_tags(doc, entities)
assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab):
spaces = [True, True, True, True, True, False, True]
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
tags = biluo_tags_from_offsets(doc, entities)
tags = offsets_to_biluo_tags(doc, entities)
assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab):
(len("I flew to "), len("I flew to San Francisco"), "LOC"),
]
with pytest.raises(ValueError):
biluo_tags_from_offsets(doc, entities)
offsets_to_biluo_tags(doc, entities)
def test_gold_biluo_misalign(en_vocab):
@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab):
doc = Doc(en_vocab, words=words, spaces=spaces)
entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
with pytest.warns(UserWarning):
tags = biluo_tags_from_offsets(doc, entities)
tags = offsets_to_biluo_tags(doc, entities)
assert tags == ["O", "O", "O", "-", "-", "-"]
@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab):
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_json2docs_no_ner(en_vocab):
def test_json_to_docs_no_ner(en_vocab):
data = [
{
"id": 1,
@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab):
],
}
]
docs = json2docs(data)
docs = json_to_docs(data)
assert len(docs) == 1
for doc in docs:
assert not doc.has_annotation("ENT_IOB")
@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
doc = en_tokenizer(text)
biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
assert biluo_tags_converted == biluo_tags
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
offsets_converted = [ent for ent in offsets if ent[2]]
assert offsets_converted == offsets
@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
def test_biluo_spans(en_tokenizer):
doc = en_tokenizer("I flew to Silicon Valley via London.")
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
spans = spans_from_biluo_tags(doc, biluo_tags)
spans = biluo_tags_to_spans(doc, biluo_tags)
spans = [span for span in spans if span.label_]
assert len(spans) == 2
assert spans[0].text == "Silicon Valley"

View File

@ -2,8 +2,8 @@ from .corpus import Corpus # noqa: F401
from .example import Example, validate_examples # noqa: F401
from .align import Alignment # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags # noqa: F401
from .iob_utils import spans_from_biluo_tags, tags_to_entities # noqa: F401
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
from .gold_io import docs_to_json, read_json_file # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words # noqa: F401
from .loggers import console_logger, wandb_logger # noqa: F401

View File

@ -1,4 +1,4 @@
from .iob2docs import iob2docs # noqa: F401
from .conll_ner2docs import conll_ner2docs # noqa: F401
from .json2docs import json2docs # noqa: F401
from .conllu2docs import conllu2docs # noqa: F401
from .iob_to_docs import iob_to_docs # noqa: F401
from .conll_ner_to_docs import conll_ner_to_docs # noqa: F401
from .json_to_docs import json_to_docs # noqa: F401
from .conllu_to_docs import conllu_to_docs # noqa: F401

View File

@ -7,7 +7,7 @@ from ...tokens import Doc, Span
from ...util import load_model
def conll_ner2docs(
def conll_ner_to_docs(
input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
):
"""

View File

@ -1,13 +1,13 @@
import re
from .conll_ner2docs import n_sents_info
from ...training import iob_to_biluo, spans_from_biluo_tags
from .conll_ner_to_docs import n_sents_info
from ...training import iob_to_biluo, biluo_tags_to_spans
from ...tokens import Doc, Token, Span
from ...vocab import Vocab
from wasabi import Printer
def conllu2docs(
def conllu_to_docs(
input_data,
n_sents=10,
append_morphology=False,
@ -78,7 +78,7 @@ def read_conllx(
if lines:
while lines[0].startswith("#"):
lines.pop(0)
doc = doc_from_conllu_sentence(
doc = conllu_sentence_to_doc(
vocab,
lines,
ner_tag_pattern,
@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
return iob_to_biluo(iob)
def doc_from_conllu_sentence(
def conllu_sentence_to_doc(
vocab,
lines,
ner_tag_pattern,
@ -215,7 +215,7 @@ def doc_from_conllu_sentence(
doc[i]._.merged_lemma = lemmas[i]
doc[i]._.merged_spaceafter = spaces[i]
ents = get_entities(lines, ner_tag_pattern, ner_map)
doc.ents = spans_from_biluo_tags(doc, ents)
doc.ents = biluo_tags_to_spans(doc, ents)
if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc)

View File

@ -1,13 +1,13 @@
from wasabi import Printer
from .conll_ner2docs import n_sents_info
from .conll_ner_to_docs import n_sents_info
from ...vocab import Vocab
from ...training import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span
from ...util import minibatch
def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
"""
Convert IOB files with one sentence per line and tags separated with '|'
into Doc objects so they can be saved. IOB and IOB2 are accepted.

View File

@ -1,12 +1,12 @@
import srsly
from ..gold_io import json_iterate, json_to_annotations
from ..example import annotations2doc
from ..example import annotations_to_doc
from ..example import _fix_legacy_dict_data, _parse_example_dict_data
from ...util import load_model
from ...lang.xx import MultiLanguage
def json2docs(input_data, model=None, **kwargs):
def json_to_docs(input_data, model=None, **kwargs):
nlp = load_model(model) if model is not None else MultiLanguage()
if not isinstance(input_data, bytes):
if not isinstance(input_data, str):
@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
for json_para in json_to_annotations(json_doc):
example_dict = _fix_legacy_dict_data(json_para)
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict)
docs.append(doc)
return docs

View File

@ -7,13 +7,13 @@ from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
from .align import Alignment
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
from .iob_utils import spans_from_biluo_tags
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
""" Create a Doc from dictionaries with token and doc annotations. """
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
@ -92,7 +92,7 @@ cdef class Example:
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
return Example(
predicted,
annotations2doc(predicted.vocab, tok_dict, doc_dict)
annotations_to_doc(predicted.vocab, tok_dict, doc_dict)
)
@property
@ -176,7 +176,7 @@ cdef class Example:
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
x_ents = self.get_aligned_spans_y2x(self.y.ents)
# Default to 'None' for missing values
x_tags = biluo_tags_from_offsets(
x_tags = offsets_to_biluo_tags(
self.x,
[(e.start_char, e.end_char, e.label_) for e in x_ents],
missing=None
@ -195,7 +195,7 @@ cdef class Example:
return {
"doc_annotation": {
"cats": dict(self.reference.cats),
"entities": biluo_tags_from_doc(self.reference),
"entities": doc_to_biluo_tags(self.reference),
"links": self._links_to_dict()
},
"token_annotation": {
@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data):
elif isinstance(ner_data[0], tuple):
return _add_entities_to_doc(
doc,
biluo_tags_from_offsets(doc, ner_data)
offsets_to_biluo_tags(doc, ner_data)
)
elif isinstance(ner_data[0], str) or ner_data[0] is None:
return _add_entities_to_doc(
doc,
spans_from_biluo_tags(doc, ner_data)
biluo_tags_to_spans(doc, ner_data)
)
elif isinstance(ner_data[0], Span):
# Ugh, this is super messy. Really hard to set O entities
@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
# This is annoying but to convert the offsets we need a Doc
# that has the target tokenization.
reference = Doc(vocab, words=words, spaces=spaces)
biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
biluo = offsets_to_biluo_tags(reference, biluo_or_offsets)
else:
biluo = biluo_or_offsets
ent_iobs = []

View File

@ -3,7 +3,7 @@ import srsly
from .. import util
from ..errors import Warnings
from ..tokens import Doc
from .iob_utils import biluo_tags_from_offsets, tags_to_entities
from .iob_utils import offsets_to_biluo_tags, tags_to_entities
import json
@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
if ent.kb_id_:
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
json_para["links"].append(link_dict)
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
for j, sent in enumerate(doc.sents):

View File

@ -1,9 +1,11 @@
from typing import List, Tuple, Iterable, Union, Iterator
import warnings
from ..errors import Errors, Warnings
from ..tokens import Span
from ..tokens import Span, Doc
def iob_to_biluo(tags):
def iob_to_biluo(tags: Iterable[str]) -> List[str]:
out = []
tags = list(tags)
while tags:
@ -12,7 +14,7 @@ def iob_to_biluo(tags):
return out
def biluo_to_iob(tags):
def biluo_to_iob(tags: Iterable[str]) -> List[str]:
out = []
for tag in tags:
if tag is None:
@ -23,12 +25,12 @@ def biluo_to_iob(tags):
return out
def _consume_os(tags):
def _consume_os(tags: List[str]) -> Iterator[str]:
while tags and tags[0] == "O":
yield tags.pop(0)
def _consume_ent(tags):
def _consume_ent(tags: List[str]) -> List[str]:
if not tags:
return []
tag = tags.pop(0)
@ -50,15 +52,17 @@ def _consume_ent(tags):
return [start] + middle + [end]
def biluo_tags_from_doc(doc, missing="O"):
return biluo_tags_from_offsets(
def doc_to_biluo_tags(doc: Doc, missing: str = "O") -> List[str]:
return offsets_to_biluo_tags(
doc,
[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
missing=missing,
)
def biluo_tags_from_offsets(doc, entities, missing="O"):
def offsets_to_biluo_tags(
doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
) -> List[str]:
"""Encode labelled spans into per-token tags, using the
Begin/In/Last/Unit/Out scheme (BILUO).
@ -69,7 +73,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
the original string.
RETURNS (list): A list of unicode strings, describing the tags. Each tag
string will be of the form either "", "O" or "{action}-{label}", where
action is one of "B", "I", "L", "U". The string "-" is used where the
action is one of "B", "I", "L", "U". The missing label is used where the
entity offsets don't align with the tokenization in the `Doc` object.
The training algorithm will view these as missing values. "O" denotes a
non-entity token. "B" denotes the beginning of a multi-token entity,
@ -80,12 +84,11 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
>>> text = 'I like London.'
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
>>> doc = nlp.tokenizer(text)
>>> tags = biluo_tags_from_offsets(doc, entities)
>>> tags = offsets_to_biluo_tags(doc, entities)
>>> assert tags == ["O", "O", 'U-LOC', "O"]
"""
# Ensure no overlapping entity labels exist
tokens_in_ents = {}
starts = {token.idx: token.i for token in doc}
ends = {token.idx + len(token): token.i for token in doc}
biluo = ["-" for _ in doc]
@ -109,7 +112,6 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
)
)
tokens_in_ents[token_index] = (start_char, end_char, label)
start_token = starts.get(start_char)
end_token = ends.get(end_char)
# Only interested if the tokenization is correct
@ -143,7 +145,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
return biluo
def spans_from_biluo_tags(doc, tags):
def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
"""Encode per-token tags following the BILUO scheme into Span object, e.g.
to overwrite the doc.ents.
@ -161,7 +163,9 @@ def spans_from_biluo_tags(doc, tags):
return spans
def offsets_from_biluo_tags(doc, tags):
def biluo_tags_to_offsets(
doc: Doc, tags: Iterable[str]
) -> List[Tuple[int, int, Union[str, int]]]:
"""Encode per-token tags following the BILUO scheme into entity offsets.
doc (Doc): The document that the BILUO tags refer to.
@ -172,12 +176,12 @@ def offsets_from_biluo_tags(doc, tags):
`end` will be character-offset integers denoting the slice into the
original string.
"""
spans = spans_from_biluo_tags(doc, tags)
spans = biluo_tags_to_spans(doc, tags)
return [(span.start_char, span.end_char, span.label_) for span in spans]
def tags_to_entities(tags):
""" Note that the end index returned by this function is inclusive.
def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
"""Note that the end index returned by this function is inclusive.
To use it for Span creation, increment the end by 1."""
entities = []
start = None
@ -209,3 +213,9 @@ def tags_to_entities(tags):
else:
raise ValueError(Errors.E068.format(tag=tag))
return entities
# Fallbacks to make backwards-compat easier
offsets_from_biluo_tags = biluo_tags_to_offsets
spans_from_biluo_tags = biluo_tags_to_spans
biluo_tags_from_offsets = offsets_to_biluo_tags
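For reference, a brief usage sketch of the inclusive end index noted in the `tags_to_entities` docstring above (the words and labels are made up for illustration):

```python
from spacy.tokens import Doc, Span
from spacy.training import tags_to_entities
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=["Alice", "New", "York"])
# The end index is inclusive: a two-token entity over tokens 1-2 comes back as (label, 1, 2)
entities = tags_to_entities(["U-PERSON", "B-GPE", "L-GPE"])
assert entities == [("PERSON", 0, 0), ("GPE", 1, 2)]
# Increment the end by 1 when creating Span objects
spans = [Span(doc, start, end + 1, label=label) for label, start, end in entities]
assert [span.text for span in spans] == ["Alice", "New York"]
```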

View File

@ -11,9 +11,12 @@ def console_logger():
def setup_printer(
nlp: "Language",
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
score_cols = list(nlp.config["training"]["score_weights"])
# We assume here that only components that should be trained & logged are enabled
logged_pipes = nlp.pipe_names
score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None]
score_widths = [max(len(col), 6) for col in score_cols]
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
loss_widths = [max(len(col), 8) for col in loss_cols]
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
table_header = [col.upper() for col in table_header]
@ -26,7 +29,7 @@ def console_logger():
try:
losses = [
"{0:.2f}".format(float(info["losses"][pipe_name]))
for pipe_name in nlp.pipe_names
for pipe_name in logged_pipes
]
except KeyError as e:
raise KeyError(
@ -38,10 +41,15 @@ def console_logger():
) from None
scores = []
for col in score_cols:
score = float(info["other_scores"].get(col, 0.0))
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
score = info["other_scores"].get(col, 0.0)
try:
score = float(score)
if col != "speed":
score *= 100
scores.append("{0:.2f}".format(score))
except TypeError:
err = Errors.E916.format(name=col, score_type=type(score))
raise ValueError(err) from None
data = (
[info["epoch"], info["step"]]
+ losses

View File

@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
# Default order of sections in the config.cfg. Not all sections need to exist,
# and additional sections are added at the end, in alphabetical order.
# fmt: off
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
# fmt: on
@ -1202,21 +1202,38 @@ def get_arg_names(func: Callable) -> List[str]:
return list(set([*argspec.args, *argspec.kwonlyargs]))
def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
def combine_score_weights(
weights: List[Dict[str, float]],
overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
) -> Dict[str, float]:
"""Combine and normalize score weights defined by components, e.g.
{"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
weights (List[dict]): The weights defined by the components.
overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
should be preserved.
RETURNS (Dict[str, float]): The combined and normalized weights.
"""
# We first need to extract all None/null values for score weights that
# shouldn't be shown in the table *or* be weighted
result = {}
all_weights = []
for w_dict in weights:
filtered_weights = {}
for key, value in w_dict.items():
value = overrides.get(key, value)
if value is None:
result[key] = None
else:
filtered_weights[key] = value
all_weights.append(filtered_weights)
for w_dict in all_weights:
# We need to account for weights that don't sum to 1.0 and normalize
# the score weights accordingly, then divide score by the number of
# components.
total = sum(w_dict.values())
for key, value in w_dict.items():
weight = round(value / total / len(weights), 2)
weight = round(value / total / len(all_weights), 2)
result[key] = result.get(key, 0.0) + weight
return result
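A minimal usage sketch of the updated signature (the score names and weight values below are made up for illustration):

```python
from spacy.util import combine_score_weights

weights = [{"dep_las": 0.5, "dep_uas": 0.5}, {"ents_f": 1.0}]
# A weight overridden with None is excluded from weighting and kept as None in the result
combined = combine_score_weights(weights, overrides={"dep_uas": None})
# Remaining weights are normalized per component and divided by the number of components
assert combined == {"dep_uas": None, "dep_las": 0.25, "ents_f": 0.5}
```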

View File

@ -414,7 +414,8 @@ one component.
> ```ini
> [model]
> @architectures = "spacy.TransitionBasedParser.v1"
> nr_feature_tokens = 6
> state_type = "ner"
> extra_state_tokens = false
> hidden_width = 64
> maxout_pieces = 2
>
@ -446,15 +447,16 @@ consists of either two or three subnetworks:
state representation. If not present, the output from the lower model is used
as action scores directly.
| Name | Description |
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
| `hidden_width` | The width of the hidden layer. ~~int~~ |
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
| Name | Description |
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy. Defaults to `False`. ~~bool~~ |
| `hidden_width` | The width of the hidden layer. ~~int~~ |
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}

View File

@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy
> entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
> the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
> representing a `PERSON` entity. The
> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
> can help you convert entity offsets to the right format.
```python

View File

@ -145,17 +145,16 @@ examples, see the
> )
> ```
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | The name of the component factory. ~~str~~ |
| _keyword-only_ | |
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
| Name | Description |
| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | The name of the component factory. ~~str~~ |
| _keyword-only_ | |
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
## Language.\_\_call\_\_ {#call tag="method"}
@ -1036,12 +1035,12 @@ provided by the [`@Language.component`](/api/language#component) or
component is defined and stored on the `Language` class for each component
instance and factory instance.
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `factory` | The name of the registered component factory. ~~str~~ |
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  |
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  |
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
| Name | Description |
| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory` | The name of the registered component factory. ~~str~~ |
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~  |
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~  |
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |

View File

@ -619,7 +619,7 @@ sequences in the batch.
## Training data and alignment {#gold source="spacy/training"}
### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
Encode labelled spans into per-token tags, using the
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
@ -632,14 +632,20 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or
more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a
single-token entity.
<Infobox title="Changed in v3.0" variant="warning" id="biluo_tags_from_offsets">
This method was previously available as `spacy.gold.biluo_tags_from_offsets`.
</Infobox>
> #### Example
>
> ```python
> from spacy.training import biluo_tags_from_offsets
> from spacy.training import offsets_to_biluo_tags
>
> doc = nlp("I like London.")
> entities = [(7, 13, "LOC")]
> tags = biluo_tags_from_offsets(doc, entities)
> tags = offsets_to_biluo_tags(doc, entities)
> assert tags == ["O", "O", "U-LOC", "O"]
> ```
@ -647,21 +653,28 @@ single-token entity.
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ |
| `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
| `missing` | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ |
| **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}
Encode per-token tags following the
[BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.
<Infobox title="Changed in v3.0" variant="warning" id="offsets_from_biluo_tags">
This method was previously available as `spacy.gold.offsets_from_biluo_tags`.
</Infobox>
> #### Example
>
> ```python
> from spacy.training import offsets_from_biluo_tags
> from spacy.training import biluo_tags_to_offsets
>
> doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> entities = offsets_from_biluo_tags(doc, tags)
> entities = biluo_tags_to_offsets(doc, tags)
> assert entities == [(7, 13, "LOC")]
> ```
@ -671,21 +684,27 @@ Encode per-token tags following the
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |
### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"}
Encode per-token tags following the
[BILUO scheme](/usage/linguistic-features#accessing-ner) into
[`Span`](/api/span) objects. This can be used to create entity spans from
token-based tags, e.g. to overwrite the `doc.ents`.
<Infobox title="Changed in v3.0" variant="warning" id="spans_from_biluo_tags">
This method was previously available as `spacy.gold.spans_from_biluo_tags`.
</Infobox>
> #### Example
>
> ```python
> from spacy.training import spans_from_biluo_tags
> from spacy.training import biluo_tags_to_spans
>
> doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
> doc.ents = spans_from_biluo_tags(doc, tags)
> doc.ents = biluo_tags_to_spans(doc, tags)
> ```
| Name | Description |

View File

@ -1,24 +1,19 @@
import { Help } from 'components/typography'; import Link from 'components/link'
<!-- TODO: update, add project template -->
<!-- TODO: update numbers -->
<figure>
| System | Parser | Tagger | NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
| ------------------------------------------------------------------------- | ----------------: | ----------------: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | _n/a_<sup>2</sup> | _n/a_<sup>2</sup> | 88.8 | 234 | 2k |
| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link> | - | 97.9 | 89.3 | | |
| Pipeline | Parser | Tagger | NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
<figcaption class="caption">
**Accuracy and speed on the
[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**<br />**1. **
[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_:
Qi et al. don't report parsing and tagging results on OntoNotes. We're working
on training Stanza on this corpus to allow direct comparison.
**Full pipeline accuracy and speed** on the
[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.
</figcaption>
@ -26,18 +21,24 @@ on training Stanza on this corpus to allow direct comparison.
<figure>
| System | POS | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
| spaCy RoBERTa (2020) | | | |
| spaCy CNN (2020) | | | |
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
| Named Entity Recognition System | OntoNotes | CoNLL '03 |
| ------------------------------------------------------------------------------ | --------: | --------: |
| spaCy RoBERTa (2020) | | 92.2 |
| spaCy CNN (2020) | 85.3 | 88.4 |
| spaCy CNN (2017) | 86.4 | |
| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | 88.8 | 92.1 |
| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> | 89.7 | 93.1 |
| BERT Base<sup>3</sup> | - | 92.4 |
<figcaption class="caption">
**Accuracy on the Penn Treebank.** See
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
results.
**Named entity recognition accuracy** on the
[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and
[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3.
** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).
</figcaption>

View File

@ -235,8 +235,6 @@ The `Transformer` component sets the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
which lets you access the transformers outputs at runtime.
<!-- TODO: update/confirm once we have final models trained -->
```cli
$ python -m spacy download en_core_trf_lg
```
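Once the pipeline is installed, the raw transformer output can be inspected at runtime. A minimal sketch (the exact contents of the `TransformerData` object depend on your `spacy-transformers` version):

```python
import spacy

nlp = spacy.load("en_core_trf_lg")
doc = nlp("Apple shares rose on the news.")
# doc._.trf_data holds the transformer output, including the output tensors
# and the alignment between wordpieces and spaCy tokens
print(len(doc._.trf_data.tensors))
print(doc._.trf_data.tensors[-1].shape)
```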
@ -448,7 +446,8 @@ factory = "ner"
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
state_type = "ner"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false

View File

@ -61,12 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
<Benchmarks />
<Project id="benchmarks/parsing_penn_treebank">
<figure>
The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
our project template.
| Dependency Parsing System | UAS | LAS |
| ------------------------------------------------------------------------------ | ---: | ---: |
| spaCy RoBERTa (2020)<sup>1</sup> | 96.8 | 95.0 |
| spaCy CNN (2020)<sup>1</sup> | 93.7 | 91.8 |
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
</Project>
<figcaption class="caption">
**Dependency parsing accuracy** on the Penn Treebank. See
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
results. **1. ** Project template:
[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
</figcaption>
</figure>
<!-- TODO: ## Citing spaCy {#citation}

View File

@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
component that only provides sentence boundaries. Along with being faster and
smaller than the parser, its primary advantage is that it's easier to train
because it only requires annotated sentence boundaries rather than full
dependency parses.
<!-- TODO: update/confirm usage once we have final models trained -->
dependency parses. spaCy's [trained pipelines](/models) include both a parser
and a trained sentence segmenter, which is
[disabled](/usage/processing-pipelines#disabling) by default. If you only need
sentence boundaries and no parser, you can use the `enable` and `disable`
arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
disable the parser.
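A minimal sketch of this setup, assuming a pipeline like `en_core_web_sm` that ships with a disabled `senter` (depending on your spaCy version you may need `nlp.enable_pipe` instead of an `enable` argument):

```python
import spacy

# Load the pipeline without running the parser and switch on the senter instead
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.enable_pipe("senter")
doc = nlp("This is a sentence. This is another sentence.")
print([sent.text for sent in doc.sents])
```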
> #### senter vs. parser
>

View File

@ -253,8 +253,6 @@ different mechanisms you can use:
Disabled and excluded component names can be provided to
[`spacy.load`](/api/top-level#spacy.load) as a list.
<!-- TODO: update with info on our models shipped with optional components -->
> #### 💡 Optional pipeline components
>
> The `disable` mechanism makes it easy to distribute pipeline packages with
@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to
> your pipeline may include a statistical _and_ a rule-based component for
> sentence segmentation, and you can choose which one to run depending on your
> use case.
>
> For example, spaCy's [trained pipelines](/models) like
> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and
> `senter` that perform sentence segmentation, but the `senter` is disabled by
> default.
```python
# Load the pipeline without the entity recognizer
@ -1501,7 +1504,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline
component function and pass it the token texts from the `Doc` object received by
the component.
The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very
The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) function is very
helpful here, because it takes a `Doc` object and token-based BILUO tags and
returns a sequence of `Span` objects in the `Doc` with added labels. So all your
wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
@ -1516,14 +1519,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
```python
### {highlight="1,8-9"}
import your_custom_entity_recognizer
from spacy.training import offsets_from_biluo_tags
from spacy.training import biluo_tags_to_spans
from spacy.language import Language
@Language.component("custom_ner_wrapper")
def custom_ner_wrapper(doc):
words = [token.text for token in doc]
custom_entities = your_custom_entity_recognizer(words)
doc.ents = spans_from_biluo_tags(doc, custom_entities)
doc.ents = biluo_tags_to_spans(doc, custom_entities)
return doc
```

View File

@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI
pipelines.
```yaml
https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
```
| Section | Description |
@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning">
The Prodigy integration will require a nightly version of Prodigy that supports
spaCy v3+.
spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by
exporting your data with
[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
[`spacy convert`](/api/cli#convert) to convert it to the binary format.
</Infobox>
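For illustration, the export-and-convert workflow could look roughly like this. This is only a sketch: the dataset and file names are placeholders, and the exact `data-to-spacy` arguments depend on your Prodigy version.

```cli
$ prodigy data-to-spacy ./train.json ./eval.json --lang en --ner my_ner_dataset
$ python -m spacy convert ./train.json ./corpus
$ python -m spacy convert ./eval.json ./corpus
```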

View File

@ -470,6 +470,7 @@ score.
```ini
[training.score_weights]
dep_las = 0.4
dep_uas = null
ents_f = 0.4
tag_acc = 0.2
token_acc = 0.0
@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by
combining and normalizing the default score weights of the pipeline components.
The default score weights are defined by each pipeline component via the
`default_score_weights` setting on the
[`@Language.component`](/api/language#component) or
[`@Language.factory`](/api/language#factory). By default, all pipeline
components are weighted equally.
[`@Language.factory`](/api/language#factory) decorator. By default, all pipeline
components are weighted equally. If a score weight is set to `null`, it will be
excluded from the logs and the score won't be weighted.
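For example, a component factory could declare its score weights like this (a hypothetical component; the key point is that a `None`/`null` weight is excluded from logging and weighting):

```python
from spacy.language import Language

@Language.factory(
    "my_component",
    default_score_weights={"my_score": 1.0, "my_other_score": None},
)
def create_my_component(nlp, name):
    def my_component(doc):
        return doc
    return my_component
```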
<Accordion title="Understanding the training output and score types" spaced>

View File

@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
- **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
[TransformerListener](/api/architectures#TransformerListener),
[Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf)
- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf),
[`de_dep_news_trf`](/models/de#de_dep_news_trf),
[`es_dep_news_trf`](/models/es#es_dep_news_trf),
[`fr_dep_news_trf`](/models/fr#fr_dep_news_trf)
- **Implementation:**
[`spacy-transformers`](https://github.com/explosion/spacy-transformers)
@ -548,17 +551,19 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
### Removed or renamed API {#incompat-removed}
| Removed | Replacement |
| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated |
| Removed | Replacement |
| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) |
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) |
| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated |
The following deprecated methods, attributes and arguments were removed in v3.0.
Most of them have been **deprecated for a while** and many would previously
@ -968,16 +973,17 @@ python -m spacy package ./output ./packages
#### Data utilities and gold module {#migrating-gold}
The `spacy.gold` module has been renamed to `spacy.training`. This mostly
affects internals, but if you've been using the span offset conversion utilities
[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets),
[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or
[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to
change your imports:
The `spacy.gold` module has been renamed to `spacy.training` and the conversion
utilities now follow the naming format of `x_to_y`. This mostly affects
internals, but if you've been using the span offset conversion utilities
[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags),
[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or
[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to
change your names and imports:
```diff
- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags
+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags
- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags
+ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans
```
#### Migration notes for plugin maintainers {#migrating-plugins}

View File

@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master'
// Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY
const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
}
/**

View File

@ -1,21 +1,11 @@
{
"languages": [
{
"code": "zh",
"name": "Chinese",
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
"dependencies": [
{
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
},
{ "code": "af", "name": "Afrikaans" },
{ "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "bn", "name": "Bengali", "has_examples": true },
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ "code": "cs", "name": "Czech", "has_examples": true },
{
"code": "da",
"name": "Danish",
@ -23,39 +13,10 @@
"has_examples": true,
"models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
},
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
"starters": [
"en_vectors_web_lg",
"en_trf_bertbaseuncased_lg",
"en_trf_robertabase_lg",
"en_trf_distilbertbaseuncased_lg",
"en_trf_xlnetbasecased_lg"
],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
"example": "C'est une phrase.",
"has_examples": true
},
{
"code": "de",
"name": "German",
"models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
"starters": ["de_trf_bertbasecased_lg"],
"models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
"example": "Dies ist ein Satz.",
"has_examples": true
},
@ -66,6 +27,46 @@
"example": "Αυτή είναι μια πρόταση.",
"has_examples": true
},
{
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
"starters": ["en_vectors_web_lg"],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "et", "name": "Estonian" },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "fa", "name": "Persian", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
"example": "C'est une phrase.",
"has_examples": true
},
{ "code": "ga", "name": "Irish" },
{ "code": "gu", "name": "Gujarati", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
{ "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{ "code": "hy", "name": "Armenian", "has_examples": true },
{
"code": "id",
"name": "Indonesian",
"example": "Ini adalah sebuah kalimat.",
"has_examples": true
},
{ "code": "is", "name": "Icelandic" },
{
"code": "it",
"name": "Italian",
@ -88,12 +89,37 @@
"example": "これは文章です。",
"has_examples": true
},
{ "code": "kn", "name": "Kannada", "has_examples": true },
{
"code": "ko",
"name": "Korean",
"dependencies": [
{
"name": "mecab-ko",
"url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
},
{ "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
{ "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
],
"example": "이것은 문장입니다.",
"has_examples": true
},
{ "code": "lb", "name": "Luxembourgish", "has_examples": true },
{
"code": "lij",
"name": "Ligurian",
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{
"code": "lt",
"name": "Lithuanian",
"has_examples": true,
"models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
},
{ "code": "lv", "name": "Latvian" },
{ "code": "ml", "name": "Malayalam", "has_examples": true },
{ "code": "mr", "name": "Marathi" },
{
"code": "nb",
"name": "Norwegian Bokmål",
@ -101,6 +127,14 @@
"has_examples": true,
"models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
},
{ "code": "ne", "name": "Nepali", "has_examples": true },
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "pl",
"name": "Polish",
@ -122,69 +156,26 @@
"has_examples": true,
"models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{
"code": "ru",
"name": "Russian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{
"code": "uk",
"name": "Ukrainian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
{ "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
{ "code": "fa", "name": "Persian", "has_examples": true },
{ "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
{ "code": "tt", "name": "Tatar", "has_examples": true },
{ "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
{ "code": "sa", "name": "Sanskrit", "has_examples": true },
{ "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
{ "code": "ga", "name": "Irish" },
{ "code": "bn", "name": "Bengali", "has_examples": true },
{ "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
{ "code": "mr", "name": "Marathi" },
{ "code": "kn", "name": "Kannada" },
{ "code": "ta", "name": "Tamil", "has_examples": true },
{
"code": "id",
"name": "Indonesian",
"example": "Ini adalah sebuah kalimat.",
"has_examples": true
},
{ "code": "tl", "name": "Tagalog" },
{ "code": "af", "name": "Afrikaans" },
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "cs", "name": "Czech" },
{ "code": "is", "name": "Icelandic" },
{ "code": "lv", "name": "Latvian" },
{ "code": "sr", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" },
{ "code": "sk", "name": "Slovak", "has_examples": true },
{ "code": "sl", "name": "Slovenian" },
{ "code": "lb", "name": "Luxembourgish" },
{
"code": "sq",
"name": "Albanian",
"example": "Kjo është një fjali.",
"has_examples": true
},
{ "code": "et", "name": "Estonian" },
{ "code": "sr", "name": "Serbian", "has_examples": true },
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "ta", "name": "Tamil", "has_examples": true },
{ "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
{
"code": "th",
"name": "Thai",
@ -194,51 +185,43 @@
"example": "นี่คือประโยค",
"has_examples": true
},
{ "code": "tl", "name": "Tagalog" },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "tt", "name": "Tatar", "has_examples": true },
{
"code": "ko",
"name": "Korean",
"dependencies": [
{
"name": "mecab-ko",
"url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
},
{ "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
{ "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
],
"example": "이것은 문장입니다.",
"has_examples": true
"code": "uk",
"name": "Ukrainian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{ "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
{
"code": "vi",
"name": "Vietnamese",
"dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
},
{
"code": "lij",
"name": "Ligurian",
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{
"code": "hy",
"name": "Armenian",
"has_examples": true
},
{
"code": "gu",
"name": "Gujarati",
"has_examples": true
},
{
"code": "ml",
"name": "Malayalam",
"has_examples": true
},
{
"code": "xx",
"name": "Multi-language",
"models": ["xx_ent_wiki_sm"],
"example": "This is a sentence about Facebook."
},
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{
"code": "zh",
"name": "Chinese",
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
"dependencies": [
{
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
}
],
"licenses": [

View File

@ -1,4 +1,4 @@
import React from 'react'
import React, { Fragment } from 'react'
import PropTypes from 'prop-types'
import classNames from 'classnames'
@ -14,30 +14,34 @@ export default function Infobox({
className,
children,
}) {
const Wrapper = id ? 'div' : Fragment
const infoboxClassNames = classNames(classes.root, className, {
[classes.list]: !!list,
[classes.warning]: variant === 'warning',
[classes.danger]: variant === 'danger',
})
return (
<aside className={infoboxClassNames} id={id}>
{title && (
<h4 className={classes.title}>
{variant !== 'default' && !emoji && (
<Icon width={18} name={variant} inline className={classes.icon} />
)}
<span className={classes.titleText}>
{emoji && (
<span className={classes.emoji} aria-hidden="true">
{emoji}
</span>
<Wrapper>
{id && <a id={id} />}
<aside className={infoboxClassNames}>
{title && (
<h4 className={classes.title}>
{variant !== 'default' && !emoji && (
<Icon width={18} name={variant} inline className={classes.icon} />
)}
{title}
</span>
</h4>
)}
{children}
</aside>
<span className={classes.titleText}>
{emoji && (
<span className={classes.emoji} aria-hidden="true">
{emoji}
</span>
)}
{title}
</span>
</h4>
)}
{children}
</aside>
</Wrapper>
)
}

View File

@ -12,7 +12,6 @@ import Tag from '../components/tag'
import { H2, Label } from '../components/typography'
import Icon from '../components/icon'
import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox'
import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@ -31,10 +30,16 @@ const MODEL_META = {
wiki: 'Wikipedia',
uas: 'Unlabelled dependencies',
las: 'Labelled dependencies',
token_acc: 'Tokenization',
tok: 'Tokenization',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_f: 'Entities (F-score)',
ents_p: 'Entities (precision)',
ents_r: 'Entities (recall)',
tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_f: 'Named entities (F-score)',
ents_p: 'Named entities (precision)',
ents_r: 'Named entities (recall)',
sent_f: 'Sentence segmentation (F-score)',
sent_p: 'Sentence segmentation (precision)',
sent_r: 'Sentence segmentation (recall)',
cpu: 'words per second on CPU',
gpu: 'words per second on GPU',
pipeline: 'Active processing pipeline components in order',
@ -83,25 +88,19 @@ function formatVectors(data) {
}
function formatAccuracy(data) {
if (!data) return null
const labels = {
las: 'LAS',
uas: 'UAS',
tags_acc: 'TAG',
ents_f: 'NER F',
ents_p: 'NER P',
ents_r: 'NER R',
}
const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
const isNer = key => key.startsWith('ents_')
if (!data) return []
return Object.keys(data)
.filter(key => labels[key])
.map(key => ({
label: labels[key],
value: data[key].toFixed(2),
help: MODEL_META[key],
type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
}))
.map(label => {
const value = data[label]
return isNaN(value)
? null
: {
label,
value: value.toFixed(2),
help: MODEL_META[label],
}
})
.filter(item => item)
}
function formatModelMeta(data) {
@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
{ label: 'Author', content: author },
{ label: 'License', content: license },
]
const accuracy = [
{
label: 'Syntax Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
},
{
label: 'NER Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
},
]
const error = (
<Infobox title="Unable to load model details from GitHub" variant="danger">
@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
</p>
</Infobox>
)
return (
<Section id={name}>
<H2
@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)}
</tbody>
</Table>
<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
<Table fixed key={i}>
<thead>
<Tr>
<Th colSpan={2}>{label}</Th>
</Tr>
</thead>
<tbody>
{items.map((item, i) => (
<Tr key={i}>
<Td>
<Label>
{item.label}{' '}
{item.help && <Help>{item.help}</Help>}
</Label>
</Td>
<Td num>{item.value}</Td>
</Tr>
))}
</tbody>
</Table>
)
)}
</Grid>
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
{hasInteractiveCode && (
<CodeBlock title="Try out the model" lang="python" executable={true}>
@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
`import spacy`,
`from spacy.lang.${langId}.examples import sentences `,
``,
`nlp = spacy.load('${name}')`,
`nlp = spacy.load("${name}")`,
`doc = nlp(sentences[0])`,
`print(doc.text)`,
`for token in doc:`,
@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')}
</CodeBlock>
)}
{meta.accuracy && (
<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
<Table>
<tbody>
{meta.accuracy.map(({ label, value, help }) => (
<Tr key={`${name}-${label}`}>
<Td nowrap>
<InlineCode>{label.toUpperCase()}</InlineCode>
</Td>
<Td>{help}</Td>
<Td num style={{ textAlign: 'right' }}>
{value}
</Td>
</Tr>
))}
</tbody>
</Table>
</Accordion>
)}
{labels && (
<Accordion id={`${name}-labels`} title="Label Scheme">
<p>
@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const labelNames = labels[pipe] || []
const help = LABEL_SCHEME_META[pipe]
return (
<Tr key={pipe} evenodd={false} key={pipe}>
<Tr key={`${name}-${pipe}`} evenodd={false}>
<Td style={{ width: '20%' }}>
<Label>
{pipe} {help && <Help>{help}</Help>}
@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({})
const { id, title, meta } = pageContext
const { id, title, meta, hasExamples } = pageContext
const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
const modelTitle = title
const modelTeaser = `Available trained pipelines for ${title}`
const starterTitle = `${title} starters`
const starterTeaser = `Available transfer learning starter packs for ${title}`
@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
baseUrl={baseUrl}
repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples}
/>
))
}

View File

@ -297,7 +297,7 @@ const Landing = ({ data }) => {
to run.
</p>
<p>
<Button to="/usage/facts-figures#benchmarks">See details</Button>
<Button to="/usage/facts-figures#benchmarks">More results</Button>
</p>
</LandingCol>

View File

@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => (
<Td>
{models && models.length ? (
<Link to={`/models/${code}`}>
{models.length} {models.length === 1 ? 'model' : 'models'}
{models.length} {models.length === 1 ? 'package' : 'packages'}
</Link>
) : (
<em>none yet</em>
@ -51,7 +51,7 @@ const Languages = () => (
<Th>Language</Th>
<Th>Code</Th>
<Th>Language Data</Th>
<Th>Models</Th>
<Th>Pipelines</Th>
</Tr>
</thead>
<tbody>

View File

@ -16,7 +16,8 @@ export default function Project({
}) {
const repoArg = repo ? ` --repo ${repo}` : ''
const text = `${COMMAND} ${id}${repoArg}`
const url = `${repo || projectsRepo}/${id}`
const defaultRepo = `https://github.com/${projectsRepo}`
const url = `${repo || defaultRepo}/${id}`
const header = (
<>
{title}:{' '}