Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-09-24 14:16:56 +02:00
commit 921d188bce
82 changed files with 867 additions and 570 deletions

View File

@@ -8,12 +8,12 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and vectors, and
-currently supports tokenization for **59+ languages**. It features
+currently supports tokenization for **60+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
spaCy is commercial open-source software, released under the MIT license.
-💫 **Version 2.3 out now!**
+💫 **Version 3.0 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@@ -30,10 +30,11 @@ spaCy is commercial open-source software, released under the MIT license.
## 📖 Documentation
| Documentation | |
-| --------------- | -------------------------------------------------------------- |
+| ------------------- | -------------------------------------------------------------- |
| [spaCy 101] | New to spaCy? Here's everything you need to know! |
| [Usage Guides] | How to use spaCy and its features. |
| [New in v3.0] | New features, backwards incompatibilities and migration guide. |
+| [Project Templates] | End-to-end workflows you can clone, modify and run. |
| [API Reference] | The detailed reference for spaCy's API. |
| [Models] | Download statistical language models for spaCy. |
| [Universe] | Libraries, extensions, demos, books and courses. |
@@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
+[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
@@ -69,7 +71,7 @@ it.
## Features
-- Support for **59+ languages**
+- Support for **60+ languages**
- **Trained pipelines**
- Multi-task learning with pretrained **transformers** like BERT
- Pretrained **word vectors**

View File

@@ -20,6 +20,7 @@ pytokenizations
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5

View File

@@ -57,6 +57,7 @@ install_requires =
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
+typing_extensions>=3.7.4; python_version < "3.8"
[options.entry_points]
console_scripts =

View File

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
-__version__ = "3.0.0a20"
+__version__ = "3.0.0a23"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
# Looking for this 'rev-list' command in the git --help? Hah.
cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
ret = run_command(cmd, capture=True)
-git_repo = _from_http_to_git(repo)
+git_repo = _http_to_git(repo)
# Now pass those missings into another bit of git internals
missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
if not missings:
@@ -414,7 +414,7 @@ def get_git_version(
return (int(version[0]), int(version[1]))
-def _from_http_to_git(repo: str) -> str:
+def _http_to_git(repo: str) -> str:
if repo.startswith("http://"):
repo = repo.replace(r"http://", r"https://")
if repo.startswith(r"https://"):

View File

@@ -9,7 +9,7 @@ import sys
from ._util import app, Arg, Opt
from ..training import docs_to_json
from ..tokens import DocBin
-from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs
# Converters are matched by file extension except for ner/iob, which are
@@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
# imported from /converters.
CONVERTERS = {
-"conllubio": conllu2docs,
+"conllubio": conllu_to_docs,
-"conllu": conllu2docs,
+"conllu": conllu_to_docs,
-"conll": conllu2docs,
+"conll": conllu_to_docs,
-"ner": conll_ner2docs,
+"ner": conll_ner_to_docs,
-"iob": iob2docs,
+"iob": iob_to_docs,
-"json": json2docs,
+"json": json_to_docs,
}

View File

@@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
-from thinc.config import VARIABLE_RE
+from thinc.config import VARIABLE_RE, ConfigValidationError
import typer
from ._util import Arg, Opt, show_validation_error, parse_config_overrides
@@ -51,7 +51,10 @@ def debug_config(
msg.divider("Config validation")
with show_validation_error(config_path):
config = util.load_config(config_path, overrides=overrides)
-nlp, _ = util.load_model_from_config(config)
+nlp, resolved = util.load_model_from_config(config)
+# Use the resolved config here in case user has one function returning
+# a dict of corpora etc.
+check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
msg.good("Config is valid")
if show_vars:
variables = get_variables(config)
@@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
value = util.dot_to_object(config, path)
result[variable] = repr(value)
return result
+def check_section_refs(config: Config, fields: List[str]) -> None:
+"""Validate fields in the config that refer to other sections or values
+(e.g. in the corpora) and make sure that those references exist.
+"""
+errors = []
+for field in fields:
+# If the field doesn't exist in the config, we ignore it
+try:
+value = util.dot_to_object(config, field)
+except KeyError:
+continue
+try:
+util.dot_to_object(config, value)
+except KeyError:
+msg = f"not a valid section reference: {value}"
+errors.append({"loc": field.split("."), "msg": msg})
+if errors:
+raise ConfigValidationError(config, errors)
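A minimal usage sketch for the new check_section_refs helper (not part of the diff; the config values below are made up for illustration):

from thinc.api import Config
from spacy.cli.debug_config import check_section_refs

# "corpora.dev" resolves to an existing section, "corpora.missing" does not,
# so the second field is reported as an invalid section reference.
cfg = Config().from_str("""
[corpora]

[corpora.dev]
path = "dev.spacy"

[training]
dev_corpus = "corpora.dev"
train_corpus = "corpora.missing"
""")
check_section_refs(cfg, ["training.dev_corpus", "training.train_corpus"])  # raises ConfigValidationError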

View File

@@ -128,7 +128,7 @@ def debug_model(
goldY = None
for e in range(3):
if tok2vec:
-tok2vec.predict(X)
+tok2vec.update([Example.from_dict(x, {}) for x in X])
Y, get_dX = model.begin_update(X)
if goldY is None:
goldY = _simulate_gold(Y)

View File

@@ -36,7 +36,7 @@ def init_config_cli(
"""
Generate a starter config.cfg for training. Based on your requirements
specified via the CLI arguments, this command generates a config with the
-optimal settings for you use case. This includes the choice of architecture,
+optimal settings for your use case. This includes the choice of architecture,
pretrained weights and related hyperparameters.
DOCS: https://nightly.spacy.io/api/cli#init-config

View File

@@ -27,14 +27,20 @@ def project_pull_cli(
def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
+# TODO: We don't have tests for this :(. It would take a bit of mockery to
+# set up. I guess see if it breaks first?
config = load_project_config(project_dir)
if remote in config.get("remotes", {}):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
-for cmd in config.get("commands", []):
+commands = list(config.get("commands", []))
+# We use a while loop here because we don't know how the commands
+# will be ordered. A command might need dependencies from one that's later
+# in the list.
+while commands:
+for i, cmd in enumerate(list(commands)):
deps = [project_dir / dep for dep in cmd.get("deps", [])]
-if any(not dep.exists() for dep in deps):
+if all(dep.exists() for dep in deps):
-continue
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
@@ -43,3 +49,10 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
out_locs = [project_dir / out for out in cmd.get("outputs", [])]
if all(loc.exists() for loc in out_locs):
update_lockfile(project_dir, cmd)
+# We remove the command from the list here, and break, so that
+# we iterate over the loop again.
+commands.remove(i)
+break
+else:
+# If we didn't break the for loop, break the while loop.
+break

View File

@@ -59,7 +59,8 @@ factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
@@ -79,7 +80,8 @@ factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 3
+state_type = "ner"
+extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
@@ -93,6 +95,49 @@ grad_factor = 1.0
@layers = "reduce_mean.v1"
{% endif -%}
+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy-transformers.TransformerListener.v1"
+grad_factor = 1.0
+[components.entity_linker.model.tok2vec.pooling]
+@layers = "reduce_mean.v1"
+{% endif -%}
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+{%- endif %}
+{%- endif %}
{# NON-TRANSFORMER PIPELINE #}
{% else -%}
@@ -140,7 +185,8 @@ factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
@@ -157,7 +203,8 @@ factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
@@ -167,10 +214,50 @@ nO = null
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.encode.width}
{% endif %}
+{% if "entity_linker" in components -%}
+[components.entity_linker]
+factory = "entity_linker"
+get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
+incl_context = true
+incl_prior = true
+[components.entity_linker.model]
+@architectures = "spacy.EntityLinker.v1"
+nO = null
+[components.entity_linker.model.tok2vec]
+@architectures = "spacy.Tok2VecListener.v1"
+width = ${components.tok2vec.model.encode.width}
+{% endif %}
+{% if "textcat" in components %}
+[components.textcat]
+factory = "textcat"
+{% if optimize == "accuracy" %}
+[components.textcat.model]
+@architectures = "spacy.TextCatEnsemble.v1"
+exclusive_classes = false
+width = 64
+conv_depth = 2
+embed_size = 2000
+window_size = 1
+ngram_size = 1
+nO = null
+{% else -%}
+[components.textcat.model]
+@architectures = "spacy.TextCatBOW.v1"
+exclusive_classes = false
+ngram_size = 1
+no_output_layer = false
+{%- endif %}
+{%- endif %}
{% endif %}
{% for pipe in components %}
-{% if pipe not in ["tagger", "parser", "ner"] %}
+{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
{# Other components defined by the user: we just assume they're factories #}
[components.{{ pipe }}]
factory = "{{ pipe }}"
@@ -197,7 +284,7 @@ vectors = "{{ word_vectors }}"
{% endif -%}
{% if use_transformer -%}
accumulate_gradient = {{ transformer["size_factor"] }}
-{% endif %}
+{% endif -%}
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
@@ -230,18 +317,3 @@ start = 100
stop = 1000
compound = 1.001
{% endif %}
-[training.score_weights]
-{%- if "tagger" in components %}
-tag_acc = {{ (1.0 / components|length)|round(2) }}
-{%- endif -%}
-{%- if "parser" in components %}
-dep_uas = 0.0
-dep_las = {{ (1.0 / components|length)|round(2) }}
-sents_f = 0.0
-{%- endif %}
-{%- if "ner" in components %}
-ents_f = {{ (1.0 / components|length)|round(2) }}
-ents_p = 0.0
-ents_r = 0.0
-{%- endif -%}

View File

@@ -152,6 +152,7 @@ def train(
exclude=frozen_components,
)
msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
+with nlp.select_pipes(disable=frozen_components):
print_row, finalize_logger = train_logger(nlp)
try:
@@ -163,6 +164,7 @@ def train(
progress.close()
print_row(info)
if is_best_checkpoint and output_path is not None:
+with nlp.select_pipes(disable=frozen_components):
update_meta(T_cfg, nlp, info)
with nlp.use_params(optimizer.averages):
nlp.to_disk(output_path / "model-best")
@@ -207,10 +209,17 @@ def create_train_batches(iterator, batcher, max_epochs: int):
def create_evaluation_callback(
nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
+weights = {key: value for key, value in weights.items() if value is not None}
def evaluate() -> Tuple[float, Dict[str, float]]:
dev_examples = list(dev_corpus(nlp))
scores = nlp.evaluate(dev_examples)
-# Calculate a weighted sum based on score_weights for the main score
+# Calculate a weighted sum based on score_weights for the main score.
+# We can only consider scores that are ints/floats, not dicts like
+# entity scores per type etc.
+for key, value in scores.items():
+if key in weights and not isinstance(value, (int, float)):
+raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
try:
weighted_score = sum(
scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights
@@ -366,6 +375,7 @@ def update_meta(
) -> None:
nlp.meta["performance"] = {}
for metric in training["score_weights"]:
+if metric is not None:
nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
for pipe_name in nlp.pipe_names:
nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

View File

@@ -22,6 +22,11 @@ try:
except ImportError:
cupy = None
+try: # Python 3.8+
+from typing import Literal
+except ImportError:
+from typing_extensions import Literal # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle
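For illustration (hypothetical snippet, not part of the commit), the shim lets code annotate arguments with Literal on Python 3.7 without importing typing_extensions at the call site, the way build_tb_parser_model does further down in this diff:

from spacy.compat import Literal

def describe(state_type: Literal["parser", "ner"]) -> str:
    # Pydantic-based config validation can now reject any other value.
    return f"building a {state_type} model"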

View File

@@ -69,7 +69,7 @@ class Warnings:
"in problems with the vocab further on in the pipeline.")
W030 = ("Some entities could not be aligned in the text \"{text}\" with "
"entities \"{entities}\". Use "
-"`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+"`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
" to check the alignment. Misaligned entities ('-') will be "
"ignored during training.")
W033 = ("Training a new {model} using a model with no lexeme normalization "
@@ -480,6 +480,13 @@ class Errors:
E201 = ("Span index out of range.")
# TODO: fix numbering after merging develop into master
+E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
+"float or int but got: {score_type}. To exclude the score from the "
+"final score, set its weight to null in the [training.score_weights] "
+"section of your training config.")
+E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
+E917 = ("Received invalid value {value} for 'state_type' in "
+"TransitionBasedParser: only 'parser' or 'ner' are valid options.")
E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
"values are an instance of spacy.vocab.Vocab or True to create one"
" (default).")

View File

@@ -140,7 +140,6 @@ cdef class KnowledgeBase:
self._entries.push_back(entry)
self._aliases_table.push_back(alias)
-cpdef from_disk(self, loc)
cpdef set_entities(self, entity_list, freq_list, vector_list)

View File

@@ -9,7 +9,8 @@ from libcpp.vector cimport vector
from pathlib import Path
import warnings
-from os import path
+from spacy import util
from .typedefs cimport hash_t
from .errors import Errors, Warnings
@@ -319,8 +320,14 @@ cdef class KnowledgeBase:
return 0.0
-def to_disk(self, loc):
+def to_disk(self, path):
-cdef Writer writer = Writer(loc)
+path = util.ensure_path(path)
+if path.is_dir():
+raise ValueError(Errors.E928.format(loc=path))
+if not path.parent.exists():
+path.parent.mkdir(parents=True)
+cdef Writer writer = Writer(path)
writer.write_header(self.get_size_entities(), self.entity_vector_length)
# dumping the entity vectors in their original order
@@ -359,7 +366,13 @@ cdef class KnowledgeBase:
writer.close()
-cpdef from_disk(self, loc):
+def from_disk(self, path):
+path = util.ensure_path(path)
+if path.is_dir():
+raise ValueError(Errors.E928.format(loc=path))
+if not path.exists():
+raise ValueError(Errors.E929.format(loc=path))
cdef hash_t entity_hash
cdef hash_t alias_hash
cdef int64_t entry_index
@@ -369,7 +382,7 @@ cdef class KnowledgeBase:
cdef AliasC alias
cdef float vector_element
-cdef Reader reader = Reader(loc)
+cdef Reader reader = Reader(path)
# STEP 0: load header and initialize KB
cdef int64_t nr_entities
@@ -450,16 +463,13 @@ cdef class KnowledgeBase:
cdef class Writer:
-def __init__(self, object loc):
+def __init__(self, path):
-if isinstance(loc, Path):
+assert isinstance(path, Path)
-loc = bytes(loc)
+content = bytes(path)
-if path.exists(loc):
+cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
-if path.isdir(loc):
-raise ValueError(Errors.E928.format(loc=loc))
-cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self._fp = fopen(<char*>bytes_loc, 'wb')
if not self._fp:
-raise IOError(Errors.E146.format(path=loc))
+raise IOError(Errors.E146.format(path=path))
fseek(self._fp, 0, 0)
def close(self):
@@ -496,14 +506,9 @@ cdef class Writer:
cdef class Reader:
-def __init__(self, object loc):
+def __init__(self, path):
-if isinstance(loc, Path):
+content = bytes(path)
-loc = bytes(loc)
+cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
-if not path.exists(loc):
-raise ValueError(Errors.E929.format(loc=loc))
-if path.isdir(loc):
-raise ValueError(Errors.E928.format(loc=loc))
-cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
self._fp = fopen(<char*>bytes_loc, 'rb')
if not self._fp:
PyErr_SetFromErrno(IOError)

View File

@@ -25,7 +25,6 @@ class Bengali(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -30,7 +30,6 @@ class Greek(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -29,7 +29,6 @@ class English(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -28,7 +28,6 @@ class Persian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -33,7 +33,6 @@ class French(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -28,7 +28,6 @@ class Norwegian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -30,7 +30,6 @@ class Dutch(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -35,7 +35,6 @@ class Polish(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pos_lookup", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -25,7 +25,6 @@ class Russian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -31,7 +31,6 @@ class Swedish(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "rule", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -25,7 +25,6 @@ class Ukrainian(Language):
"lemmatizer",
assigns=["token.lemma"],
default_config={"model": None, "mode": "pymorphy2", "lookups": None},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -248,8 +248,11 @@ class Language:
self._config["nlp"]["pipeline"] = list(self.component_names)
self._config["nlp"]["disabled"] = list(self.disabled)
self._config["components"] = pipeline
-if not self._config["training"].get("score_weights"):
-combined_score_weights = combine_score_weights(score_weights)
+# We're merging the existing score weights back into the combined
+# weights to make sure we're preserving custom settings in the config
+# but also reflect updates (e.g. new components added)
+prev_weights = self._config["training"].get("score_weights", {})
+combined_score_weights = combine_score_weights(score_weights, prev_weights)
self._config["training"]["score_weights"] = combined_score_weights
if not srsly.is_json_serializable(self._config):
raise ValueError(Errors.E961.format(config=self._config))
@@ -412,7 +415,6 @@ class Language:
assigns: Iterable[str] = SimpleFrozenList(),
requires: Iterable[str] = SimpleFrozenList(),
retokenizes: bool = False,
-scores: Iterable[str] = SimpleFrozenList(),
default_score_weights: Dict[str, float] = SimpleFrozenDict(),
func: Optional[Callable] = None,
) -> Callable:
@@ -430,12 +432,11 @@ class Language:
e.g. "token.ent_id". Used for pipeline analyis.
retokenizes (bool): Whether the component changes the tokenization.
Used for pipeline analysis.
-scores (Iterable[str]): All scores set by the component if it's trainable,
-e.g. ["ents_f", "ents_r", "ents_p"].
default_score_weights (Dict[str, float]): The scores to report during
training, and their default weight towards the final score used to
select the best model. Weights should sum to 1.0 per component and
-will be combined and normalized for the whole pipeline.
+will be combined and normalized for the whole pipeline. If None,
+the score won't be shown in the logs or be weighted.
func (Optional[Callable]): Factory function if not used as a decorator.
DOCS: https://nightly.spacy.io/api/language#factory
@@ -475,7 +476,7 @@ class Language:
default_config=default_config,
assigns=validate_attrs(assigns),
requires=validate_attrs(requires),
-scores=scores,
+scores=list(default_score_weights.keys()),
default_score_weights=default_score_weights,
retokenizes=retokenizes,
)

View File

@@ -2,6 +2,8 @@ from typing import Optional, List
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from thinc.types import Floats2d
+from ...errors import Errors
+from ...compat import Literal
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
@@ -11,7 +13,8 @@ from ...tokens import Doc
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]],
-nr_feature_tokens: int,
+state_type: Literal["parser", "ner"],
+extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
use_upper: bool = True,
@@ -40,20 +43,12 @@ def build_tb_parser_model(
tok2vec (Model[List[Doc], List[Floats2d]]):
Subnetwork to map tokens into vector representations.
-nr_feature_tokens (int): The number of tokens in the context to use to
-construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
-2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
-feature sets are designed for the NER. The recommended feature sets are
-3 for NER, and 8 for the dependency parser.
-TODO: This feature should be split into two, state_type: ["deps", "ner"]
-and extra_state_features: [True, False]. This would map into:
-(deps, False): 8
-(deps, True): 13
-(ner, False): 3
-(ner, True): 6
+state_type (str):
+String value denoting the type of parser model: "parser" or "ner"
+extra_state_tokens (bool): Whether or not to use additional tokens in the context
+to construct the state vector. Defaults to `False`, which means 3 and 8
+for the NER and parser respectively. When set to `True`, this would become 6
+feature sets (for the NER) or 13 (for the parser).
hidden_width (int): The width of the hidden layer.
maxout_pieces (int): How many pieces to use in the state prediction layer.
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
@@ -68,8 +63,14 @@ def build_tb_parser_model(
Usually inferred from data at the beginning of training, or loaded from
disk.
"""
+if state_type == "parser":
+nr_feature_tokens = 13 if extra_state_tokens else 8
+elif state_type == "ner":
+nr_feature_tokens = 6 if extra_state_tokens else 3
+else:
+raise ValueError(Errors.E917.format(value=state_type))
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
-tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
+tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
tok2vec.set_dim("nO", hidden_width)
lower = PrecomputableAffine(
nO=hidden_width if use_upper else nO,

View File

@@ -15,7 +15,8 @@ from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 8
+state_type = "parser"
+extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
@@ -42,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
"min_action_freq": 30,
"model": DEFAULT_PARSER_MODEL,
},
-scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"],
-default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
+default_score_weights={
+"dep_uas": 0.5,
+"dep_las": 0.5,
+"dep_las_per_type": None,
+"sents_p": None,
+"sents_r": None,
+"sents_f": 0.0,
+},
)
def make_parser(
nlp: Language,

View File

@@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
"overwrite_ents": False,
"ent_id_sep": DEFAULT_ENT_ID_SEP,
},
-scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
-default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
+default_score_weights={
+"ents_f": 1.0,
+"ents_p": 0.0,
+"ents_r": 0.0,
+"ents_per_type": None,
+},
)
def make_entity_ruler(
nlp: Language,

View File

@@ -21,7 +21,6 @@ from .. import util
"lookups": None,
"overwrite": False,
},
-scores=["lemma_acc"],
default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

View File

@@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
"morphologizer",
assigns=["token.morph", "token.pos"],
default_config={"model": DEFAULT_MORPH_MODEL},
-scores=["pos_acc", "morph_acc", "morph_per_feat"],
-default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5},
+default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
)
def make_morphologizer(
nlp: Language,

View File

@@ -13,7 +13,8 @@ from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 6
+state_type = "ner"
+extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
@@ -38,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"update_with_oracle_cut_size": 100,
"model": DEFAULT_NER_MODEL,
},
-scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
-default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
+default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
def make_ner(

View File

@@ -15,7 +15,6 @@ from .. import util
"sentencizer",
assigns=["token.is_sent_start", "doc.sents"],
default_config={"punct_chars": None},
-scores=["sents_p", "sents_r", "sents_f"],
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(

View File

@@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
"senter",
assigns=["token.is_sent_start"],
default_config={"model": DEFAULT_SENTER_MODEL},
-scores=["sents_p", "sents_r", "sents_f"],
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_senter(nlp: Language, name: str, model: Model):

View File

@@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
"tagger",
assigns=["token.tag"],
default_config={"model": DEFAULT_TAGGER_MODEL},
-scores=["tag_acc"],
default_score_weights={"tag_acc": 1.0},
)
def make_tagger(nlp: Language, name: str, model: Model):

View File

@@ -62,18 +62,17 @@ subword_features = true
"positive_label": None,
"model": DEFAULT_TEXTCAT_MODEL,
},
-scores=[
-"cats_score",
-"cats_score_desc",
-"cats_p",
-"cats_r",
-"cats_f",
-"cats_macro_f",
-"cats_macro_auc",
-"cats_f_per_type",
-"cats_macro_auc_per_type",
-],
-default_score_weights={"cats_score": 1.0},
+default_score_weights={
+"cats_score": 1.0,
+"cats_score_desc": None,
+"cats_p": None,
+"cats_r": None,
+"cats_f": None,
+"cats_macro_f": None,
+"cats_macro_auc": None,
+"cats_f_per_type": None,
+"cats_macro_auc_per_type": None,
+},
)
def make_textcat(
nlp: Language,

View File

@@ -127,7 +127,7 @@ class Tok2Vec(Pipe):
tokvecs = self.model.predict(docs)
batch_id = Tok2VecListener.get_batch_id(docs)
for listener in self.listeners:
-listener.receive(batch_id, tokvecs, None)
+listener.receive(batch_id, tokvecs, lambda dX: [])
return tokvecs
def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:

View File

@@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel):
seed: Optional[StrictInt] = Field(..., title="Random seed")
gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
-score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
+score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
optimizer: Optimizer = Field(..., title="The optimizer to use")

View File

@@ -240,7 +240,7 @@ class Scorer:
pred_per_feat[field].add((gold_i, feat))
for field in per_feat:
per_feat[field].score_set(
-pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
+pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
)
result = {k: v.to_dict() for k, v in per_feat.items()}
return {f"{attr}_per_feat": result}
@@ -418,9 +418,9 @@ class Scorer:
f_per_type[pred_label].fp += 1
micro_prf = PRFScore()
for label_prf in f_per_type.values():
-micro_prf.tp = label_prf.tp
+micro_prf.tp += label_prf.tp
-micro_prf.fn = label_prf.fn
+micro_prf.fn += label_prf.fn
-micro_prf.fp = label_prf.fp
+micro_prf.fp += label_prf.fp
n_cats = len(f_per_type) + 1e-100
macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats

View File

@@ -144,6 +144,29 @@ def test_kb_empty(nlp):
entity_linker.begin_training(lambda: [])
+def test_kb_serialize(nlp):
+"""Test serialization of the KB"""
+mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
+with make_tempdir() as d:
+# normal read-write behaviour
+mykb.to_disk(d / "kb")
+mykb.from_disk(d / "kb")
+mykb.to_disk(d / "kb.file")
+mykb.from_disk(d / "kb.file")
+mykb.to_disk(d / "new" / "kb")
+mykb.from_disk(d / "new" / "kb")
+# allow overwriting an existing file
+mykb.to_disk(d / "kb.file")
+with pytest.raises(ValueError):
+# can not write to a directory
+mykb.to_disk(d)
+with pytest.raises(ValueError):
+# can not read from a directory
+mykb.from_disk(d)
+with pytest.raises(ValueError):
+# can not read from an unknown file
+mykb.from_disk(d / "unknown" / "kb")
def test_candidate_generation(nlp):
"""Test correct candidate generation"""
mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

View File

@@ -359,12 +359,8 @@ def test_language_factories_scores():
func = lambda nlp, name: lambda doc: doc
weights1 = {"a1": 0.5, "a2": 0.5}
weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
-Language.factory(
-f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
-)
-Language.factory(
-f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
-)
+Language.factory(f"{name}1", default_score_weights=weights1, func=func)
+Language.factory(f"{name}2", default_score_weights=weights2, func=func)
meta1 = Language.get_factory_meta(f"{name}1")
assert meta1.default_score_weights == weights1
meta2 = Language.get_factory_meta(f"{name}2")
@@ -376,6 +372,21 @@ def test_language_factories_scores():
cfg = nlp.config["training"]
expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
assert cfg["score_weights"] == expected_weights
+# Test with custom defaults
+config = nlp.config.copy()
+config["training"]["score_weights"]["a1"] = 0.0
+config["training"]["score_weights"]["b3"] = 1.0
+nlp = English.from_config(config)
+score_weights = nlp.config["training"]["score_weights"]
+expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+assert score_weights == expected
+# Test with null values
+config = nlp.config.copy()
+config["training"]["score_weights"]["a1"] = None
+nlp = English.from_config(config)
+score_weights = nlp.config["training"]["score_weights"]
+expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+assert score_weights == expected
def test_pipe_factories_from_source():

View File

@@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
+from spacy.scorer import Scorer
from ..util import make_tempdir
from ...cli.train import verify_textcat_config
@@ -224,3 +225,31 @@ def test_positive_class_not_binary():
assert textcat.labels == ("SOME", "THING", "POS")
with pytest.raises(ValueError):
verify_textcat_config(nlp, pipe_config)
+def test_textcat_evaluation():
+train_examples = []
+nlp = English()
+ref1 = nlp("one")
+ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0}
+pred1 = nlp("one")
+pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
+train_examples.append(Example(pred1, ref1))
+ref2 = nlp("two")
+ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
+pred2 = nlp("two")
+pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
+train_examples.append(Example(pred2, ref2))
+scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
+assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
+assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
+assert scores["cats_f_per_type"]["summer"]["p"] == 0
+assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
+assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
+assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
+assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
+assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2
+assert scores["cats_micro_p"] == 4/5
+assert scores["cats_micro_r"] == 4/6
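For reference, the micro-averaged figures asserted above work out as follows: across the two docs there are 4 true positives (winter, spring and autumn in the first doc, autumn in the second), 1 false positive (winter in the second doc) and 2 false negatives (summer in the first doc, spring in the second), giving micro precision 4/5 and micro recall 4/6.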

View File

@@ -169,3 +169,22 @@ def test_tok2vec_listener():
nlp.select_pipes(disable="tok2vec")
assert nlp.pipe_names == ["tagger"]
nlp("Running the pipeline with the Tok2Vec component disabled.")
+def test_tok2vec_listener_callback():
+orig_config = Config().from_str(cfg_string)
+nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
+assert nlp.pipe_names == ["tok2vec", "tagger"]
+tagger = nlp.get_pipe("tagger")
+tok2vec = nlp.get_pipe("tok2vec")
+nlp._link_components()
+docs = [nlp.make_doc("A random sentence")]
+tok2vec.model.initialize(X=docs)
+gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
+label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
+tagger.model.initialize(X=docs, Y=label_sample)
+docs = [nlp.make_doc("Another entirely random sentence")]
+tok2vec.update([Example.from_dict(x, {}) for x in docs])
+Y, get_dX = tagger.model.begin_update(docs)
+# assure that the backprop call works (and doesn't hit a 'None' callback)
+assert get_dX(Y) is not None

View File

@@ -3,7 +3,7 @@ from spacy.pipeline import Pipe
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example, Corpus
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model
@@ -425,7 +425,7 @@ def test_issue4402():
attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
with make_tempdir() as tmpdir:
output_file = tmpdir / "test4402.spacy"
-docs = json2docs([json_data])
+docs = json_to_docs([json_data])
data = DocBin(docs=docs, attrs=attrs).to_bytes()
with output_file.open("wb") as file_:
file_.write(data)

View File

@@ -1,7 +1,7 @@
import pytest
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example
-from spacy.training.converters.conllu2docs import conllu2docs
+from spacy.training.converters.conllu_to_docs import conllu_to_docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab
@@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr():
def test_issue4665():
"""
-conllu2json should not raise an exception if the HEAD column contains an
+conllu_to_docs should not raise an exception if the HEAD column contains an
underscore
"""
input_data = """
@@ -105,7 +105,7 @@ def test_issue4665():
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
-conllu2docs(input_data)
+conllu_to_docs(input_data)
def test_issue4674():

View File

@@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
parser_config_string = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
-nr_feature_tokens = 99
+state_type = "parser"
+extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
@@ -95,7 +96,11 @@ def my_parser():
MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
)
parser = build_tb_parser_model(
-tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
+tok2vec=tok2vec,
+state_type="parser",
+extra_state_tokens=True,
+hidden_width=65,
+maxout_pieces=5,
)
return parser
@@ -340,3 +345,13 @@ def test_config_auto_fill_extra_fields():
assert "extra" not in nlp.config["training"]
# Make sure the config generated is valid
load_model_from_config(nlp.config)
+def test_config_validate_literal():
+nlp = English()
+config = Config().from_str(parser_config_string)
+config["model"]["state_type"] = "nonsense"
+with pytest.raises(ConfigValidationError):
+nlp.add_pipe("parser", config=config)
+config["model"]["state_type"] = "ner"
+nlp.add_pipe("parser", config=config)

View File

@@ -1,20 +1,21 @@
import pytest
from click import NoSuchOption
-from spacy.training import docs_to_json, biluo_tags_from_offsets
+from spacy.training import docs_to_json, offsets_to_biluo_tags
-from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
+from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
-from thinc.config import ConfigValidationError
+from spacy.cli.debug_config import check_section_refs
+from thinc.config import ConfigValidationError, Config
import srsly
import os
from .util import make_tempdir


-def test_cli_converters_conllu2json():
+def test_cli_converters_conllu_to_docs():
    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",
@@ -23,7 +24,7 @@ def test_cli_converters_conllu2json():
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    input_data = "\n".join(lines)
-    converted_docs = conllu2docs(input_data, n_sents=1)
+    converted_docs = conllu_to_docs(input_data, n_sents=1)
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0
@@ -39,7 +40,7 @@ def test_cli_converters_conllu2json():
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
@@ -62,9 +63,9 @@ def test_cli_converters_conllu2json():
        ),
    ],
)
-def test_cli_converters_conllu2json_name_ner_map(lines):
+def test_cli_converters_conllu_to_docs_name_ner_map(lines):
    input_data = "\n".join(lines)
-    converted_docs = conllu2docs(
+    converted_docs = conllu_to_docs(
        input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
    )
    assert len(converted_docs) == 1
@@ -83,11 +84,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]


-def test_cli_converters_conllu2json_subtokens():
+def test_cli_converters_conllu_to_docs_subtokens():
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",
@@ -98,7 +99,7 @@ def test_cli_converters_conllu2json_subtokens():
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    input_data = "\n".join(lines)
-    converted_docs = conllu2docs(
+    converted_docs = conllu_to_docs(
        input_data, n_sents=1, merge_subtokens=True, append_morphology=True
    )
    assert len(converted_docs) == 1
@@ -132,11 +133,11 @@ def test_cli_converters_conllu2json_subtokens():
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
-    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
+    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "U-PER", "O", "O"]


-def test_cli_converters_iob2json():
+def test_cli_converters_iob_to_docs():
    lines = [
        "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
        "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
@@ -144,7 +145,7 @@ def test_cli_converters_iob2json():
        "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
    ]
    input_data = "\n".join(lines)
-    converted_docs = iob2docs(input_data, n_sents=10)
+    converted_docs = iob_to_docs(input_data, n_sents=10)
    assert len(converted_docs) == 1
    converted = docs_to_json(converted_docs)
    assert converted["id"] == 0
@@ -161,7 +162,7 @@ def test_cli_converters_iob2json():
        assert ent.text in ["New York City", "London"]


-def test_cli_converters_conll_ner2json():
+def test_cli_converters_conll_ner_to_docs():
    lines = [
        "-DOCSTART- -X- O O",
        "",
@@ -211,7 +212,7 @@ def test_cli_converters_conll_ner2json():
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
-    converted_docs = conll_ner2docs(input_data, n_sents=10)
+    converted_docs = conll_ner_to_docs(input_data, n_sents=10)
    assert len(converted_docs) == 1
    converted = docs_to_json(converted_docs)
    assert converted["id"] == 0
@@ -413,3 +414,15 @@ def test_string_to_list(value):
def test_string_to_list_intify(value):
    assert string_to_list(value, intify=False) == ["1", "2", "3"]
    assert string_to_list(value, intify=True) == [1, 2, 3]
+
+
+def test_check_section_refs():
+    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
+    config = Config(config)
+    # Valid section reference
+    check_section_refs(config, ["a.b.c"])
+    # Section that doesn't exist in this config
+    check_section_refs(config, ["x.y.z"])
+    # Invalid section reference
+    with pytest.raises(ConfigValidationError):
+        check_section_refs(config, ["a.b.c", "f.g"])

View File

@@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.training import Example
-from spacy.training.iob_utils import biluo_tags_from_offsets
+from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from spacy.lang.en import English
@@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab):
            words=input_.split(" "),
            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
        )
-        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
        # a hack for sentence boundaries
        example.predicted[1].is_sent_start = False
@@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab):
            words=input_.split(" "),
            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
        )
-        entities = biluo_tags_from_offsets(doc, annot["entities"])
+        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
        # a hack for sentence boundaries
        example.predicted[1].is_sent_start = False

View File

@@ -1,9 +1,9 @@
import numpy
-from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
+from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
-from spacy.training import spans_from_biluo_tags, iob_to_biluo
+from spacy.training import biluo_tags_to_spans, iob_to_biluo
from spacy.training import Corpus, docs_to_json
from spacy.training.example import Example
-from spacy.training.converters import json2docs
+from spacy.training.converters import json_to_docs
from spacy.training.augment import make_orth_variants_example
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
@@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab):
    spaces = [True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to London"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "U-LOC", "O"]
@@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab):
    spaces = [True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]
@@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab):
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
-    tags = biluo_tags_from_offsets(doc, entities)
+    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
@@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab):
        (len("I flew to "), len("I flew to San Francisco"), "LOC"),
    ]
    with pytest.raises(ValueError):
-        biluo_tags_from_offsets(doc, entities)
+        offsets_to_biluo_tags(doc, entities)


def test_gold_biluo_misalign(en_vocab):
@@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab):
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    with pytest.warns(UserWarning):
-        tags = biluo_tags_from_offsets(doc, entities)
+        tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]
@@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab):
@pytest.mark.filterwarnings("ignore::UserWarning")
-def test_json2docs_no_ner(en_vocab):
+def test_json_to_docs_no_ner(en_vocab):
    data = [
        {
            "id": 1,
@@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab):
            ],
        }
    ]
-    docs = json2docs(data)
+    docs = json_to_docs(data)
    assert len(docs) == 1
    for doc in docs:
        assert not doc.has_annotation("ENT_IOB")
@@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
-    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
+    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
    assert biluo_tags_converted == biluo_tags
-    offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
+    offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
    offsets_converted = [ent for ent in offsets if ent[2]]
    assert offsets_converted == offsets
@@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
def test_biluo_spans(en_tokenizer):
    doc = en_tokenizer("I flew to Silicon Valley via London.")
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
-    spans = spans_from_biluo_tags(doc, biluo_tags)
+    spans = biluo_tags_to_spans(doc, biluo_tags)
    spans = [span for span in spans if span.label_]
    assert len(spans) == 2
    assert spans[0].text == "Silicon Valley"
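
As a quick illustration of the renamed helper tested above, here is a minimal sketch (not part of the diff) that assumes spaCy v3 and a blank English pipeline:

```python
import spacy
from spacy.training import biluo_tags_to_spans

nlp = spacy.blank("en")
doc = nlp("I flew to Silicon Valley via London.")
tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
# Convert per-token BILUO tags into Span objects and overwrite doc.ents
doc.ents = biluo_tags_to_spans(doc, tags)
assert [(ent.text, ent.label_) for ent in doc.ents] == [
    ("Silicon Valley", "LOC"),
    ("London", "GPE"),
]
```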

View File

@@ -2,8 +2,8 @@ from .corpus import Corpus  # noqa: F401
from .example import Example, validate_examples  # noqa: F401
from .align import Alignment  # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
+from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
-from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
from .gold_io import docs_to_json, read_json_file  # noqa: F401
from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
from .loggers import console_logger, wandb_logger  # noqa: F401

View File

@@ -1,4 +1,4 @@
-from .iob2docs import iob2docs  # noqa: F401
+from .iob_to_docs import iob_to_docs  # noqa: F401
-from .conll_ner2docs import conll_ner2docs  # noqa: F401
+from .conll_ner_to_docs import conll_ner_to_docs  # noqa: F401
-from .json2docs import json2docs  # noqa: F401
+from .json_to_docs import json_to_docs  # noqa: F401
-from .conllu2docs import conllu2docs  # noqa: F401
+from .conllu_to_docs import conllu_to_docs  # noqa: F401

View File

@@ -7,7 +7,7 @@ from ...tokens import Doc, Span
from ...util import load_model


-def conll_ner2docs(
+def conll_ner_to_docs(
    input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
):
    """

View File

@@ -1,13 +1,13 @@
import re
-from .conll_ner2docs import n_sents_info
+from .conll_ner_to_docs import n_sents_info
-from ...training import iob_to_biluo, spans_from_biluo_tags
+from ...training import iob_to_biluo, biluo_tags_to_spans
from ...tokens import Doc, Token, Span
from ...vocab import Vocab
from wasabi import Printer


-def conllu2docs(
+def conllu_to_docs(
    input_data,
    n_sents=10,
    append_morphology=False,
@@ -78,7 +78,7 @@ def read_conllx(
        if lines:
            while lines[0].startswith("#"):
                lines.pop(0)
-            doc = doc_from_conllu_sentence(
+            doc = conllu_sentence_to_doc(
                vocab,
                lines,
                ner_tag_pattern,
@@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
    return iob_to_biluo(iob)


-def doc_from_conllu_sentence(
+def conllu_sentence_to_doc(
    vocab,
    lines,
    ner_tag_pattern,
@@ -215,7 +215,7 @@ def doc_from_conllu_sentence(
        doc[i]._.merged_lemma = lemmas[i]
        doc[i]._.merged_spaceafter = spaces[i]
    ents = get_entities(lines, ner_tag_pattern, ner_map)
-    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.ents = biluo_tags_to_spans(doc, ents)
    if merge_subtokens:
        doc = merge_conllu_subtokens(lines, doc)
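
For context, a hedged sketch of calling the renamed converter directly; the three-line CoNLL-U fragment below is made up for illustration and is not taken from the diff:

```python
from spacy.training.converters import conllu_to_docs

# Hypothetical CoNLL-U fragment: 10 tab-separated columns per token,
# with the NER tag in the MISC column.
conllu = (
    "1\tI\tI\tPRON\t_\t_\t2\tnsubj\t_\tO\n"
    "2\tlike\tlike\tVERB\t_\t_\t0\troot\t_\tO\n"
    "3\tLondon\tLondon\tPROPN\t_\t_\t2\tobj\t_\tB-GPE\n"
)
docs = list(conllu_to_docs(conllu, n_sents=1))
# Expect a single Doc with "London" recognized as a GPE entity
assert [(ent.text, ent.label_) for ent in docs[0].ents] == [("London", "GPE")]
```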

View File

@@ -1,13 +1,13 @@
from wasabi import Printer
-from .conll_ner2docs import n_sents_info
+from .conll_ner_to_docs import n_sents_info
from ...vocab import Vocab
from ...training import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span
from ...util import minibatch


-def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
    """
    Convert IOB files with one sentence per line and tags separated with '|'
    into Doc objects so they can be saved. IOB and IOB2 are accepted.
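
A minimal sketch of the renamed IOB converter in use (the sample sentence is invented, mirroring the format exercised by the CLI tests above):

```python
from spacy.training.converters import iob_to_docs

# One sentence per line, token|tag pairs in IOB format
sample = "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O"
docs = list(iob_to_docs(sample, n_sents=10))
assert [ent.text for ent in docs[0].ents] == ["London", "New York City"]
```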

View File

@@ -1,12 +1,12 @@
import srsly
from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations2doc
+from ..example import annotations_to_doc
from ..example import _fix_legacy_dict_data, _parse_example_dict_data
from ...util import load_model
from ...lang.xx import MultiLanguage


-def json2docs(input_data, model=None, **kwargs):
+def json_to_docs(input_data, model=None, **kwargs):
    nlp = load_model(model) if model is not None else MultiLanguage()
    if not isinstance(input_data, bytes):
        if not isinstance(input_data, str):
@@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
    for json_para in json_to_annotations(json_doc):
        example_dict = _fix_legacy_dict_data(json_para)
        tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-        doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+        doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict)
        docs.append(doc)
    return docs

View File

@@ -7,13 +7,13 @@ from ..tokens.span cimport Span
from ..tokens.span import Span
from ..attrs import IDS
from .align import Alignment
-from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
+from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
-from .iob_utils import spans_from_biluo_tags
+from .iob_utils import biluo_tags_to_spans
from ..errors import Errors, Warnings
from ..pipeline._parser_internals import nonproj


-cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
+cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
    """ Create a Doc from dictionaries with token and doc annotations. """
    attrs, array = _annot2array(vocab, tok_annot, doc_annot)
    output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
@@ -92,7 +92,7 @@ cdef class Example:
        tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
        return Example(
            predicted,
-            annotations2doc(predicted.vocab, tok_dict, doc_dict)
+            annotations_to_doc(predicted.vocab, tok_dict, doc_dict)
        )

    @property
@@ -176,7 +176,7 @@ cdef class Example:
            return [None] * len(self.x)  # should this be 'missing' instead of 'None' ?
        x_ents = self.get_aligned_spans_y2x(self.y.ents)
        # Default to 'None' for missing values
-        x_tags = biluo_tags_from_offsets(
+        x_tags = offsets_to_biluo_tags(
            self.x,
            [(e.start_char, e.end_char, e.label_) for e in x_ents],
            missing=None
@@ -195,7 +195,7 @@ cdef class Example:
        return {
            "doc_annotation": {
                "cats": dict(self.reference.cats),
-                "entities": biluo_tags_from_doc(self.reference),
+                "entities": doc_to_biluo_tags(self.reference),
                "links": self._links_to_dict()
            },
            "token_annotation": {
@@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data):
    elif isinstance(ner_data[0], tuple):
        return _add_entities_to_doc(
            doc,
-            biluo_tags_from_offsets(doc, ner_data)
+            offsets_to_biluo_tags(doc, ner_data)
        )
    elif isinstance(ner_data[0], str) or ner_data[0] is None:
        return _add_entities_to_doc(
            doc,
-            spans_from_biluo_tags(doc, ner_data)
+            biluo_tags_to_spans(doc, ner_data)
        )
    elif isinstance(ner_data[0], Span):
        # Ugh, this is super messy. Really hard to set O entities
@@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
        # This is annoying but to convert the offsets we need a Doc
        # that has the target tokenization.
        reference = Doc(vocab, words=words, spaces=spaces)
-        biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
+        biluo = offsets_to_biluo_tags(reference, biluo_or_offsets)
    else:
        biluo = biluo_or_offsets
    ent_iobs = []

View File

@@ -3,7 +3,7 @@ import srsly
from .. import util
from ..errors import Warnings
from ..tokens import Doc
-from .iob_utils import biluo_tags_from_offsets, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags, tags_to_entities
import json


@@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
            if ent.kb_id_:
                link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                json_para["links"].append(link_dict)
-        biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
+        biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
        attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
        include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
        for j, sent in enumerate(doc.sents):

View File

@@ -1,9 +1,11 @@
+from typing import List, Tuple, Iterable, Union, Iterator
import warnings

from ..errors import Errors, Warnings
-from ..tokens import Span
+from ..tokens import Span, Doc


-def iob_to_biluo(tags):
+def iob_to_biluo(tags: Iterable[str]) -> List[str]:
    out = []
    tags = list(tags)
    while tags:
@@ -12,7 +14,7 @@ def iob_to_biluo(tags):
    return out


-def biluo_to_iob(tags):
+def biluo_to_iob(tags: Iterable[str]) -> List[str]:
    out = []
    for tag in tags:
        if tag is None:
@@ -23,12 +25,12 @@ def biluo_to_iob(tags):
    return out


-def _consume_os(tags):
+def _consume_os(tags: List[str]) -> Iterator[str]:
    while tags and tags[0] == "O":
        yield tags.pop(0)


-def _consume_ent(tags):
+def _consume_ent(tags: List[str]) -> List[str]:
    if not tags:
        return []
    tag = tags.pop(0)
@@ -50,15 +52,17 @@ def _consume_ent(tags):
    return [start] + middle + [end]


-def biluo_tags_from_doc(doc, missing="O"):
+def doc_to_biluo_tags(doc: Doc, missing: str = "O"):
-    return biluo_tags_from_offsets(
+    return offsets_to_biluo_tags(
        doc,
        [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
        missing=missing,
    )


-def biluo_tags_from_offsets(doc, entities, missing="O"):
+def offsets_to_biluo_tags(
+    doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
+) -> List[str]:
    """Encode labelled spans into per-token tags, using the
    Begin/In/Last/Unit/Out scheme (BILUO).
@@ -69,7 +73,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
        the original string.
    RETURNS (list): A list of unicode strings, describing the tags. Each tag
        string will be of the form either "", "O" or "{action}-{label}", where
-        action is one of "B", "I", "L", "U". The string "-" is used where the
+        action is one of "B", "I", "L", "U". The missing label is used where the
        entity offsets don't align with the tokenization in the `Doc` object.
        The training algorithm will view these as missing values. "O" denotes a
        non-entity token. "B" denotes the beginning of a multi-token entity,
@@ -80,12 +84,11 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
        >>> text = 'I like London.'
        >>> entities = [(len('I like '), len('I like London'), 'LOC')]
        >>> doc = nlp.tokenizer(text)
-        >>> tags = biluo_tags_from_offsets(doc, entities)
+        >>> tags = offsets_to_biluo_tags(doc, entities)
        >>> assert tags == ["O", "O", 'U-LOC', "O"]
    """
    # Ensure no overlapping entity labels exist
    tokens_in_ents = {}
    starts = {token.idx: token.i for token in doc}
    ends = {token.idx + len(token): token.i for token in doc}
    biluo = ["-" for _ in doc]
@@ -109,7 +112,6 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
            )
        )
        tokens_in_ents[token_index] = (start_char, end_char, label)
        start_token = starts.get(start_char)
        end_token = ends.get(end_char)
        # Only interested if the tokenization is correct
@@ -143,7 +145,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
    return biluo


-def spans_from_biluo_tags(doc, tags):
+def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
    """Encode per-token tags following the BILUO scheme into Span object, e.g.
    to overwrite the doc.ents.
@@ -161,7 +163,9 @@ def spans_from_biluo_tags(doc, tags):
    return spans


-def offsets_from_biluo_tags(doc, tags):
+def biluo_tags_to_offsets(
+    doc: Doc, tags: Iterable[str]
+) -> List[Tuple[int, int, Union[str, int]]]:
    """Encode per-token tags following the BILUO scheme into entity offsets.

    doc (Doc): The document that the BILUO tags refer to.
@@ -172,11 +176,11 @@ def offsets_from_biluo_tags(doc, tags):
        `end` will be character-offset integers denoting the slice into the
        original string.
    """
-    spans = spans_from_biluo_tags(doc, tags)
+    spans = biluo_tags_to_spans(doc, tags)
    return [(span.start_char, span.end_char, span.label_) for span in spans]


-def tags_to_entities(tags):
+def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
    """Note that the end index returned by this function is inclusive.
    To use it for Span creation, increment the end by 1."""
    entities = []
@@ -209,3 +213,9 @@ def tags_to_entities(tags):
        else:
            raise ValueError(Errors.E068.format(tag=tag))
    return entities
+
+
+# Fallbacks to make backwards-compat easier
+offsets_from_biluo_tags = biluo_tags_to_offsets
+spans_from_biluo_tags = biluo_tags_to_spans
+biluo_tags_from_offsets = offsets_to_biluo_tags
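
To make the rename concrete, here is a minimal sketch (not part of the diff, assuming spaCy v3 and a blank English pipeline) of the new names; because of the aliases above, code importing the old names keeps working:

```python
import spacy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets

nlp = spacy.blank("en")
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]
# Character offsets -> per-token BILUO tags
tags = offsets_to_biluo_tags(doc, entities)
assert tags == ["O", "O", "U-LOC", "O"]
# ...and back to character offsets
assert biluo_tags_to_offsets(doc, tags) == [(7, 13, "LOC")]
```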

View File

@@ -11,9 +11,12 @@ def console_logger():
    def setup_printer(
        nlp: "Language",
    ) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
-        score_cols = list(nlp.config["training"]["score_weights"])
+        # we assume here that only components are enabled that should be trained & logged
+        logged_pipes = nlp.pipe_names
+        score_weights = nlp.config["training"]["score_weights"]
+        score_cols = [col for col, value in score_weights.items() if value is not None]
        score_widths = [max(len(col), 6) for col in score_cols]
-        loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
+        loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
        loss_widths = [max(len(col), 8) for col in loss_cols]
        table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
        table_header = [col.upper() for col in table_header]
@@ -26,7 +29,7 @@ def console_logger():
            try:
                losses = [
                    "{0:.2f}".format(float(info["losses"][pipe_name]))
-                    for pipe_name in nlp.pipe_names
+                    for pipe_name in logged_pipes
                ]
            except KeyError as e:
                raise KeyError(
@@ -38,10 +41,15 @@ def console_logger():
                ) from None
            scores = []
            for col in score_cols:
-                score = float(info["other_scores"].get(col, 0.0))
+                score = info["other_scores"].get(col, 0.0)
+                try:
+                    score = float(score)
                    if col != "speed":
                        score *= 100
                    scores.append("{0:.2f}".format(score))
+                except TypeError:
+                    err = Errors.E916.format(name=col, score_type=type(score))
+                    raise ValueError(err) from None
            data = (
                [info["epoch"], info["step"]]
                + losses

View File

@@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
# Default order of sections in the config.cfg. Not all sections needs to exist,
# and additional sections are added at the end, in alphabetical order.
# fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
# fmt: on
@@ -1202,21 +1202,38 @@ def get_arg_names(func: Callable) -> List[str]:
    return list(set([*argspec.args, *argspec.kwonlyargs]))


-def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
+def combine_score_weights(
+    weights: List[Dict[str, float]],
+    overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
+) -> Dict[str, float]:
    """Combine and normalize score weights defined by components, e.g.
    {"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.

    weights (List[dict]): The weights defined by the components.
+    overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
+        should be preserved.
    RETURNS (Dict[str, float]): The combined and normalized weights.
    """
+    # We first need to extract all None/null values for score weights that
+    # shouldn't be shown in the table *or* be weighted
    result = {}
+    all_weights = []
    for w_dict in weights:
+        filtered_weights = {}
+        for key, value in w_dict.items():
+            value = overrides.get(key, value)
+            if value is None:
+                result[key] = None
+            else:
+                filtered_weights[key] = value
+        all_weights.append(filtered_weights)
+    for w_dict in all_weights:
        # We need to account for weights that don't sum to 1.0 and normalize
        # the score weights accordingly, then divide score by the number of
        # components.
        total = sum(w_dict.values())
        for key, value in w_dict.items():
-            weight = round(value / total / len(weights), 2)
+            weight = round(value / total / len(all_weights), 2)
            result[key] = result.get(key, 0.0) + weight
    return result
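
To illustrate the new behavior, a hedged sketch with made-up weights (chosen so the arithmetic is exact): a `None` weight is carried through unchanged and excluded from normalization, while the remaining weights are normalized per component and divided by the number of weighted components.

```python
from spacy.util import combine_score_weights

weights = [{"tag_acc": 1.0}, {"ents_f": 1.0, "speed": None}]
combined = combine_score_weights(weights)
# Each non-None weight is normalized within its component and split across
# the two weighted components; "speed" stays None and is never weighted.
assert combined == {"tag_acc": 0.5, "ents_f": 0.5, "speed": None}
```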

View File

@@ -414,7 +414,8 @@ one component.
> ```ini
> [model]
> @architectures = "spacy.TransitionBasedParser.v1"
-> nr_feature_tokens = 6
+> state_type = "ner"
+> extra_state_tokens = false
> hidden_width = 64
> maxout_pieces = 2
>
@@ -447,9 +448,10 @@ consists of either two or three subnetworks:
as action scores directly.

| Name                 | Description |
| -------------------- | ----------- |
| `tok2vec`            | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
-| `nr_feature_tokens`  | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
+| `state_type`         | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
+| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
| `hidden_width`       | The width of the hidden layer. ~~int~~ |
| `maxout_pieces`      | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
| `use_upper`          | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |

View File

@@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy
> entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
> the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
> representing a `PERSON` entity. The
-> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
> can help you convert entity offsets to the right format.

```python

View File

@@ -146,15 +146,14 @@ examples, see the
> ```

| Name                    | Description |
| ----------------------- | ----------- |
| `name`                  | The name of the component factory. ~~str~~ |
| _keyword-only_          | |
| `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
-| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
-| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
+| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
| `func`                  | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |

## Language.\_\_call\_\_ {#call tag="method"}

@@ -1037,11 +1036,11 @@ component is defined and stored on the `Language` class for each component
instance and factory instance.

| Name                    | Description |
| ----------------------- | ----------- |
| `factory`               | The name of the registered component factory. ~~str~~ |
| `default_config`        | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns`               | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires`              | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes`           | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
-| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
-| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
+| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
+| `scores`                | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |

View File

@@ -619,7 +619,7 @@ sequences in the batch.

## Training data and alignment {#gold source="spacy/training"}

-### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
+### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}

Encode labelled spans into per-token tags, using the
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
@@ -632,14 +632,20 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or
more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a
single-token entity.

+<Infobox title="Changed in v3.0" variant="warning" id="biluo_tags_from_offsets">
+
+This method was previously available as `spacy.gold.biluo_tags_from_offsets`.
+
+</Infobox>
+
> #### Example
>
> ```python
-> from spacy.training import biluo_tags_from_offsets
+> from spacy.training import offsets_to_biluo_tags
>
> doc = nlp("I like London.")
> entities = [(7, 13, "LOC")]
-> tags = biluo_tags_from_offsets(doc, entities)
+> tags = offsets_to_biluo_tags(doc, entities)
> assert tags == ["O", "O", "U-LOC", "O"]
> ```

@@ -647,21 +653,28 @@ single-token entity.
| ----------- | ----------- |
| `doc`       | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ |
| `entities`  | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
+| `missing`   | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ |
| **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |

-### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
+### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}

Encode per-token tags following the
[BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.

+<Infobox title="Changed in v3.0" variant="warning" id="offsets_from_biluo_tags">
+
+This method was previously available as `spacy.gold.offsets_from_biluo_tags`.
+
+</Infobox>
+
> #### Example
>
> ```python
-> from spacy.training import offsets_from_biluo_tags
+> from spacy.training import biluo_tags_to_offsets
>
> doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
-> entities = offsets_from_biluo_tags(doc, tags)
+> entities = biluo_tags_to_offsets(doc, tags)
> assert entities == [(7, 13, "LOC")]
> ```

@@ -671,21 +684,27 @@ Encode per-token tags following the
| `entities`  | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |

-### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
+### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"}

Encode per-token tags following the
[BILUO scheme](/usage/linguistic-features#accessing-ner) into
[`Span`](/api/span) objects. This can be used to create entity spans from
token-based tags, e.g. to overwrite the `doc.ents`.

+<Infobox title="Changed in v3.0" variant="warning" id="spans_from_biluo_tags">
+
+This method was previously available as `spacy.gold.spans_from_biluo_tags`.
+
+</Infobox>
+
> #### Example
>
> ```python
-> from spacy.training import spans_from_biluo_tags
+> from spacy.training import biluo_tags_to_spans
>
> doc = nlp("I like London.")
> tags = ["O", "O", "U-LOC", "O"]
-> doc.ents = spans_from_biluo_tags(doc, tags)
+> doc.ents = biluo_tags_to_spans(doc, tags)
> ```

| Name | Description |

View File

@@ -1,24 +1,19 @@
import { Help } from 'components/typography'; import Link from 'components/link'

-<!-- TODO: update, add project template -->
+<!-- TODO: update numbers -->

<figure>

-| System                                                      | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
+| Pipeline                                                    | Parser | Tagger |  NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
| ----------------------------------------------------------- | -----: | -----: | ---: | -------------------------------------------------------------------: | -----------------------------------------------------------------: |
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3)  |        |        |      |                                                                       |                                                                  6k |
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3)    |        |        |      |                                                                       |                                                                     |
| `en_core_web_lg` (spaCy v2)                                 |   91.9 |   97.2 | 85.9 |                                                                   10k |                                                                     |
-| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | _n/a_<sup>2</sup> | _n/a_<sup>2</sup> | 88.8 | 234 | 2k |
-| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link> | - | 97.9 | 89.3 | | |

<figcaption class="caption">

-**Accuracy and speed on the
-[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**<br />**1. **
-[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_:
-Qi et al. don't report parsing and tagging results on OntoNotes. We're working
-on training Stanza on this corpus to allow direct comparison.
+**Full pipeline accuracy and speed** on the
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.

</figcaption>
@@ -26,18 +21,24 @@ on training Stanza on this corpus to allow direct comparison.

<figure>

-| System                                                                         |  POS |  UAS |  LAS |
-| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
-| spaCy RoBERTa (2020)                                                            |      |      |      |
-| spaCy CNN (2020)                                                                |      |      |      |
-| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019)  | 97.3 | 97.4 | 96.3 |
-| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019)              | 97.3 | 97.2 | 95.7 |
+| Named Entity Recognition System                                                 | OntoNotes | CoNLL '03 |
+| ------------------------------------------------------------------------------- | --------: | --------: |
+| spaCy RoBERTa (2020)                                                             |           |      92.2 |
+| spaCy CNN (2020)                                                                 |      85.3 |      88.4 |
+| spaCy CNN (2017)                                                                 |      86.4 |           |
+| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup>        |      88.8 |      92.1 |
+| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup>   |      89.7 |      93.1 |
+| BERT Base<sup>3</sup>                                                            |         - |      92.4 |

<figcaption class="caption">

-**Accuracy on the Penn Treebank.** See
-[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
-results.
+**Named entity recognition accuracy** on the
+[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and
+[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
+[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
+more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
+**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3.
+** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).

</figcaption>

View File

@ -235,8 +235,6 @@ The `Transformer` component sets the
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute, [`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
which lets you access the transformers outputs at runtime. which lets you access the transformers outputs at runtime.
<!-- TODO: update/confirm once we have final models trained -->
```cli ```cli
$ python -m spacy download en_core_trf_lg $ python -m spacy download en_core_trf_lg
``` ```
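
For example, a minimal sketch of reading that attribute at runtime, assuming a transformer-based pipeline such as `en_core_web_trf` is installed (the sentence is illustrative):

```python
import spacy

# Assumes a transformer-based pipeline like en_core_web_trf is installed.
nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")

# The Transformer component stores its output on the Doc._.trf_data extension.
trf_data = doc._.trf_data
print(len(trf_data.tensors))      # output arrays produced by the transformer model
print(trf_data.tensors[0].shape)  # e.g. hidden states for the wordpiece tokens
```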
@ -448,7 +446,8 @@ factory = "ner"
[nlp.pipeline.ner.model] [nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1" @architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3 state_type = "ner"
extra_state_tokens = false
hidden_width = 128 hidden_width = 128
maxout_pieces = 3 maxout_pieces = 3
use_upper = false use_upper = false
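
To see how the renamed settings look in practice, you can inspect the resolved model config of an installed v3 pipeline. A minimal sketch, assuming `en_core_web_sm` (v3) is installed; the exact values depend on the pipeline:

```python
import spacy

# Assumes a v3 pipeline such as en_core_web_sm is installed; values vary by pipeline.
nlp = spacy.load("en_core_web_sm")
ner_model = nlp.config["components"]["ner"]["model"]
print(ner_model["@architectures"])      # e.g. "spacy.TransitionBasedParser.v1"
print(ner_model["state_type"])          # "ner"; together with extra_state_tokens,
print(ner_model["extra_state_tokens"])  # this replaces the old nr_feature_tokens setting
```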

View File

@ -61,12 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
<Benchmarks /> <Benchmarks />
<Project id="benchmarks/parsing_penn_treebank"> <figure>
The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone | Dependency Parsing System | UAS | LAS |
our project template. | ------------------------------------------------------------------------------ | ---: | ---: |
| spaCy RoBERTa (2020)<sup>1</sup> | 96.8 | 95.0 |
| spaCy CNN (2020)<sup>1</sup> | 93.7 | 91.8 |
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
</Project> <figcaption class="caption">
**Dependency parsing accuracy** on the Penn Treebank. See
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
results. **1. ** Project template:
[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
</figcaption>
</figure>
<!-- TODO: ## Citing spaCy {#citation} <!-- TODO: ## Citing spaCy {#citation}

View File

@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
component that only provides sentence boundaries. Along with being faster and component that only provides sentence boundaries. Along with being faster and
smaller than the parser, its primary advantage is that it's easier to train smaller than the parser, its primary advantage is that it's easier to train
because it only requires annotated sentence boundaries rather than full because it only requires annotated sentence boundaries rather than full
dependency parses. dependency parses. spaCy's [trained pipelines](/models) include both a parser
and a trained sentence segmenter, which is
<!-- TODO: update/confirm usage once we have final models trained --> [disabled](/usage/processing-pipelines#disabling) by default. If you only need
sentence boundaries and no parser, you can use the `enable` and `disable`
arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
disable the parser.
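
For example, a minimal sketch that loads the pipeline without running the parser and switches the sentence recognizer on explicitly, assuming an English v3 pipeline such as `en_core_web_sm` that ships both components:

```python
import spacy

# Assumes an English v3 pipeline such as en_core_web_sm that includes both
# a parser and a senter component.
nlp = spacy.load("en_core_web_sm", disable=["parser"])
nlp.enable_pipe("senter")  # turn on the statistical sentence segmenter
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```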
> #### senter vs. parser > #### senter vs. parser
> >

View File

@ -253,8 +253,6 @@ different mechanisms you can use:
Disabled and excluded component names can be provided to Disabled and excluded component names can be provided to
[`spacy.load`](/api/top-level#spacy.load) as a list. [`spacy.load`](/api/top-level#spacy.load) as a list.
<!-- TODO: update with info on our models shipped with optional components -->
> #### 💡 Optional pipeline components > #### 💡 Optional pipeline components
> >
> The `disable` mechanism makes it easy to distribute pipeline packages with > The `disable` mechanism makes it easy to distribute pipeline packages with
@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to
> your pipeline may include a statistical _and_ a rule-based component for > your pipeline may include a statistical _and_ a rule-based component for
> sentence segmentation, and you can choose which one to run depending on your > sentence segmentation, and you can choose which one to run depending on your
> use case. > use case.
>
> For example, spaCy's [trained pipelines](/models) like
> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and
> `senter` that perform sentence segmentation, but the `senter` is disabled by
> default.
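
As a quick check, you can inspect which components were loaded and which are currently disabled. A minimal sketch, assuming `en_core_web_sm` (v3) is installed:

```python
import spacy

# Assumes en_core_web_sm (v3) is installed.
nlp = spacy.load("en_core_web_sm")
print(nlp.component_names)  # all components included in the pipeline
print(nlp.pipe_names)       # components that will actually run
print(nlp.disabled)         # components loaded but disabled, e.g. ["senter"]
```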
```python ```python
# Load the pipeline without the entity recognizer # Load the pipeline without the entity recognizer
@ -1501,7 +1504,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline
component function and pass it the token texts from the `Doc` object received by component function and pass it the token texts from the `Doc` object received by
the component. the component.
The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) function is very The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) function is very
helpful here, because it takes a `Doc` object and token-based BILUO tags and helpful here, because it takes a `Doc` object and token-based BILUO tags and
returns a sequence of `Span` objects in the `Doc` with added labels. So all your returns a sequence of `Span` objects in the `Doc` with added labels. So all your
wrapper has to do is compute the entity spans and overwrite the `doc.ents`. wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
@ -1516,14 +1519,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
```python ```python
### {highlight="1,8-9"} ### {highlight="1,8-9"}
import your_custom_entity_recognizer import your_custom_entity_recognizer
from spacy.training import offsets_from_biluo_tags from spacy.training import biluo_tags_to_spans
from spacy.language import Language from spacy.language import Language
@Language.component("custom_ner_wrapper") @Language.component("custom_ner_wrapper")
def custom_ner_wrapper(doc): def custom_ner_wrapper(doc):
words = [token.text for token in doc] words = [token.text for token in doc]
custom_entities = your_custom_entity_recognizer(words) custom_entities = your_custom_entity_recognizer(words)
doc.ents = spans_from_biluo_tags(doc, custom_entities) doc.ents = biluo_tags_to_spans(doc, custom_entities)
return doc return doc
``` ```
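
Put together, a self-contained sketch of the wrapper, using a stub in place of your custom recognizer (the stub, the blank English pipeline and the example sentence are illustrative):

```python
from spacy.lang.en import English
from spacy.language import Language
from spacy.training import biluo_tags_to_spans

def your_custom_entity_recognizer(words):
    # Stand-in for an external model: tag "London" as a single-token location.
    return ["U-LOC" if word == "London" else "O" for word in words]

@Language.component("custom_ner_wrapper")
def custom_ner_wrapper(doc):
    words = [token.text for token in doc]
    custom_entities = your_custom_entity_recognizer(words)
    doc.ents = biluo_tags_to_spans(doc, custom_entities)
    return doc

nlp = English()
nlp.add_pipe("custom_ner_wrapper")
doc = nlp("I like London.")
print([(ent.text, ent.label_) for ent in doc.ents])  # [('London', 'LOC')]
```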

View File

@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI
pipelines. pipelines.
```yaml ```yaml
https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml %%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
``` ```
| Section | Description | | Section | Description |
@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC.
<Infobox title="This section is still under construction" emoji="🚧" variant="warning"> <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
The Prodigy integration will require a nightly version of Prodigy that supports The Prodigy integration will require a nightly version of Prodigy that supports
spaCy v3+. spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by
exporting your data with
[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
[`spacy convert`](/api/cli#convert) to convert it to the binary format.
</Infobox> </Infobox>

View File

@ -470,6 +470,7 @@ score.
```ini ```ini
[training.score_weights] [training.score_weights]
dep_las = 0.4 dep_las = 0.4
dep_uas = null
ents_f = 0.4 ents_f = 0.4
tag_acc = 0.2 tag_acc = 0.2
token_acc = 0.0 token_acc = 0.0
@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by
combining and normalizing the default score weights of the pipeline components. combining and normalizing the default score weights of the pipeline components.
The default score weights are defined by each pipeline component via the The default score weights are defined by each pipeline component via the
`default_score_weights` setting on the `default_score_weights` setting on the
[`@Language.component`](/api/language#component) or [`@Language.factory`](/api/language#factory) decorator. By default, all pipeline
[`@Language.factory`](/api/language#factory). By default, all pipeline components are weighted equally. If a score weight is set to `null`, it will be
components are weighted equally. excluded from the logs and the score won't be weighted.
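
For example, a minimal sketch of a custom factory declaring its own score weights; the factory name and component logic are hypothetical, and the score names follow the `sents_*` convention used for sentence segmentation:

```python
from spacy.language import Language

# A hypothetical component factory declaring how its scores contribute to the
# weighted overall score used to select the best model during training.
@Language.factory(
    "my_sentence_marker",
    default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def create_my_sentence_marker(nlp, name):
    def my_sentence_marker(doc):
        for token in doc[1:]:
            # Naive rule: a new sentence starts after a period.
            token.is_sent_start = doc[token.i - 1].text == "."
        return doc
    return my_sentence_marker
```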
<Accordion title="Understanding the training output and score types" spaced> <Accordion title="Understanding the training output and score types" spaced>

View File

@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
- **Architectures: ** [TransformerModel](/api/architectures#TransformerModel), - **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
[TransformerListener](/api/architectures#TransformerListener), [TransformerListener](/api/architectures#TransformerListener),
[Tok2VecTransformer](/api/architectures#Tok2VecTransformer) [Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf) - **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf),
[`de_dep_news_trf`](/models/de#de_dep_news_trf),
[`es_dep_news_trf`](/models/es#es_dep_news_trf),
[`fr_dep_news_trf`](/models/fr#fr_dep_news_trf)
- **Implementation:** - **Implementation:**
[`spacy-transformers`](https://github.com/explosion/spacy-transformers) [`spacy-transformers`](https://github.com/explosion/spacy-transformers)
@ -549,12 +552,14 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
### Removed or renamed API {#incompat-removed} ### Removed or renamed API {#incompat-removed}
| Removed | Replacement | | Removed | Replacement |
| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | | -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) | | `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
| `GoldParse` | [`Example`](/api/example) | | `GoldParse` | [`Example`](/api/example) |
| `GoldCorpus` | [`Corpus`](/api/corpus) | | `GoldCorpus` | [`Corpus`](/api/corpus) |
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) | | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed | | `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) |
| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) | | `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) |
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) | | `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) | | `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
@ -968,16 +973,17 @@ python -m spacy package ./output ./packages
#### Data utilities and gold module {#migrating-gold} #### Data utilities and gold module {#migrating-gold}
The `spacy.gold` module has been renamed to `spacy.training`. This mostly The `spacy.gold` module has been renamed to `spacy.training` and the conversion
affects internals, but if you've been using the span offset conversion utilities utilities now follow the naming format of `x_to_y`. This mostly affects
[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets), internals, but if you've been using the span offset conversion utilities
[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags),
[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to [`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or
change your imports: [`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to
change your names and imports:
```diff ```diff
- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags - from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags
+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags + from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans
``` ```
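
The renamed helpers work as before, only the names now read input-to-output. A quick sketch of the round trip with a blank English pipeline (the sentence and offsets are illustrative):

```python
from spacy.lang.en import English
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans

nlp = English()
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]  # character offsets for "London"

tags = offsets_to_biluo_tags(doc, entities)          # ["O", "O", "U-LOC", "O"]
assert biluo_tags_to_offsets(doc, tags) == entities  # back to character offsets
doc.ents = biluo_tags_to_spans(doc, tags)
print([(ent.text, ent.label_) for ent in doc.ents])  # [('London', 'LOC')]
```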
#### Migration notes for plugin maintainers {#migrating-plugins} #### Migration notes for plugin maintainers {#migrating-plugins}

View File

@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master'
// Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY // Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY
const replacements = { const replacements = {
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`, GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
} }
/** /**

View File

@ -1,21 +1,11 @@
{ {
"languages": [ "languages": [
{ { "code": "af", "name": "Afrikaans" },
"code": "zh", { "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
"name": "Chinese", { "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"], { "code": "bn", "name": "Bengali", "has_examples": true },
"dependencies": [ { "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ { "code": "cs", "name": "Czech", "has_examples": true },
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
},
{ {
"code": "da", "code": "da",
"name": "Danish", "name": "Danish",
@ -23,39 +13,10 @@
"has_examples": true, "has_examples": true,
"models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"] "models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
}, },
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
"starters": [
"en_vectors_web_lg",
"en_trf_bertbaseuncased_lg",
"en_trf_robertabase_lg",
"en_trf_distilbertbaseuncased_lg",
"en_trf_xlnetbasecased_lg"
],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
"example": "C'est une phrase.",
"has_examples": true
},
{ {
"code": "de", "code": "de",
"name": "German", "name": "German",
"models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"], "models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
"starters": ["de_trf_bertbasecased_lg"],
"example": "Dies ist ein Satz.", "example": "Dies ist ein Satz.",
"has_examples": true "has_examples": true
}, },
@ -66,6 +27,46 @@
"example": "Αυτή είναι μια πρόταση.", "example": "Αυτή είναι μια πρόταση.",
"has_examples": true "has_examples": true
}, },
{
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
"starters": ["en_vectors_web_lg"],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "et", "name": "Estonian" },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "fa", "name": "Persian", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
"example": "C'est une phrase.",
"has_examples": true
},
{ "code": "ga", "name": "Irish" },
{ "code": "gu", "name": "Gujarati", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
{ "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{ "code": "hy", "name": "Armenian", "has_examples": true },
{
"code": "id",
"name": "Indonesian",
"example": "Ini adalah sebuah kalimat.",
"has_examples": true
},
{ "code": "is", "name": "Icelandic" },
{ {
"code": "it", "code": "it",
"name": "Italian", "name": "Italian",
@ -88,12 +89,37 @@
"example": "これは文章です。", "example": "これは文章です。",
"has_examples": true "has_examples": true
}, },
{ "code": "kn", "name": "Kannada", "has_examples": true },
{
"code": "ko",
"name": "Korean",
"dependencies": [
{
"name": "mecab-ko",
"url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
},
{ "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
{ "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
],
"example": "이것은 문장입니다.",
"has_examples": true
},
{ "code": "lb", "name": "Luxembourgish", "has_examples": true },
{
"code": "lij",
"name": "Ligurian",
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{ {
"code": "lt", "code": "lt",
"name": "Lithuanian", "name": "Lithuanian",
"has_examples": true, "has_examples": true,
"models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"] "models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
}, },
{ "code": "lv", "name": "Latvian" },
{ "code": "ml", "name": "Malayalam", "has_examples": true },
{ "code": "mr", "name": "Marathi" },
{ {
"code": "nb", "code": "nb",
"name": "Norwegian Bokmål", "name": "Norwegian Bokmål",
@ -101,6 +127,14 @@
"has_examples": true, "has_examples": true,
"models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"] "models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
}, },
{ "code": "ne", "name": "Nepali", "has_examples": true },
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{ {
"code": "pl", "code": "pl",
"name": "Polish", "name": "Polish",
@ -122,69 +156,26 @@
"has_examples": true, "has_examples": true,
"models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"] "models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
}, },
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{ {
"code": "ru", "code": "ru",
"name": "Russian", "name": "Russian",
"has_examples": true, "has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }] "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
}, },
{ { "code": "sa", "name": "Sanskrit", "has_examples": true },
"code": "uk",
"name": "Ukrainian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
{ "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
{ "code": "fa", "name": "Persian", "has_examples": true },
{ "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
{ "code": "tt", "name": "Tatar", "has_examples": true },
{ "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
{ "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true }, { "code": "si", "name": "Sinhala", "example": "මෙය වාක්‍යයකි.", "has_examples": true },
{ "code": "ga", "name": "Irish" }, { "code": "sk", "name": "Slovak", "has_examples": true },
{ "code": "bn", "name": "Bengali", "has_examples": true },
{ "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
{ "code": "mr", "name": "Marathi" },
{ "code": "kn", "name": "Kannada" },
{ "code": "ta", "name": "Tamil", "has_examples": true },
{
"code": "id",
"name": "Indonesian",
"example": "Ini adalah sebuah kalimat.",
"has_examples": true
},
{ "code": "tl", "name": "Tagalog" },
{ "code": "af", "name": "Afrikaans" },
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "cs", "name": "Czech" },
{ "code": "is", "name": "Icelandic" },
{ "code": "lv", "name": "Latvian" },
{ "code": "sr", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" },
{ "code": "sl", "name": "Slovenian" }, { "code": "sl", "name": "Slovenian" },
{ "code": "lb", "name": "Luxembourgish" },
{ {
"code": "sq", "code": "sq",
"name": "Albanian", "name": "Albanian",
"example": "Kjo është një fjali.", "example": "Kjo është një fjali.",
"has_examples": true "has_examples": true
}, },
{ "code": "et", "name": "Estonian" }, { "code": "sr", "name": "Serbian", "has_examples": true },
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "ta", "name": "Tamil", "has_examples": true },
{ "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
{ {
"code": "th", "code": "th",
"name": "Thai", "name": "Thai",
@ -194,51 +185,43 @@
"example": "นี่คือประโยค", "example": "นี่คือประโยค",
"has_examples": true "has_examples": true
}, },
{ "code": "tl", "name": "Tagalog" },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "tt", "name": "Tatar", "has_examples": true },
{ {
"code": "ko", "code": "uk",
"name": "Korean", "name": "Ukrainian",
"dependencies": [ "has_examples": true,
{ "dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
"name": "mecab-ko",
"url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
},
{ "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
{ "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
],
"example": "이것은 문장입니다.",
"has_examples": true
}, },
{ "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
{ {
"code": "vi", "code": "vi",
"name": "Vietnamese", "name": "Vietnamese",
"dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }] "dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
}, },
{
"code": "lij",
"name": "Ligurian",
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{
"code": "hy",
"name": "Armenian",
"has_examples": true
},
{
"code": "gu",
"name": "Gujarati",
"has_examples": true
},
{
"code": "ml",
"name": "Malayalam",
"has_examples": true
},
{ {
"code": "xx", "code": "xx",
"name": "Multi-language", "name": "Multi-language",
"models": ["xx_ent_wiki_sm"], "models": ["xx_ent_wiki_sm"],
"example": "This is a sentence about Facebook." "example": "This is a sentence about Facebook."
},
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{
"code": "zh",
"name": "Chinese",
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
"dependencies": [
{
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
} }
], ],
"licenses": [ "licenses": [

View File

@ -1,4 +1,4 @@
import React from 'react' import React, { Fragment } from 'react'
import PropTypes from 'prop-types' import PropTypes from 'prop-types'
import classNames from 'classnames' import classNames from 'classnames'
@ -14,13 +14,16 @@ export default function Infobox({
className, className,
children, children,
}) { }) {
const Wrapper = id ? 'div' : Fragment
const infoboxClassNames = classNames(classes.root, className, { const infoboxClassNames = classNames(classes.root, className, {
[classes.list]: !!list, [classes.list]: !!list,
[classes.warning]: variant === 'warning', [classes.warning]: variant === 'warning',
[classes.danger]: variant === 'danger', [classes.danger]: variant === 'danger',
}) })
return ( return (
<aside className={infoboxClassNames} id={id}> <Wrapper>
{id && <a id={id} />}
<aside className={infoboxClassNames}>
{title && ( {title && (
<h4 className={classes.title}> <h4 className={classes.title}>
{variant !== 'default' && !emoji && ( {variant !== 'default' && !emoji && (
@ -38,6 +41,7 @@ export default function Infobox({
)} )}
{children} {children}
</aside> </aside>
</Wrapper>
) )
} }

View File

@ -12,7 +12,6 @@ import Tag from '../components/tag'
import { H2, Label } from '../components/typography' import { H2, Label } from '../components/typography'
import Icon from '../components/icon' import Icon from '../components/icon'
import Link from '../components/link' import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox' import Infobox from '../components/infobox'
import Accordion from '../components/accordion' import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util' import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'
@ -31,10 +30,16 @@ const MODEL_META = {
wiki: 'Wikipedia', wiki: 'Wikipedia',
uas: 'Unlabelled dependencies', uas: 'Unlabelled dependencies',
las: 'Labelled dependencies', las: 'Labelled dependencies',
token_acc: 'Tokenization',
tok: 'Tokenization',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)', tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_f: 'Entities (F-score)', tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_p: 'Entities (precision)', ents_f: 'Named entities (F-score)',
ents_r: 'Entities (recall)', ents_p: 'Named entities (precision)',
ents_r: 'Named entities (recall)',
sent_f: 'Sentence segmentation (F-score)',
sent_p: 'Sentence segmentation (precision)',
sent_r: 'Sentence segmentation (recall)',
cpu: 'words per second on CPU', cpu: 'words per second on CPU',
gpu: 'words per second on GPU', gpu: 'words per second on GPU',
pipeline: 'Active processing pipeline components in order', pipeline: 'Active processing pipeline components in order',
@ -83,25 +88,19 @@ function formatVectors(data) {
} }
function formatAccuracy(data) { function formatAccuracy(data) {
if (!data) return null if (!data) return []
const labels = {
las: 'LAS',
uas: 'UAS',
tags_acc: 'TAG',
ents_f: 'NER F',
ents_p: 'NER P',
ents_r: 'NER R',
}
const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
const isNer = key => key.startsWith('ents_')
return Object.keys(data) return Object.keys(data)
.filter(key => labels[key]) .map(label => {
.map(key => ({ const value = data[label]
label: labels[key], return isNaN(value)
value: data[key].toFixed(2), ? null
help: MODEL_META[key], : {
type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null, label,
})) value: value.toFixed(2),
help: MODEL_META[label],
}
})
.filter(item => item)
} }
function formatModelMeta(data) { function formatModelMeta(data) {
@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
{ label: 'Author', content: author }, { label: 'Author', content: author },
{ label: 'License', content: license }, { label: 'License', content: license },
] ]
const accuracy = [
{
label: 'Syntax Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
},
{
label: 'NER Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
},
]
const error = ( const error = (
<Infobox title="Unable to load model details from GitHub" variant="danger"> <Infobox title="Unable to load model details from GitHub" variant="danger">
@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
</p> </p>
</Infobox> </Infobox>
) )
return ( return (
<Section id={name}> <Section id={name}>
<H2 <H2
@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)} )}
</tbody> </tbody>
</Table> </Table>
<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
<Table fixed key={i}>
<thead>
<Tr>
<Th colSpan={2}>{label}</Th>
</Tr>
</thead>
<tbody>
{items.map((item, i) => (
<Tr key={i}>
<Td>
<Label>
{item.label}{' '}
{item.help && <Help>{item.help}</Help>}
</Label>
</Td>
<Td num>{item.value}</Td>
</Tr>
))}
</tbody>
</Table>
)
)}
</Grid>
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)} {meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
{hasInteractiveCode && ( {hasInteractiveCode && (
<CodeBlock title="Try out the model" lang="python" executable={true}> <CodeBlock title="Try out the model" lang="python" executable={true}>
@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
`import spacy`, `import spacy`,
`from spacy.lang.${langId}.examples import sentences `, `from spacy.lang.${langId}.examples import sentences `,
``, ``,
`nlp = spacy.load('${name}')`, `nlp = spacy.load("${name}")`,
`doc = nlp(sentences[0])`, `doc = nlp(sentences[0])`,
`print(doc.text)`, `print(doc.text)`,
`for token in doc:`, `for token in doc:`,
@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')} ].join('\n')}
</CodeBlock> </CodeBlock>
)} )}
{meta.accuracy && (
<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
<Table>
<tbody>
{meta.accuracy.map(({ label, value, help }) => (
<Tr key={`${name}-${label}`}>
<Td nowrap>
<InlineCode>{label.toUpperCase()}</InlineCode>
</Td>
<Td>{help}</Td>
<Td num style={{ textAlign: 'right' }}>
{value}
</Td>
</Tr>
))}
</tbody>
</Table>
</Accordion>
)}
{labels && ( {labels && (
<Accordion id={`${name}-labels`} title="Label Scheme"> <Accordion id={`${name}-labels`} title="Label Scheme">
<p> <p>
@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const labelNames = labels[pipe] || [] const labelNames = labels[pipe] || []
const help = LABEL_SCHEME_META[pipe] const help = LABEL_SCHEME_META[pipe]
return ( return (
<Tr key={pipe} evenodd={false} key={pipe}> <Tr key={`${name}-${pipe}`} evenodd={false}>
<Td style={{ width: '20%' }}> <Td style={{ width: '20%' }}>
<Label> <Label>
{pipe} {help && <Help>{help}</Help>} {pipe} {help && <Help>{help}</Help>}
@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => { const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false) const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({}) const [compatibility, setCompatibility] = useState({})
const { id, title, meta } = pageContext const { id, title, meta, hasExamples } = pageContext
const { models, isStarters } = meta const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master` const baseUrl = `https://raw.githubusercontent.com/${repo}/master`
@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {
const modelTitle = title const modelTitle = title
const modelTeaser = `Available trained pipelines for ${title}` const modelTeaser = `Available trained pipelines for ${title}`
const starterTitle = `${title} starters` const starterTitle = `${title} starters`
const starterTeaser = `Available transfer learning starter packs for ${title}` const starterTeaser = `Available transfer learning starter packs for ${title}`
@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
baseUrl={baseUrl} baseUrl={baseUrl}
repo={repo} repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')} licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples}
/> />
)) ))
} }

View File

@ -297,7 +297,7 @@ const Landing = ({ data }) => {
to run. to run.
</p> </p>
<p> <p>
<Button to="/usage/facts-figures#benchmarks">See details</Button> <Button to="/usage/facts-figures#benchmarks">More results</Button>
</p> </p>
</LandingCol> </LandingCol>

View File

@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => (
<Td> <Td>
{models && models.length ? ( {models && models.length ? (
<Link to={`/models/${code}`}> <Link to={`/models/${code}`}>
{models.length} {models.length === 1 ? 'model' : 'models'} {models.length} {models.length === 1 ? 'package' : 'packages'}
</Link> </Link>
) : ( ) : (
<em>none yet</em> <em>none yet</em>
@ -51,7 +51,7 @@ const Languages = () => (
<Th>Language</Th> <Th>Language</Th>
<Th>Code</Th> <Th>Code</Th>
<Th>Language Data</Th> <Th>Language Data</Th>
<Th>Models</Th> <Th>Pipelines</Th>
</Tr> </Tr>
</thead> </thead>
<tbody> <tbody>

View File

@ -16,7 +16,8 @@ export default function Project({
}) { }) {
const repoArg = repo ? ` --repo ${repo}` : '' const repoArg = repo ? ` --repo ${repo}` : ''
const text = `${COMMAND} ${id}${repoArg}` const text = `${COMMAND} ${id}${repoArg}`
const url = `${repo || projectsRepo}/${id}` const defaultRepo = `https://github.com/${projectsRepo}`
const url = `${repo || defaultRepo}/${id}`
const header = ( const header = (
<> <>
{title}:{' '} {title}:{' '}