mirror of https://github.com/explosion/spaCy.git

Merge branch 'develop' into nightly.spacy.io
Commit: 921d188bce

README.md (28 changed lines)
@@ -8,12 +8,12 @@ be used in real products.
spaCy comes with
[pretrained pipelines](https://spacy.io/models) and vectors, and
currently supports tokenization for **59+ languages**. It features
currently supports tokenization for **60+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing, **named entity recognition**, **text classification** and more, multi-task learning with pretrained **transformers** like BERT, as well as a production-ready training system and easy model packaging, deployment and workflow management.
spaCy is commercial open-source software, released under the MIT license.

💫 **Version 2.3 out now!**
💫 **Version 3.0 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)

[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

@@ -29,16 +29,17 @@ spaCy is commercial open-source software, released under the MIT license.
## 📖 Documentation

| Documentation   |                                                                |
| --------------- | -------------------------------------------------------------- |
| [spaCy 101]     | New to spaCy? Here's everything you need to know!              |
| [Usage Guides]  | How to use spaCy and its features.                             |
| [New in v3.0]   | New features, backwards incompatibilities and migration guide. |
| [API Reference] | The detailed reference for spaCy's API.                        |
| [Models]        | Download statistical language models for spaCy.                |
| [Universe]      | Libraries, extensions, demos, books and courses.               |
| [Changelog]     | Changes and version history.                                   |
| [Contribute]    | How to contribute to the spaCy project and code base.          |
| Documentation       |                                                                |
| ------------------- | -------------------------------------------------------------- |
| [spaCy 101]         | New to spaCy? Here's everything you need to know!              |
| [Usage Guides]      | How to use spaCy and its features.                             |
| [New in v3.0]       | New features, backwards incompatibilities and migration guide. |
| [Project Templates] | End-to-end workflows you can clone, modify and run.            |
| [API Reference]     | The detailed reference for spaCy's API.                        |
| [Models]            | Download statistical language models for spaCy.                |
| [Universe]          | Libraries, extensions, demos, books and courses.               |
| [Changelog]         | Changes and version history.                                   |
| [Contribute]        | How to contribute to the spaCy project and code base.          |

[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3

@@ -46,6 +47,7 @@ spaCy is commercial open-source software, released under the MIT license.
[api reference]: https://spacy.io/api/
[models]: https://spacy.io/models
[universe]: https://spacy.io/universe
[project templates]: https://github.com/explosion/projects
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md

@@ -69,7 +71,7 @@ it.
## Features

- Support for **59+ languages**
- Support for **60+ languages**
- **Trained pipelines**
- Multi-task learning with pretrained **transformers** like BERT
- Pretrained **word vectors**

@@ -20,6 +20,7 @@ pytokenizations
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
typing_extensions>=3.7.4; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5

@@ -57,6 +57,7 @@ install_requires =
    setuptools
    packaging
    importlib_metadata>=0.20; python_version < "3.8"
    typing_extensions>=3.7.4; python_version < "3.8"

[options.entry_points]
console_scripts =

@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy-nightly"
__version__ = "3.0.0a20"
__version__ = "3.0.0a23"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

@@ -378,7 +378,7 @@ def git_sparse_checkout(repo, subpath, dest, branch):
        # Looking for this 'rev-list' command in the git --help? Hah.
        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
        ret = run_command(cmd, capture=True)
        git_repo = _from_http_to_git(repo)
        git_repo = _http_to_git(repo)
        # Now pass those missings into another bit of git internals
        missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
        if not missings:

@@ -414,7 +414,7 @@ def get_git_version(
    return (int(version[0]), int(version[1]))


def _from_http_to_git(repo: str) -> str:
def _http_to_git(repo: str) -> str:
    if repo.startswith("http://"):
        repo = repo.replace(r"http://", r"https://")
    if repo.startswith(r"https://"):

@@ -9,7 +9,7 @@ import sys
from ._util import app, Arg, Opt
from ..training import docs_to_json
from ..tokens import DocBin
from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs, conllu_to_docs


# Converters are matched by file extension except for ner/iob, which are

@@ -18,12 +18,12 @@ from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2do
# imported from /converters.

CONVERTERS = {
    "conllubio": conllu2docs,
    "conllu": conllu2docs,
    "conll": conllu2docs,
    "ner": conll_ner2docs,
    "iob": iob2docs,
    "json": json2docs,
    "conllubio": conllu_to_docs,
    "conllu": conllu_to_docs,
    "conll": conllu_to_docs,
    "ner": conll_ner_to_docs,
    "iob": iob_to_docs,
    "json": json_to_docs,
}

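Note: the converters were renamed from the `*2docs` pattern to `*_to_docs`. A minimal sketch of calling one of the renamed converters directly, using an invented two-token CoNLL-U fragment (assumes the spacy-nightly build from this commit):

    from spacy.training.converters import conllu_to_docs

    # Tiny CoNLL-U fragment, invented for illustration only
    conllu_data = (
        "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\tO\n"
        "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\tO\n"
    )
    # n_sents groups sentences per output Doc, as in the CLI tests below
    docs = list(conllu_to_docs(conllu_data, n_sents=1))
    print(len(docs), [token.text for token in docs[0]])
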
@@ -2,7 +2,7 @@ from typing import Optional, Dict, Any, Union, List
from pathlib import Path
from wasabi import msg, table
from thinc.api import Config
from thinc.config import VARIABLE_RE
from thinc.config import VARIABLE_RE, ConfigValidationError
import typer

from ._util import Arg, Opt, show_validation_error, parse_config_overrides

@@ -51,7 +51,10 @@ def debug_config(
    msg.divider("Config validation")
    with show_validation_error(config_path):
        config = util.load_config(config_path, overrides=overrides)
        nlp, _ = util.load_model_from_config(config)
        nlp, resolved = util.load_model_from_config(config)
        # Use the resolved config here in case user has one function returning
        # a dict of corpora etc.
        check_section_refs(resolved, ["training.dev_corpus", "training.train_corpus"])
    msg.good("Config is valid")
    if show_vars:
        variables = get_variables(config)

@@ -93,3 +96,23 @@ def get_variables(config: Config) -> Dict[str, Any]:
        value = util.dot_to_object(config, path)
        result[variable] = repr(value)
    return result


def check_section_refs(config: Config, fields: List[str]) -> None:
    """Validate fields in the config that refer to other sections or values
    (e.g. in the corpora) and make sure that those references exist.
    """
    errors = []
    for field in fields:
        # If the field doesn't exist in the config, we ignore it
        try:
            value = util.dot_to_object(config, field)
        except KeyError:
            continue
        try:
            util.dot_to_object(config, value)
        except KeyError:
            msg = f"not a valid section reference: {value}"
            errors.append({"loc": field.split("."), "msg": msg})
    if errors:
        raise ConfigValidationError(config, errors)

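Note: the new `check_section_refs` helper resolves dotted references inside a config and raises `ConfigValidationError` when a reference points nowhere. A minimal sketch that mirrors the test added near the end of this diff:

    from thinc.config import Config, ConfigValidationError
    from spacy.cli.debug_config import check_section_refs

    config = Config({"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}})
    check_section_refs(config, ["a.b.c"])   # "a.d.e" resolves, so this passes
    check_section_refs(config, ["x.y.z"])   # fields missing from the config are ignored
    try:
        check_section_refs(config, ["f.g"])  # "d.f" doesn't resolve to a section
    except ConfigValidationError as err:
        print("invalid reference:", err)
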
@@ -128,7 +128,7 @@ def debug_model(
    goldY = None
    for e in range(3):
        if tok2vec:
            tok2vec.predict(X)
            tok2vec.update([Example.from_dict(x, {}) for x in X])
        Y, get_dX = model.begin_update(X)
        if goldY is None:
            goldY = _simulate_gold(Y)

@@ -36,7 +36,7 @@ def init_config_cli(
    """
    Generate a starter config.cfg for training. Based on your requirements
    specified via the CLI arguments, this command generates a config with the
    optimal settings for you use case. This includes the choice of architecture,
    optimal settings for your use case. This includes the choice of architecture,
    pretrained weights and related hyperparameters.

    DOCS: https://nightly.spacy.io/api/cli#init-config

@@ -27,19 +27,32 @@ def project_pull_cli(


def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
    # TODO: We don't have tests for this :(. It would take a bit of mockery to
    # set up. I guess see if it breaks first?
    config = load_project_config(project_dir)
    if remote in config.get("remotes", {}):
        remote = config["remotes"][remote]
    storage = RemoteStorage(project_dir, remote)
    for cmd in config.get("commands", []):
        deps = [project_dir / dep for dep in cmd.get("deps", [])]
        if any(not dep.exists() for dep in deps):
            continue
        cmd_hash = get_command_hash("", "", deps, cmd["script"])
        for output_path in cmd.get("outputs", []):
            url = storage.pull(output_path, command_hash=cmd_hash)
            yield url, output_path
    commands = list(config.get("commands", []))
    # We use a while loop here because we don't know how the commands
    # will be ordered. A command might need dependencies from one that's later
    # in the list.
    while commands:
        for i, cmd in enumerate(list(commands)):
            deps = [project_dir / dep for dep in cmd.get("deps", [])]
            if all(dep.exists() for dep in deps):
                cmd_hash = get_command_hash("", "", deps, cmd["script"])
                for output_path in cmd.get("outputs", []):
                    url = storage.pull(output_path, command_hash=cmd_hash)
                    yield url, output_path

                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                out_locs = [project_dir / out for out in cmd.get("outputs", [])]
                if all(loc.exists() for loc in out_locs):
                    update_lockfile(project_dir, cmd)
                # We remove the command from the list here, and break, so that
                # we iterate over the loop again.
                commands.remove(i)
                break
        else:
            # If we didn't break the for loop, break the while loop.
            break

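Note: the comment in the new code explains the change: outputs are pulled in repeated passes because a command may depend on outputs produced by a command listed after it. A generic sketch of that fixed-point loop, detached from the spaCy project internals (`dep_ready` and `pull_outputs` are illustrative callables, not part of the API):

    def pull_in_dependency_order(commands, dep_ready, pull_outputs):
        """Keep looping until no command can run; each pass runs every command
        whose dependencies already exist, then restarts the scan."""
        pending = list(commands)
        while pending:
            for cmd in list(pending):
                if all(dep_ready(dep) for dep in cmd.get("deps", [])):
                    pull_outputs(cmd)
                    pending.remove(cmd)
                    break  # restart the scan: these outputs may unblock other commands
            else:
                break  # a full pass made no progress, so stop
        return pending  # whatever is left could not be pulled
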
@ -59,7 +59,8 @@ factory = "parser"
|
|||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
@ -79,7 +80,8 @@ factory = "ner"
|
|||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = false
|
||||
|
@ -93,6 +95,49 @@ grad_factor = 1.0
|
|||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "entity_linker" in components -%}
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
|
||||
incl_context = true
|
||||
incl_prior = true
|
||||
|
||||
[components.entity_linker.model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
nO = null
|
||||
|
||||
[components.entity_linker.model.tok2vec]
|
||||
@architectures = "spacy-transformers.TransformerListener.v1"
|
||||
grad_factor = 1.0
|
||||
|
||||
[components.entity_linker.model.tok2vec.pooling]
|
||||
@layers = "reduce_mean.v1"
|
||||
{% endif -%}
|
||||
|
||||
{% if "textcat" in components %}
|
||||
[components.textcat]
|
||||
factory = "textcat"
|
||||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
ngram_size = 1
|
||||
nO = null
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
|
||||
{# NON-TRANSFORMER PIPELINE #}
|
||||
{% else -%}
|
||||
|
||||
|
@ -140,7 +185,8 @@ factory = "parser"
|
|||
|
||||
[components.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
state_type = "parser"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = true
|
||||
|
@ -157,7 +203,8 @@ factory = "ner"
|
|||
|
||||
[components.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 6
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 64
|
||||
maxout_pieces = 2
|
||||
use_upper = true
|
||||
|
@ -167,10 +214,50 @@ nO = null
|
|||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "entity_linker" in components -%}
|
||||
[components.entity_linker]
|
||||
factory = "entity_linker"
|
||||
get_candidates = {"@misc":"spacy.CandidateGenerator.v1"}
|
||||
incl_context = true
|
||||
incl_prior = true
|
||||
|
||||
[components.entity_linker.model]
|
||||
@architectures = "spacy.EntityLinker.v1"
|
||||
nO = null
|
||||
|
||||
[components.entity_linker.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
|
||||
{% if "textcat" in components %}
|
||||
[components.textcat]
|
||||
factory = "textcat"
|
||||
|
||||
{% if optimize == "accuracy" %}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatEnsemble.v1"
|
||||
exclusive_classes = false
|
||||
width = 64
|
||||
conv_depth = 2
|
||||
embed_size = 2000
|
||||
window_size = 1
|
||||
ngram_size = 1
|
||||
nO = null
|
||||
|
||||
{% else -%}
|
||||
[components.textcat.model]
|
||||
@architectures = "spacy.TextCatBOW.v1"
|
||||
exclusive_classes = false
|
||||
ngram_size = 1
|
||||
no_output_layer = false
|
||||
{%- endif %}
|
||||
{%- endif %}
|
||||
{% endif %}
|
||||
|
||||
{% for pipe in components %}
|
||||
{% if pipe not in ["tagger", "parser", "ner"] %}
|
||||
{% if pipe not in ["tagger", "parser", "ner", "textcat", "entity_linker"] %}
|
||||
{# Other components defined by the user: we just assume they're factories #}
|
||||
[components.{{ pipe }}]
|
||||
factory = "{{ pipe }}"
|
||||
|
@ -197,7 +284,7 @@ vectors = "{{ word_vectors }}"
|
|||
{% endif -%}
|
||||
{% if use_transformer -%}
|
||||
accumulate_gradient = {{ transformer["size_factor"] }}
|
||||
{% endif %}
|
||||
{% endif -%}
|
||||
dev_corpus = "corpora.dev"
|
||||
train_corpus = "corpora.train"
|
||||
|
||||
|
@ -230,18 +317,3 @@ start = 100
|
|||
stop = 1000
|
||||
compound = 1.001
|
||||
{% endif %}
|
||||
|
||||
[training.score_weights]
|
||||
{%- if "tagger" in components %}
|
||||
tag_acc = {{ (1.0 / components|length)|round(2) }}
|
||||
{%- endif -%}
|
||||
{%- if "parser" in components %}
|
||||
dep_uas = 0.0
|
||||
dep_las = {{ (1.0 / components|length)|round(2) }}
|
||||
sents_f = 0.0
|
||||
{%- endif %}
|
||||
{%- if "ner" in components %}
|
||||
ents_f = {{ (1.0 / components|length)|round(2) }}
|
||||
ents_p = 0.0
|
||||
ents_r = 0.0
|
||||
{%- endif -%}
|
||||
|
|
|
@@ -152,7 +152,8 @@ def train(
        exclude=frozen_components,
    )
    msg.info(f"Training. Initial learn rate: {optimizer.learn_rate}")
    print_row, finalize_logger = train_logger(nlp)
    with nlp.select_pipes(disable=frozen_components):
        print_row, finalize_logger = train_logger(nlp)

    try:
        progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)

@@ -163,7 +164,8 @@ def train(
    progress.close()
    print_row(info)
    if is_best_checkpoint and output_path is not None:
        update_meta(T_cfg, nlp, info)
        with nlp.select_pipes(disable=frozen_components):
            update_meta(T_cfg, nlp, info)
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_path / "model-best")
    progress = tqdm.tqdm(total=T_cfg["eval_frequency"], leave=False)

@@ -207,10 +209,17 @@ def create_train_batches(iterator, batcher, max_epochs: int):
def create_evaluation_callback(
    nlp: Language, dev_corpus: Callable, weights: Dict[str, float]
) -> Callable[[], Tuple[float, Dict[str, float]]]:
    weights = {key: value for key, value in weights.items() if value is not None}

    def evaluate() -> Tuple[float, Dict[str, float]]:
        dev_examples = list(dev_corpus(nlp))
        scores = nlp.evaluate(dev_examples)
        # Calculate a weighted sum based on score_weights for the main score
        # Calculate a weighted sum based on score_weights for the main score.
        # We can only consider scores that are ints/floats, not dicts like
        # entity scores per type etc.
        for key, value in scores.items():
            if key in weights and not isinstance(value, (int, float)):
                raise ValueError(Errors.E915.format(name=key, score_type=type(value)))
        try:
            weighted_score = sum(
                scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights

@@ -366,7 +375,8 @@ def update_meta(
) -> None:
    nlp.meta["performance"] = {}
    for metric in training["score_weights"]:
        nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
        if metric is not None:
            nlp.meta["performance"][metric] = info["other_scores"].get(metric, 0.0)
    for pipe_name in nlp.pipe_names:
        nlp.meta["performance"][f"{pipe_name}_loss"] = info["losses"][pipe_name]

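Note: the evaluation callback now drops score weights set to null and rejects non-numeric scores (the new E915 error). A small standalone sketch of that weighting logic, with illustrative names:

    def weighted_final_score(scores, weights):
        # Ignore weights explicitly set to None (null in the config)
        weights = {key: value for key, value in weights.items() if value is not None}
        for key in weights:
            value = scores.get(key)
            if value is not None and not isinstance(value, (int, float)):
                raise ValueError(f"score {key!r} is not numeric: {type(value)}")
        return sum(scores.get(s, 0.0) * weights.get(s, 0.0) for s in weights)

    print(weighted_final_score(
        {"tag_acc": 0.9, "dep_las": 0.8},
        {"tag_acc": 0.5, "dep_las": 0.5, "sents_f": None},
    ))
    # 0.85
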
@@ -22,6 +22,11 @@ try:
except ImportError:
    cupy = None

try:  # Python 3.8+
    from typing import Literal
except ImportError:
    from typing_extensions import Literal  # noqa: F401

from thinc.api import Optimizer  # noqa: F401

pickle = pickle

@@ -69,7 +69,7 @@ class Warnings:
            "in problems with the vocab further on in the pipeline.")
    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
            "entities \"{entities}\". Use "
            "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
            "`spacy.training.offsets_to_biluo_tags(nlp.make_doc(text), entities)`"
            " to check the alignment. Misaligned entities ('-') will be "
            "ignored during training.")
    W033 = ("Training a new {model} using a model with no lexeme normalization "

@@ -480,6 +480,13 @@ class Errors:
    E201 = ("Span index out of range.")

    # TODO: fix numbering after merging develop into master
    E915 = ("Can't use score '{name}' to calculate final weighted score. Expected "
            "float or int but got: {score_type}. To exclude the score from the "
            "final score, set its weight to null in the [training.score_weights] "
            "section of your training config.")
    E916 = ("Can't log score for '{name}' in table: not a valid score ({score_type})")
    E917 = ("Received invalid value {value} for 'state_type' in "
            "TransitionBasedParser: only 'parser' or 'ner' are valid options.")
    E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
            "values are an instance of spacy.vocab.Vocab or True to create one"
            " (default).")

@@ -140,7 +140,6 @@ cdef class KnowledgeBase:
        self._entries.push_back(entry)
        self._aliases_table.push_back(alias)

    cpdef from_disk(self, loc)
    cpdef set_entities(self, entity_list, freq_list, vector_list)

spacy/kb.pyx (47 changed lines)

@@ -9,7 +9,8 @@ from libcpp.vector cimport vector

from pathlib import Path
import warnings
from os import path

from spacy import util

from .typedefs cimport hash_t
from .errors import Errors, Warnings

@@ -319,8 +320,14 @@ cdef class KnowledgeBase:
        return 0.0


    def to_disk(self, loc):
        cdef Writer writer = Writer(loc)
    def to_disk(self, path):
        path = util.ensure_path(path)
        if path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
        if not path.parent.exists():
            path.parent.mkdir(parents=True)

        cdef Writer writer = Writer(path)
        writer.write_header(self.get_size_entities(), self.entity_vector_length)

        # dumping the entity vectors in their original order

@@ -359,7 +366,13 @@ cdef class KnowledgeBase:

        writer.close()

    cpdef from_disk(self, loc):
    def from_disk(self, path):
        path = util.ensure_path(path)
        if path.is_dir():
            raise ValueError(Errors.E928.format(loc=path))
        if not path.exists():
            raise ValueError(Errors.E929.format(loc=path))

        cdef hash_t entity_hash
        cdef hash_t alias_hash
        cdef int64_t entry_index

@@ -369,7 +382,7 @@ cdef class KnowledgeBase:
        cdef AliasC alias
        cdef float vector_element

        cdef Reader reader = Reader(loc)
        cdef Reader reader = Reader(path)

        # STEP 0: load header and initialize KB
        cdef int64_t nr_entities

@@ -450,16 +463,13 @@ cdef class KnowledgeBase:


cdef class Writer:
    def __init__(self, object loc):
        if isinstance(loc, Path):
            loc = bytes(loc)
        if path.exists(loc):
            if path.isdir(loc):
                raise ValueError(Errors.E928.format(loc=loc))
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
    def __init__(self, path):
        assert isinstance(path, Path)
        content = bytes(path)
        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
        self._fp = fopen(<char*>bytes_loc, 'wb')
        if not self._fp:
            raise IOError(Errors.E146.format(path=loc))
            raise IOError(Errors.E146.format(path=path))
        fseek(self._fp, 0, 0)

    def close(self):

@@ -496,14 +506,9 @@ cdef class Writer:


cdef class Reader:
    def __init__(self, object loc):
        if isinstance(loc, Path):
            loc = bytes(loc)
        if not path.exists(loc):
            raise ValueError(Errors.E929.format(loc=loc))
        if path.isdir(loc):
            raise ValueError(Errors.E928.format(loc=loc))
        cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc
    def __init__(self, path):
        content = bytes(path)
        cdef bytes bytes_loc = content.encode('utf8') if type(content) == unicode else content
        self._fp = fopen(<char*>bytes_loc, 'rb')
        if not self._fp:
            PyErr_SetFromErrno(IOError)

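Note: with this change `KnowledgeBase.to_disk` and `from_disk` take a file path; passing a directory raises E928 and reading a missing file raises E929. A minimal round-trip sketch based on the `test_kb_serialize` test added further down (assumes the spacy-nightly build):

    import tempfile
    from pathlib import Path

    from spacy.kb import KnowledgeBase
    from spacy.lang.en import English

    nlp = English()
    kb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    kb.add_entity(entity="Q1", freq=10, entity_vector=[1.0])

    with tempfile.TemporaryDirectory() as tmp:
        target = Path(tmp) / "kb.file"   # a file path, not a directory
        kb.to_disk(target)
        kb2 = KnowledgeBase(nlp.vocab, entity_vector_length=1)
        kb2.from_disk(target)
        print(kb2.get_size_entities())   # 1
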
@@ -25,7 +25,6 @@ class Bengali(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -30,7 +30,6 @@ class Greek(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -29,7 +29,6 @@ class English(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -28,7 +28,6 @@ class Persian(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -33,7 +33,6 @@ class French(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -28,7 +28,6 @@ class Norwegian(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -30,7 +30,6 @@ class Dutch(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -35,7 +35,6 @@ class Polish(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pos_lookup", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -25,7 +25,6 @@ class Russian(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -31,7 +31,6 @@ class Swedish(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "rule", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -25,7 +25,6 @@ class Ukrainian(Language):
    "lemmatizer",
    assigns=["token.lemma"],
    default_config={"model": None, "mode": "pymorphy2", "lookups": None},
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@@ -248,9 +248,12 @@ class Language:
        self._config["nlp"]["pipeline"] = list(self.component_names)
        self._config["nlp"]["disabled"] = list(self.disabled)
        self._config["components"] = pipeline
        if not self._config["training"].get("score_weights"):
            combined_score_weights = combine_score_weights(score_weights)
            self._config["training"]["score_weights"] = combined_score_weights
        # We're merging the existing score weights back into the combined
        # weights to make sure we're preserving custom settings in the config
        # but also reflect updates (e.g. new components added)
        prev_weights = self._config["training"].get("score_weights", {})
        combined_score_weights = combine_score_weights(score_weights, prev_weights)
        self._config["training"]["score_weights"] = combined_score_weights
        if not srsly.is_json_serializable(self._config):
            raise ValueError(Errors.E961.format(config=self._config))
        return self._config

@@ -412,7 +415,6 @@ class Language:
        assigns: Iterable[str] = SimpleFrozenList(),
        requires: Iterable[str] = SimpleFrozenList(),
        retokenizes: bool = False,
        scores: Iterable[str] = SimpleFrozenList(),
        default_score_weights: Dict[str, float] = SimpleFrozenDict(),
        func: Optional[Callable] = None,
    ) -> Callable:

@@ -430,12 +432,11 @@ class Language:
            e.g. "token.ent_id". Used for pipeline analyis.
        retokenizes (bool): Whether the component changes the tokenization.
            Used for pipeline analysis.
        scores (Iterable[str]): All scores set by the component if it's trainable,
            e.g. ["ents_f", "ents_r", "ents_p"].
        default_score_weights (Dict[str, float]): The scores to report during
            training, and their default weight towards the final score used to
            select the best model. Weights should sum to 1.0 per component and
            will be combined and normalized for the whole pipeline.
            will be combined and normalized for the whole pipeline. If None,
            the score won't be shown in the logs or be weighted.
        func (Optional[Callable]): Factory function if not used as a decorator.

        DOCS: https://nightly.spacy.io/api/language#factory

@@ -475,7 +476,7 @@ class Language:
            default_config=default_config,
            assigns=validate_attrs(assigns),
            requires=validate_attrs(requires),
            scores=scores,
            scores=list(default_score_weights.keys()),
            default_score_weights=default_score_weights,
            retokenizes=retokenizes,
        )

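Note: after this change the `scores` list is derived from the keys of `default_score_weights`, and a weight of `None` means the score is neither logged nor weighted. A hedged sketch of registering a custom component under the new convention (component and score names are invented for illustration):

    from spacy.language import Language

    @Language.factory(
        "my_component",
        default_score_weights={"my_acc": 1.0, "my_details_per_type": None},
    )
    def create_my_component(nlp: Language, name: str):
        def my_component(doc):
            return doc
        return my_component

    meta = Language.get_factory_meta("my_component")
    print(meta.scores)  # ["my_acc", "my_details_per_type"], derived from the weights
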
@@ -2,6 +2,8 @@ from typing import Optional, List
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from thinc.types import Floats2d

from ...errors import Errors
from ...compat import Literal
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel

@@ -11,7 +13,8 @@ from ...tokens import Doc
@registry.architectures.register("spacy.TransitionBasedParser.v1")
def build_tb_parser_model(
    tok2vec: Model[List[Doc], List[Floats2d]],
    nr_feature_tokens: int,
    state_type: Literal["parser", "ner"],
    extra_state_tokens: bool,
    hidden_width: int,
    maxout_pieces: int,
    use_upper: bool = True,

@@ -40,20 +43,12 @@ def build_tb_parser_model(

    tok2vec (Model[List[Doc], List[Floats2d]]):
        Subnetwork to map tokens into vector representations.
    nr_feature_tokens (int): The number of tokens in the context to use to
        construct the state vector. Valid choices are 1, 2, 3, 6, 8 and 13. The
        2, 8 and 13 feature sets are designed for the parser, while the 3 and 6
        feature sets are designed for the NER. The recommended feature sets are
        3 for NER, and 8 for the dependency parser.

        TODO: This feature should be split into two, state_type: ["deps", "ner"]
        and extra_state_features: [True, False]. This would map into:

        (deps, False): 8
        (deps, True): 13
        (ner, False): 3
        (ner, True): 6

    state_type (str):
        String value denoting the type of parser model: "parser" or "ner"
    extra_state_tokens (bool): Whether or not to use additional tokens in the context
        to construct the state vector. Defaults to `False`, which means 3 and 8
        for the NER and parser respectively. When set to `True`, this would become 6
        feature sets (for the NER) or 13 (for the parser).
    hidden_width (int): The width of the hidden layer.
    maxout_pieces (int): How many pieces to use in the state prediction layer.
        Recommended values are 1, 2 or 3. If 1, the maxout non-linearity

@@ -68,8 +63,14 @@ def build_tb_parser_model(
        Usually inferred from data at the beginning of training, or loaded from
        disk.
    """
    if state_type == "parser":
        nr_feature_tokens = 13 if extra_state_tokens else 8
    elif state_type == "ner":
        nr_feature_tokens = 6 if extra_state_tokens else 3
    else:
        raise ValueError(Errors.E917.format(value=state_type))
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width),)
    tok2vec = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width))
    tok2vec.set_dim("nO", hidden_width)
    lower = PrecomputableAffine(
        nO=hidden_width if use_upper else nO,

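Note: the removed docstring and the new branch describe the same mapping from the old `nr_feature_tokens` values to the new settings; restated as a lookup table:

    # (state_type, extra_state_tokens) -> number of feature tokens
    NR_FEATURE_TOKENS = {
        ("parser", False): 8,
        ("parser", True): 13,
        ("ner", False): 3,
        ("ner", True): 6,
    }
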
@@ -15,7 +15,8 @@ from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2

@@ -42,8 +43,14 @@ DEFAULT_PARSER_MODEL = Config().from_str(default_model_config)["model"]
        "min_action_freq": 30,
        "model": DEFAULT_PARSER_MODEL,
    },
    scores=["dep_uas", "dep_las", "dep_las_per_type", "sents_p", "sents_r", "sents_f"],
    default_score_weights={"dep_uas": 0.5, "dep_las": 0.5, "sents_f": 0.0},
    default_score_weights={
        "dep_uas": 0.5,
        "dep_las": 0.5,
        "dep_las_per_type": None,
        "sents_p": None,
        "sents_r": None,
        "sents_f": 0.0,
    },
)
def make_parser(
    nlp: Language,

@@ -25,8 +25,12 @@ PatternType = Dict[str, Union[str, List[Dict[str, Any]]]]
        "overwrite_ents": False,
        "ent_id_sep": DEFAULT_ENT_ID_SEP,
    },
    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
    default_score_weights={
        "ents_f": 1.0,
        "ents_p": 0.0,
        "ents_r": 0.0,
        "ents_per_type": None,
    },
)
def make_entity_ruler(
    nlp: Language,

@@ -21,7 +21,6 @@ from .. import util
        "lookups": None,
        "overwrite": False,
    },
    scores=["lemma_acc"],
    default_score_weights={"lemma_acc": 1.0},
)
def make_lemmatizer(

@ -49,8 +49,7 @@ DEFAULT_MORPH_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"morphologizer",
|
||||
assigns=["token.morph", "token.pos"],
|
||||
default_config={"model": DEFAULT_MORPH_MODEL},
|
||||
scores=["pos_acc", "morph_acc", "morph_per_feat"],
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5},
|
||||
default_score_weights={"pos_acc": 0.5, "morph_acc": 0.5, "morph_per_feat": None},
|
||||
)
|
||||
def make_morphologizer(
|
||||
nlp: Language,
|
||||
|
|
|
@@ -13,7 +13,8 @@ from ..training import validate_examples
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 6
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2

@@ -38,8 +39,7 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
        "update_with_oracle_cut_size": 100,
        "model": DEFAULT_NER_MODEL,
    },
    scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
    default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
)
def make_ner(

@@ -15,7 +15,6 @@ from .. import util
    "sentencizer",
    assigns=["token.is_sent_start", "doc.sents"],
    default_config={"punct_chars": None},
    scores=["sents_p", "sents_r", "sents_f"],
    default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
)
def make_sentencizer(

@ -36,7 +36,6 @@ DEFAULT_SENTER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"senter",
|
||||
assigns=["token.is_sent_start"],
|
||||
default_config={"model": DEFAULT_SENTER_MODEL},
|
||||
scores=["sents_p", "sents_r", "sents_f"],
|
||||
default_score_weights={"sents_f": 1.0, "sents_p": 0.0, "sents_r": 0.0},
|
||||
)
|
||||
def make_senter(nlp: Language, name: str, model: Model):
|
||||
|
|
|
@ -42,7 +42,6 @@ DEFAULT_TAGGER_MODEL = Config().from_str(default_model_config)["model"]
|
|||
"tagger",
|
||||
assigns=["token.tag"],
|
||||
default_config={"model": DEFAULT_TAGGER_MODEL},
|
||||
scores=["tag_acc"],
|
||||
default_score_weights={"tag_acc": 1.0},
|
||||
)
|
||||
def make_tagger(nlp: Language, name: str, model: Model):
|
||||
|
|
|
@@ -62,18 +62,17 @@ subword_features = true
        "positive_label": None,
        "model": DEFAULT_TEXTCAT_MODEL,
    },
    scores=[
        "cats_score",
        "cats_score_desc",
        "cats_p",
        "cats_r",
        "cats_f",
        "cats_macro_f",
        "cats_macro_auc",
        "cats_f_per_type",
        "cats_macro_auc_per_type",
    ],
    default_score_weights={"cats_score": 1.0},
    default_score_weights={
        "cats_score": 1.0,
        "cats_score_desc": None,
        "cats_p": None,
        "cats_r": None,
        "cats_f": None,
        "cats_macro_f": None,
        "cats_macro_auc": None,
        "cats_f_per_type": None,
        "cats_macro_auc_per_type": None,
    },
)
def make_textcat(
    nlp: Language,

@@ -127,7 +127,7 @@ class Tok2Vec(Pipe):
        tokvecs = self.model.predict(docs)
        batch_id = Tok2VecListener.get_batch_id(docs)
        for listener in self.listeners:
            listener.receive(batch_id, tokvecs, None)
            listener.receive(batch_id, tokvecs, lambda dX: [])
        return tokvecs

    def set_annotations(self, docs: Sequence[Doc], tokvecses) -> None:

@@ -211,7 +211,7 @@ class ConfigSchemaTraining(BaseModel):
    seed: Optional[StrictInt] = Field(..., title="Random seed")
    gpu_allocator: Optional[StrictStr] = Field(..., title="Memory allocator when running on GPU")
    accumulate_gradient: StrictInt = Field(..., title="Whether to divide the batch up into substeps")
    score_weights: Dict[StrictStr, Union[StrictFloat, StrictInt]] = Field(..., title="Scores to report and their weights for selecting final model")
    score_weights: Dict[StrictStr, Optional[Union[StrictFloat, StrictInt]]] = Field(..., title="Scores to report and their weights for selecting final model")
    init_tok2vec: Optional[StrictStr] = Field(..., title="Path to pretrained tok2vec weights")
    raw_text: Optional[StrictStr] = Field(default=None, title="Raw text")
    optimizer: Optimizer = Field(..., title="The optimizer to use")

@@ -240,7 +240,7 @@ class Scorer:
                pred_per_feat[field].add((gold_i, feat))
        for field in per_feat:
            per_feat[field].score_set(
                pred_per_feat.get(field, set()), gold_per_feat.get(field, set()),
                pred_per_feat.get(field, set()), gold_per_feat.get(field, set())
            )
        result = {k: v.to_dict() for k, v in per_feat.items()}
        return {f"{attr}_per_feat": result}

@@ -418,9 +418,9 @@ class Scorer:
                    f_per_type[pred_label].fp += 1
        micro_prf = PRFScore()
        for label_prf in f_per_type.values():
            micro_prf.tp = label_prf.tp
            micro_prf.fn = label_prf.fn
            micro_prf.fp = label_prf.fp
            micro_prf.tp += label_prf.tp
            micro_prf.fn += label_prf.fn
            micro_prf.fp += label_prf.fp
        n_cats = len(f_per_type) + 1e-100
        macro_p = sum(prf.precision for prf in f_per_type.values()) / n_cats
        macro_r = sum(prf.recall for prf in f_per_type.values()) / n_cats

@@ -144,6 +144,29 @@ def test_kb_empty(nlp):
        entity_linker.begin_training(lambda: [])


def test_kb_serialize(nlp):
    """Test serialization of the KB"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
    with make_tempdir() as d:
        # normal read-write behaviour
        mykb.to_disk(d / "kb")
        mykb.from_disk(d / "kb")
        mykb.to_disk(d / "kb.file")
        mykb.from_disk(d / "kb.file")
        mykb.to_disk(d / "new" / "kb")
        mykb.from_disk(d / "new" / "kb")
        # allow overwriting an existing file
        mykb.to_disk(d / "kb.file")
        with pytest.raises(ValueError):
            # can not write to a directory
            mykb.to_disk(d)
        with pytest.raises(ValueError):
            # can not read from a directory
            mykb.from_disk(d)
        with pytest.raises(ValueError):
            # can not read from an unknown file
            mykb.from_disk(d / "unknown" / "kb")


def test_candidate_generation(nlp):
    """Test correct candidate generation"""
    mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)

@@ -359,12 +359,8 @@ def test_language_factories_scores():
    func = lambda nlp, name: lambda doc: doc
    weights1 = {"a1": 0.5, "a2": 0.5}
    weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
    Language.factory(
        f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
    )
    Language.factory(
        f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
    )
    Language.factory(f"{name}1", default_score_weights=weights1, func=func)
    Language.factory(f"{name}2", default_score_weights=weights2, func=func)
    meta1 = Language.get_factory_meta(f"{name}1")
    assert meta1.default_score_weights == weights1
    meta2 = Language.get_factory_meta(f"{name}2")

@@ -376,6 +372,21 @@ def test_language_factories_scores():
    cfg = nlp.config["training"]
    expected_weights = {"a1": 0.25, "a2": 0.25, "b1": 0.1, "b2": 0.35, "b3": 0.05}
    assert cfg["score_weights"] == expected_weights
    # Test with custom defaults
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = 0.0
    config["training"]["score_weights"]["b3"] = 1.0
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
    assert score_weights == expected
    # Test with null values
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = None
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
    assert score_weights == expected


def test_pipe_factories_from_source():

@@ -8,6 +8,7 @@ from spacy.language import Language
from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer

from ..util import make_tempdir
from ...cli.train import verify_textcat_config

@@ -224,3 +225,31 @@ def test_positive_class_not_binary():
    assert textcat.labels == ("SOME", "THING", "POS")
    with pytest.raises(ValueError):
        verify_textcat_config(nlp, pipe_config)


def test_textcat_evaluation():
    train_examples = []
    nlp = English()
    ref1 = nlp("one")
    ref1.cats = {"winter": 1.0, "summer": 1.0, "spring": 1.0, "autumn": 1.0}
    pred1 = nlp("one")
    pred1.cats = {"winter": 1.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
    train_examples.append(Example(pred1, ref1))

    ref2 = nlp("two")
    ref2.cats = {"winter": 0.0, "summer": 0.0, "spring": 1.0, "autumn": 1.0}
    pred2 = nlp("two")
    pred2.cats = {"winter": 1.0, "summer": 0.0, "spring": 0.0, "autumn": 1.0}
    train_examples.append(Example(pred2, ref2))

    scores = Scorer().score_cats(train_examples, "cats", labels=["winter", "summer", "spring", "autumn"])
    assert scores["cats_f_per_type"]["winter"]["p"] == 1/2
    assert scores["cats_f_per_type"]["winter"]["r"] == 1/1
    assert scores["cats_f_per_type"]["summer"]["p"] == 0
    assert scores["cats_f_per_type"]["summer"]["r"] == 0/1
    assert scores["cats_f_per_type"]["spring"]["p"] == 1/1
    assert scores["cats_f_per_type"]["spring"]["r"] == 1/2
    assert scores["cats_f_per_type"]["autumn"]["p"] == 2/2
    assert scores["cats_f_per_type"]["autumn"]["r"] == 2/2

    assert scores["cats_micro_p"] == 4/5
    assert scores["cats_micro_r"] == 4/6

@@ -169,3 +169,22 @@ def test_tok2vec_listener():
    nlp.select_pipes(disable="tok2vec")
    assert nlp.pipe_names == ["tagger"]
    nlp("Running the pipeline with the Tok2Vec component disabled.")


def test_tok2vec_listener_callback():
    orig_config = Config().from_str(cfg_string)
    nlp, config = util.load_model_from_config(orig_config, auto_fill=True, validate=True)
    assert nlp.pipe_names == ["tok2vec", "tagger"]
    tagger = nlp.get_pipe("tagger")
    tok2vec = nlp.get_pipe("tok2vec")
    nlp._link_components()
    docs = [nlp.make_doc("A random sentence")]
    tok2vec.model.initialize(X=docs)
    gold_array = [[1.0 for tag in ["V", "Z"]] for word in docs]
    label_sample = [tagger.model.ops.asarray(gold_array, dtype="float32")]
    tagger.model.initialize(X=docs, Y=label_sample)
    docs = [nlp.make_doc("Another entirely random sentence")]
    tok2vec.update([Example.from_dict(x, {}) for x in docs])
    Y, get_dX = tagger.model.begin_update(docs)
    # assure that the backprop call works (and doesn't hit a 'None' callback)
    assert get_dX(Y) is not None

@@ -3,7 +3,7 @@ from spacy.pipeline import Pipe
from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example, Corpus
from spacy.training.converters import json2docs
from spacy.training.converters import json_to_docs
from spacy.vocab import Vocab
from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model

@@ -425,7 +425,7 @@ def test_issue4402():
    attrs = ["ORTH", "SENT_START", "ENT_IOB", "ENT_TYPE"]
    with make_tempdir() as tmpdir:
        output_file = tmpdir / "test4402.spacy"
        docs = json2docs([json_data])
        docs = json_to_docs([json_data])
        data = DocBin(docs=docs, attrs=attrs).to_bytes()
        with output_file.open("wb") as file_:
            file_.write(data)

@@ -1,7 +1,7 @@
import pytest
from spacy.tokens import Doc, Span, DocBin
from spacy.training import Example
from spacy.training.converters.conllu2docs import conllu2docs
from spacy.training.converters.conllu_to_docs import conllu_to_docs
from spacy.lang.en import English
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

@@ -82,7 +82,7 @@ def test_issue4651_without_phrase_matcher_attr():

def test_issue4665():
    """
    conllu2json should not raise an exception if the HEAD column contains an
    conllu_to_docs should not raise an exception if the HEAD column contains an
    underscore
    """
    input_data = """

@@ -105,7 +105,7 @@ def test_issue4665():
17 . _ PUNCT . _ _ punct _ _
18 ] _ PUNCT -RRB- _ _ punct _ _
"""
    conllu2docs(input_data)
    conllu_to_docs(input_data)


def test_issue4674():

@@ -67,7 +67,8 @@ width = ${components.tok2vec.model.width}
parser_config_string = """
[model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 99
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2

@@ -95,7 +96,11 @@ def my_parser():
        MaxoutWindowEncoder(width=321, window_size=3, maxout_pieces=4, depth=2),
    )
    parser = build_tb_parser_model(
        tok2vec=tok2vec, nr_feature_tokens=7, hidden_width=65, maxout_pieces=5
        tok2vec=tok2vec,
        state_type="parser",
        extra_state_tokens=True,
        hidden_width=65,
        maxout_pieces=5,
    )
    return parser

@@ -340,3 +345,13 @@ def test_config_auto_fill_extra_fields():
    assert "extra" not in nlp.config["training"]
    # Make sure the config generated is valid
    load_model_from_config(nlp.config)


def test_config_validate_literal():
    nlp = English()
    config = Config().from_str(parser_config_string)
    config["model"]["state_type"] = "nonsense"
    with pytest.raises(ConfigValidationError):
        nlp.add_pipe("parser", config=config)
    config["model"]["state_type"] = "ner"
    nlp.add_pipe("parser", config=config)

@@ -1,20 +1,21 @@
import pytest
from click import NoSuchOption
from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.training import docs_to_json, offsets_to_biluo_tags
from spacy.training.converters import iob_to_docs, conll_ner_to_docs, conllu_to_docs
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list, OVERRIDES_ENV_VAR
from thinc.config import ConfigValidationError
from spacy.cli.debug_config import check_section_refs
from thinc.config import ConfigValidationError, Config
import srsly
import os

from .util import make_tempdir


def test_cli_converters_conllu2json():
def test_cli_converters_conllu_to_docs():
    # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tO",

@@ -23,7 +24,7 @@ def test_cli_converters_conllu2json():
        "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted_docs = conllu2docs(input_data, n_sents=1)
    converted_docs = conllu_to_docs(input_data, n_sents=1)
    assert len(converted_docs) == 1
    converted = [docs_to_json(converted_docs)]
    assert converted[0]["id"] == 0

@@ -39,7 +40,7 @@ def test_cli_converters_conllu2json():
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PER", "L-PER", "O"]


@@ -62,9 +63,9 @@ def test_cli_converters_conllu2json():
        ),
    ],
)
def test_cli_converters_conllu2json_name_ner_map(lines):
def test_cli_converters_conllu_to_docs_name_ner_map(lines):
    input_data = "\n".join(lines)
    converted_docs = conllu2docs(
    converted_docs = conllu_to_docs(
        input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}
    )
    assert len(converted_docs) == 1

@@ -83,11 +84,11 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]


def test_cli_converters_conllu2json_subtokens():
def test_cli_converters_conllu_to_docs_subtokens():
    # https://raw.githubusercontent.com/ohenrik/nb_news_ud_sm/master/original_data/no-ud-dev-ner.conllu
    lines = [
        "1\tDommer\tdommer\tNOUN\t_\tDefinite=Ind|Gender=Masc|Number=Sing\t2\tappos\t_\tname=O",

@@ -98,7 +99,7 @@ def test_cli_converters_conllu2json_subtokens():
        "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
    ]
    input_data = "\n".join(lines)
    converted_docs = conllu2docs(
    converted_docs = conllu_to_docs(
        input_data, n_sents=1, merge_subtokens=True, append_morphology=True
    )
    assert len(converted_docs) == 1

@@ -132,11 +133,11 @@ def test_cli_converters_conllu2json_subtokens():
    ent_offsets = [
        (e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]
    ]
    biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
    biluo_tags = offsets_to_biluo_tags(converted_docs[0], ent_offsets, missing="O")
    assert biluo_tags == ["O", "U-PER", "O", "O"]


def test_cli_converters_iob2json():
def test_cli_converters_iob_to_docs():
    lines = [
        "I|O like|O London|I-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",
        "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O",

@@ -144,7 +145,7 @@ def test_cli_converters_iob2json():
        "I|PRP|O like|VBP|O London|NNP|B-GPE and|CC|O New|NNP|B-GPE York|NNP|I-GPE City|NNP|I-GPE .|.|O",
    ]
    input_data = "\n".join(lines)
    converted_docs = iob2docs(input_data, n_sents=10)
    converted_docs = iob_to_docs(input_data, n_sents=10)
    assert len(converted_docs) == 1
    converted = docs_to_json(converted_docs)
    assert converted["id"] == 0

@@ -161,7 +162,7 @@ def test_cli_converters_iob2json():
        assert ent.text in ["New York City", "London"]


def test_cli_converters_conll_ner2json():
def test_cli_converters_conll_ner_to_docs():
    lines = [
        "-DOCSTART- -X- O O",
        "",

@@ -211,7 +212,7 @@ def test_cli_converters_conll_ner2json():
        ".\t.\t_\tO",
    ]
    input_data = "\n".join(lines)
    converted_docs = conll_ner2docs(input_data, n_sents=10)
    converted_docs = conll_ner_to_docs(input_data, n_sents=10)
    assert len(converted_docs) == 1
    converted = docs_to_json(converted_docs)
    assert converted["id"] == 0

@@ -413,3 +414,15 @@ def test_string_to_list(value):
def test_string_to_list_intify(value):
    assert string_to_list(value, intify=False) == ["1", "2", "3"]
    assert string_to_list(value, intify=True) == [1, 2, 3]


def test_check_section_refs():
    config = {"a": {"b": {"c": "a.d.e"}, "d": {"e": 1}}, "f": {"g": "d.f"}}
    config = Config(config)
    # Valid section reference
    check_section_refs(config, ["a.b.c"])
    # Section that doesn't exist in this config
    check_section_refs(config, ["x.y.z"])
    # Invalid section reference
    with pytest.raises(ConfigValidationError):
        check_section_refs(config, ["a.b.c", "f.g"])

@@ -2,7 +2,7 @@ from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest
from pytest import approx
from spacy.training import Example
from spacy.training.iob_utils import biluo_tags_from_offsets
from spacy.training.iob_utils import offsets_to_biluo_tags
from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve
from spacy.lang.en import English

@@ -186,7 +186,7 @@ def test_ner_per_type(en_vocab):
            words=input_.split(" "),
            ents=[("CARDINAL", 0, 1), ("CARDINAL", 2, 3)],
        )
        entities = biluo_tags_from_offsets(doc, annot["entities"])
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
        # a hack for sentence boundaries
        example.predicted[1].is_sent_start = False

@@ -211,7 +211,7 @@ def test_ner_per_type(en_vocab):
            words=input_.split(" "),
            ents=[("ORG", 0, 1), ("GPE", 5, 6), ("ORG", 6, 7)],
        )
        entities = biluo_tags_from_offsets(doc, annot["entities"])
        entities = offsets_to_biluo_tags(doc, annot["entities"])
        example = Example.from_dict(doc, {"entities": entities})
        # a hack for sentence boundaries
        example.predicted[1].is_sent_start = False

@@ -1,9 +1,9 @@
import numpy
from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.training import spans_from_biluo_tags, iob_to_biluo
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, Alignment
from spacy.training import biluo_tags_to_spans, iob_to_biluo
from spacy.training import Corpus, docs_to_json
from spacy.training.example import Example
from spacy.training.converters import json2docs
from spacy.training.converters import json_to_docs
from spacy.training.augment import make_orth_variants_example
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin

@@ -69,7 +69,7 @@ def test_gold_biluo_U(en_vocab):
    spaces = [True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to London"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "U-LOC", "O"]


@@ -78,7 +78,7 @@ def test_gold_biluo_BL(en_vocab):
    spaces = [True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "L-LOC", "O"]


@@ -87,7 +87,7 @@ def test_gold_biluo_BIL(en_vocab):
    spaces = [True, True, True, True, True, False, True]
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    tags = biluo_tags_from_offsets(doc, entities)
    tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]


@@ -100,7 +100,7 @@ def test_gold_biluo_overlap(en_vocab):
        (len("I flew to "), len("I flew to San Francisco"), "LOC"),
    ]
    with pytest.raises(ValueError):
        biluo_tags_from_offsets(doc, entities)
        offsets_to_biluo_tags(doc, entities)


def test_gold_biluo_misalign(en_vocab):

@@ -109,7 +109,7 @@ def test_gold_biluo_misalign(en_vocab):
    doc = Doc(en_vocab, words=words, spaces=spaces)
    entities = [(len("I flew to "), len("I flew to San Francisco Valley"), "LOC")]
    with pytest.warns(UserWarning):
        tags = biluo_tags_from_offsets(doc, entities)
        tags = offsets_to_biluo_tags(doc, entities)
    assert tags == ["O", "O", "O", "-", "-", "-"]


@@ -155,7 +155,7 @@ def test_example_from_dict_some_ner(en_vocab):


@pytest.mark.filterwarnings("ignore::UserWarning")
def test_json2docs_no_ner(en_vocab):
def test_json_to_docs_no_ner(en_vocab):
    data = [
        {
            "id": 1,

@@ -191,7 +191,7 @@ def test_json2docs_no_ner(en_vocab):
            ],
        }
    ]
    docs = json2docs(data)
    docs = json_to_docs(data)
    assert len(docs) == 1
    for doc in docs:
        assert not doc.has_annotation("ENT_IOB")

@@ -358,9 +358,9 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
    biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
    offsets = [(10, 24, "LOC"), (29, 35, "GPE")]
    doc = en_tokenizer(text)
    biluo_tags_converted = biluo_tags_from_offsets(doc, offsets)
    biluo_tags_converted = offsets_to_biluo_tags(doc, offsets)
    assert biluo_tags_converted == biluo_tags
|
||||
offsets_converted = offsets_from_biluo_tags(doc, biluo_tags)
|
||||
offsets_converted = biluo_tags_to_offsets(doc, biluo_tags)
|
||||
offsets_converted = [ent for ent in offsets if ent[2]]
|
||||
assert offsets_converted == offsets
|
||||
|
||||
|
@ -368,7 +368,7 @@ def test_roundtrip_offsets_biluo_conversion(en_tokenizer):
|
|||
def test_biluo_spans(en_tokenizer):
|
||||
doc = en_tokenizer("I flew to Silicon Valley via London.")
|
||||
biluo_tags = ["O", "O", "O", "B-LOC", "L-LOC", "O", "U-GPE", "O"]
|
||||
spans = spans_from_biluo_tags(doc, biluo_tags)
|
||||
spans = biluo_tags_to_spans(doc, biluo_tags)
|
||||
spans = [span for span in spans if span.label_]
|
||||
assert len(spans) == 2
|
||||
assert spans[0].text == "Silicon Valley"
|
||||
|
|
|
@@ -2,8 +2,8 @@ from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
-from .iob_utils import biluo_tags_from_offsets, offsets_from_biluo_tags  # noqa: F401
-from .iob_utils import spans_from_biluo_tags, tags_to_entities  # noqa: F401
+from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
+from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401
 from .gold_io import docs_to_json, read_json_file  # noqa: F401
 from .batchers import minibatch_by_padded_size, minibatch_by_words  # noqa: F401
 from .loggers import console_logger, wandb_logger  # noqa: F401
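For orientation, here is a minimal sketch of the renamed helpers exported above in use. It assumes a blank English pipeline purely for illustration; the expected tags follow the `offsets_to_biluo_tags` example that appears later in this commit's documentation changes.

```python
import spacy
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets

nlp = spacy.blank("en")
doc = nlp("I like London.")
# Character offsets -> per-token BILUO tags, and back again.
tags = offsets_to_biluo_tags(doc, [(7, 13, "LOC")])
assert tags == ["O", "O", "U-LOC", "O"]
assert biluo_tags_to_offsets(doc, tags) == [(7, 13, "LOC")]
```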
@@ -1,4 +1,4 @@
-from .iob2docs import iob2docs  # noqa: F401
-from .conll_ner2docs import conll_ner2docs  # noqa: F401
-from .json2docs import json2docs  # noqa: F401
-from .conllu2docs import conllu2docs  # noqa: F401
+from .iob_to_docs import iob_to_docs  # noqa: F401
+from .conll_ner_to_docs import conll_ner_to_docs  # noqa: F401
+from .json_to_docs import json_to_docs  # noqa: F401
+from .conllu_to_docs import conllu_to_docs  # noqa: F401
@@ -7,7 +7,7 @@ from ...tokens import Doc, Span
 from ...util import load_model
 
 
-def conll_ner2docs(
+def conll_ner_to_docs(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
@@ -1,13 +1,13 @@
 import re
 
-from .conll_ner2docs import n_sents_info
-from ...training import iob_to_biluo, spans_from_biluo_tags
+from .conll_ner_to_docs import n_sents_info
+from ...training import iob_to_biluo, biluo_tags_to_spans
 from ...tokens import Doc, Token, Span
 from ...vocab import Vocab
 from wasabi import Printer
 
 
-def conllu2docs(
+def conllu_to_docs(
     input_data,
     n_sents=10,
     append_morphology=False,
@@ -78,7 +78,7 @@ def read_conllx(
         if lines:
             while lines[0].startswith("#"):
                 lines.pop(0)
-            doc = doc_from_conllu_sentence(
+            doc = conllu_sentence_to_doc(
                 vocab,
                 lines,
                 ner_tag_pattern,
@@ -128,7 +128,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
     return iob_to_biluo(iob)
 
 
-def doc_from_conllu_sentence(
+def conllu_sentence_to_doc(
     vocab,
     lines,
     ner_tag_pattern,
@@ -215,7 +215,7 @@ def doc_from_conllu_sentence(
         doc[i]._.merged_lemma = lemmas[i]
         doc[i]._.merged_spaceafter = spaces[i]
     ents = get_entities(lines, ner_tag_pattern, ner_map)
-    doc.ents = spans_from_biluo_tags(doc, ents)
+    doc.ents = biluo_tags_to_spans(doc, ents)
 
     if merge_subtokens:
         doc = merge_conllu_subtokens(lines, doc)
@@ -1,13 +1,13 @@
 from wasabi import Printer
 
-from .conll_ner2docs import n_sents_info
+from .conll_ner_to_docs import n_sents_info
 from ...vocab import Vocab
 from ...training import iob_to_biluo, tags_to_entities
 from ...tokens import Doc, Span
 from ...util import minibatch
 
 
-def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
+def iob_to_docs(input_data, n_sents=10, no_print=False, *args, **kwargs):
     """
     Convert IOB files with one sentence per line and tags separated with '|'
     into Doc objects so they can be saved. IOB and IOB2 are accepted.
@@ -1,12 +1,12 @@
 import srsly
 from ..gold_io import json_iterate, json_to_annotations
-from ..example import annotations2doc
+from ..example import annotations_to_doc
 from ..example import _fix_legacy_dict_data, _parse_example_dict_data
 from ...util import load_model
 from ...lang.xx import MultiLanguage
 
 
-def json2docs(input_data, model=None, **kwargs):
+def json_to_docs(input_data, model=None, **kwargs):
     nlp = load_model(model) if model is not None else MultiLanguage()
     if not isinstance(input_data, bytes):
         if not isinstance(input_data, str):
@@ -17,6 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
         for json_para in json_to_annotations(json_doc):
             example_dict = _fix_legacy_dict_data(json_para)
             tok_dict, doc_dict = _parse_example_dict_data(example_dict)
-            doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
+            doc = annotations_to_doc(nlp.vocab, tok_dict, doc_dict)
             docs.append(doc)
     return docs
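A hedged sketch of the renamed converter API shown above, reusing the IOB example data from the updated CLI tests earlier in this diff:

```python
from spacy.training import docs_to_json
from spacy.training.converters import iob_to_docs

data = "I|O like|O London|B-GPE and|O New|B-GPE York|I-GPE City|I-GPE .|O"
docs = iob_to_docs(data, n_sents=10)
assert len(docs) == 1
print(docs_to_json(docs)["id"])  # 0, as asserted in the tests above
```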
@ -7,13 +7,13 @@ from ..tokens.span cimport Span
|
|||
from ..tokens.span import Span
|
||||
from ..attrs import IDS
|
||||
from .align import Alignment
|
||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||
from .iob_utils import spans_from_biluo_tags
|
||||
from .iob_utils import biluo_to_iob, offsets_to_biluo_tags, doc_to_biluo_tags
|
||||
from .iob_utils import biluo_tags_to_spans
|
||||
from ..errors import Errors, Warnings
|
||||
from ..pipeline._parser_internals import nonproj
|
||||
|
||||
|
||||
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||
cpdef Doc annotations_to_doc(vocab, tok_annot, doc_annot):
|
||||
""" Create a Doc from dictionaries with token and doc annotations. """
|
||||
attrs, array = _annot2array(vocab, tok_annot, doc_annot)
|
||||
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
|
||||
|
@ -92,7 +92,7 @@ cdef class Example:
|
|||
tok_dict["SPACY"] = [tok.whitespace_ for tok in predicted]
|
||||
return Example(
|
||||
predicted,
|
||||
annotations2doc(predicted.vocab, tok_dict, doc_dict)
|
||||
annotations_to_doc(predicted.vocab, tok_dict, doc_dict)
|
||||
)
|
||||
|
||||
@property
|
||||
|
@ -176,7 +176,7 @@ cdef class Example:
|
|||
return [None] * len(self.x) # should this be 'missing' instead of 'None' ?
|
||||
x_ents = self.get_aligned_spans_y2x(self.y.ents)
|
||||
# Default to 'None' for missing values
|
||||
x_tags = biluo_tags_from_offsets(
|
||||
x_tags = offsets_to_biluo_tags(
|
||||
self.x,
|
||||
[(e.start_char, e.end_char, e.label_) for e in x_ents],
|
||||
missing=None
|
||||
|
@ -195,7 +195,7 @@ cdef class Example:
|
|||
return {
|
||||
"doc_annotation": {
|
||||
"cats": dict(self.reference.cats),
|
||||
"entities": biluo_tags_from_doc(self.reference),
|
||||
"entities": doc_to_biluo_tags(self.reference),
|
||||
"links": self._links_to_dict()
|
||||
},
|
||||
"token_annotation": {
|
||||
|
@ -295,12 +295,12 @@ def _add_entities_to_doc(doc, ner_data):
|
|||
elif isinstance(ner_data[0], tuple):
|
||||
return _add_entities_to_doc(
|
||||
doc,
|
||||
biluo_tags_from_offsets(doc, ner_data)
|
||||
offsets_to_biluo_tags(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], str) or ner_data[0] is None:
|
||||
return _add_entities_to_doc(
|
||||
doc,
|
||||
spans_from_biluo_tags(doc, ner_data)
|
||||
biluo_tags_to_spans(doc, ner_data)
|
||||
)
|
||||
elif isinstance(ner_data[0], Span):
|
||||
# Ugh, this is super messy. Really hard to set O entities
|
||||
|
@ -388,7 +388,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
|
|||
# This is annoying but to convert the offsets we need a Doc
|
||||
# that has the target tokenization.
|
||||
reference = Doc(vocab, words=words, spaces=spaces)
|
||||
biluo = biluo_tags_from_offsets(reference, biluo_or_offsets)
|
||||
biluo = offsets_to_biluo_tags(reference, biluo_or_offsets)
|
||||
else:
|
||||
biluo = biluo_or_offsets
|
||||
ent_iobs = []
|
||||
|
|
|
@ -3,7 +3,7 @@ import srsly
|
|||
from .. import util
|
||||
from ..errors import Warnings
|
||||
from ..tokens import Doc
|
||||
from .iob_utils import biluo_tags_from_offsets, tags_to_entities
|
||||
from .iob_utils import offsets_to_biluo_tags, tags_to_entities
|
||||
import json
|
||||
|
||||
|
||||
|
@ -32,7 +32,7 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
|
|||
if ent.kb_id_:
|
||||
link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
|
||||
json_para["links"].append(link_dict)
|
||||
biluo_tags = biluo_tags_from_offsets(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
|
||||
attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
|
||||
include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
|
||||
for j, sent in enumerate(doc.sents):
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
from typing import List, Tuple, Iterable, Union, Iterator
|
||||
import warnings
|
||||
|
||||
from ..errors import Errors, Warnings
|
||||
from ..tokens import Span
|
||||
from ..tokens import Span, Doc
|
||||
|
||||
|
||||
def iob_to_biluo(tags):
|
||||
def iob_to_biluo(tags: Iterable[str]) -> List[str]:
|
||||
out = []
|
||||
tags = list(tags)
|
||||
while tags:
|
||||
|
@ -12,7 +14,7 @@ def iob_to_biluo(tags):
|
|||
return out
|
||||
|
||||
|
||||
def biluo_to_iob(tags):
|
||||
def biluo_to_iob(tags: Iterable[str]) -> List[str]:
|
||||
out = []
|
||||
for tag in tags:
|
||||
if tag is None:
|
||||
|
@ -23,12 +25,12 @@ def biluo_to_iob(tags):
|
|||
return out
|
||||
|
||||
|
||||
def _consume_os(tags):
|
||||
def _consume_os(tags: List[str]) -> Iterator[str]:
|
||||
while tags and tags[0] == "O":
|
||||
yield tags.pop(0)
|
||||
|
||||
|
||||
def _consume_ent(tags):
|
||||
def _consume_ent(tags: List[str]) -> List[str]:
|
||||
if not tags:
|
||||
return []
|
||||
tag = tags.pop(0)
|
||||
|
@ -50,15 +52,17 @@ def _consume_ent(tags):
|
|||
return [start] + middle + [end]
|
||||
|
||||
|
||||
def biluo_tags_from_doc(doc, missing="O"):
|
||||
return biluo_tags_from_offsets(
|
||||
def doc_to_biluo_tags(doc: Doc, missing: str = "O"):
|
||||
return offsets_to_biluo_tags(
|
||||
doc,
|
||||
[(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents],
|
||||
missing=missing,
|
||||
)
|
||||
|
||||
|
||||
def biluo_tags_from_offsets(doc, entities, missing="O"):
|
||||
def offsets_to_biluo_tags(
|
||||
doc: Doc, entities: Iterable[Tuple[int, int, Union[str, int]]], missing: str = "O"
|
||||
) -> List[str]:
|
||||
"""Encode labelled spans into per-token tags, using the
|
||||
Begin/In/Last/Unit/Out scheme (BILUO).
|
||||
|
||||
|
@ -69,7 +73,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
|||
the original string.
|
||||
RETURNS (list): A list of unicode strings, describing the tags. Each tag
|
||||
string will be of the form either "", "O" or "{action}-{label}", where
|
||||
action is one of "B", "I", "L", "U". The string "-" is used where the
|
||||
action is one of "B", "I", "L", "U". The missing label is used where the
|
||||
entity offsets don't align with the tokenization in the `Doc` object.
|
||||
The training algorithm will view these as missing values. "O" denotes a
|
||||
non-entity token. "B" denotes the beginning of a multi-token entity,
|
||||
|
@ -80,12 +84,11 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
|||
>>> text = 'I like London.'
|
||||
>>> entities = [(len('I like '), len('I like London'), 'LOC')]
|
||||
>>> doc = nlp.tokenizer(text)
|
||||
>>> tags = biluo_tags_from_offsets(doc, entities)
|
||||
>>> tags = offsets_to_biluo_tags(doc, entities)
|
||||
>>> assert tags == ["O", "O", 'U-LOC', "O"]
|
||||
"""
|
||||
# Ensure no overlapping entity labels exist
|
||||
tokens_in_ents = {}
|
||||
|
||||
starts = {token.idx: token.i for token in doc}
|
||||
ends = {token.idx + len(token): token.i for token in doc}
|
||||
biluo = ["-" for _ in doc]
|
||||
|
@ -109,7 +112,6 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
|||
)
|
||||
)
|
||||
tokens_in_ents[token_index] = (start_char, end_char, label)
|
||||
|
||||
start_token = starts.get(start_char)
|
||||
end_token = ends.get(end_char)
|
||||
# Only interested if the tokenization is correct
|
||||
|
@ -143,7 +145,7 @@ def biluo_tags_from_offsets(doc, entities, missing="O"):
|
|||
return biluo
|
||||
|
||||
|
||||
def spans_from_biluo_tags(doc, tags):
|
||||
def biluo_tags_to_spans(doc: Doc, tags: Iterable[str]) -> List[Span]:
|
||||
"""Encode per-token tags following the BILUO scheme into Span object, e.g.
|
||||
to overwrite the doc.ents.
|
||||
|
||||
|
@ -161,7 +163,9 @@ def spans_from_biluo_tags(doc, tags):
|
|||
return spans
|
||||
|
||||
|
||||
def offsets_from_biluo_tags(doc, tags):
|
||||
def biluo_tags_to_offsets(
|
||||
doc: Doc, tags: Iterable[str]
|
||||
) -> List[Tuple[int, int, Union[str, int]]]:
|
||||
"""Encode per-token tags following the BILUO scheme into entity offsets.
|
||||
|
||||
doc (Doc): The document that the BILUO tags refer to.
|
||||
|
@ -172,12 +176,12 @@ def offsets_from_biluo_tags(doc, tags):
|
|||
`end` will be character-offset integers denoting the slice into the
|
||||
original string.
|
||||
"""
|
||||
spans = spans_from_biluo_tags(doc, tags)
|
||||
spans = biluo_tags_to_spans(doc, tags)
|
||||
return [(span.start_char, span.end_char, span.label_) for span in spans]
|
||||
|
||||
|
||||
def tags_to_entities(tags):
|
||||
""" Note that the end index returned by this function is inclusive.
|
||||
def tags_to_entities(tags: Iterable[str]) -> List[Tuple[str, int, int]]:
|
||||
"""Note that the end index returned by this function is inclusive.
|
||||
To use it for Span creation, increment the end by 1."""
|
||||
entities = []
|
||||
start = None
|
||||
|
@ -209,3 +213,9 @@ def tags_to_entities(tags):
|
|||
else:
|
||||
raise ValueError(Errors.E068.format(tag=tag))
|
||||
return entities
|
||||
|
||||
|
||||
# Fallbacks to make backwards-compat easier
|
||||
offsets_from_biluo_tags = biluo_tags_to_offsets
|
||||
spans_from_biluo_tags = biluo_tags_to_spans
|
||||
biluo_tags_from_offsets = offsets_to_biluo_tags
|
||||
|
|
|
@ -11,9 +11,12 @@ def console_logger():
|
|||
def setup_printer(
|
||||
nlp: "Language",
|
||||
) -> Tuple[Callable[[Dict[str, Any]], None], Callable]:
|
||||
score_cols = list(nlp.config["training"]["score_weights"])
|
||||
# we assume here that only components are enabled that should be trained & logged
|
||||
logged_pipes = nlp.pipe_names
|
||||
score_weights = nlp.config["training"]["score_weights"]
|
||||
score_cols = [col for col, value in score_weights.items() if value is not None]
|
||||
score_widths = [max(len(col), 6) for col in score_cols]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in nlp.pipe_names]
|
||||
loss_cols = [f"Loss {pipe}" for pipe in logged_pipes]
|
||||
loss_widths = [max(len(col), 8) for col in loss_cols]
|
||||
table_header = ["E", "#"] + loss_cols + score_cols + ["Score"]
|
||||
table_header = [col.upper() for col in table_header]
|
||||
|
@ -26,7 +29,7 @@ def console_logger():
|
|||
try:
|
||||
losses = [
|
||||
"{0:.2f}".format(float(info["losses"][pipe_name]))
|
||||
for pipe_name in nlp.pipe_names
|
||||
for pipe_name in logged_pipes
|
||||
]
|
||||
except KeyError as e:
|
||||
raise KeyError(
|
||||
|
@ -38,10 +41,15 @@ def console_logger():
|
|||
) from None
|
||||
scores = []
|
||||
for col in score_cols:
|
||||
score = float(info["other_scores"].get(col, 0.0))
|
||||
if col != "speed":
|
||||
score *= 100
|
||||
scores.append("{0:.2f}".format(score))
|
||||
score = info["other_scores"].get(col, 0.0)
|
||||
try:
|
||||
score = float(score)
|
||||
if col != "speed":
|
||||
score *= 100
|
||||
scores.append("{0:.2f}".format(score))
|
||||
except TypeError:
|
||||
err = Errors.E916.format(name=col, score_type=type(score))
|
||||
raise ValueError(err) from None
|
||||
data = (
|
||||
[info["epoch"], info["step"]]
|
||||
+ losses
|
||||
|
|
|
@@ -61,7 +61,7 @@ LEXEME_NORM_LANGS = ["da", "de", "el", "en", "id", "lb", "pt", "ru", "sr", "ta",
 # Default order of sections in the config.cfg. Not all sections needs to exist,
 # and additional sections are added at the end, in alphabetical order.
 # fmt: off
-CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "training", "pretraining"]
+CONFIG_SECTION_ORDER = ["paths", "variables", "system", "nlp", "components", "corpora", "training", "pretraining"]
 # fmt: on
 
 
@ -1202,21 +1202,38 @@ def get_arg_names(func: Callable) -> List[str]:
|
|||
return list(set([*argspec.args, *argspec.kwonlyargs]))
|
||||
|
||||
|
||||
def combine_score_weights(weights: List[Dict[str, float]]) -> Dict[str, float]:
|
||||
def combine_score_weights(
|
||||
weights: List[Dict[str, float]],
|
||||
overrides: Dict[str, Optional[Union[float, int]]] = SimpleFrozenDict(),
|
||||
) -> Dict[str, float]:
|
||||
"""Combine and normalize score weights defined by components, e.g.
|
||||
{"ents_r": 0.2, "ents_p": 0.3, "ents_f": 0.5} and {"some_other_score": 1.0}.
|
||||
|
||||
weights (List[dict]): The weights defined by the components.
|
||||
overrides (Dict[str, Optional[Union[float, int]]]): Existing scores that
|
||||
should be preserved.
|
||||
RETURNS (Dict[str, float]): The combined and normalized weights.
|
||||
"""
|
||||
# We first need to extract all None/null values for score weights that
|
||||
# shouldn't be shown in the table *or* be weighted
|
||||
result = {}
|
||||
all_weights = []
|
||||
for w_dict in weights:
|
||||
filtered_weights = {}
|
||||
for key, value in w_dict.items():
|
||||
value = overrides.get(key, value)
|
||||
if value is None:
|
||||
result[key] = None
|
||||
else:
|
||||
filtered_weights[key] = value
|
||||
all_weights.append(filtered_weights)
|
||||
for w_dict in all_weights:
|
||||
# We need to account for weights that don't sum to 1.0 and normalize
|
||||
# the score weights accordingly, then divide score by the number of
|
||||
# components.
|
||||
total = sum(w_dict.values())
|
||||
for key, value in w_dict.items():
|
||||
weight = round(value / total / len(weights), 2)
|
||||
weight = round(value / total / len(all_weights), 2)
|
||||
result[key] = result.get(key, 0.0) + weight
|
||||
return result
|
||||
|
||||
|
|
|
@ -414,7 +414,8 @@ one component.
|
|||
> ```ini
|
||||
> [model]
|
||||
> @architectures = "spacy.TransitionBasedParser.v1"
|
||||
> nr_feature_tokens = 6
|
||||
> state_type = "ner"
|
||||
> extra_state_tokens = false
|
||||
> hidden_width = 64
|
||||
> maxout_pieces = 2
|
||||
>
|
||||
|
@ -446,15 +447,16 @@ consists of either two or three subnetworks:
|
|||
state representation. If not present, the output from the lower model is used
|
||||
as action scores directly.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
|
||||
| `hidden_width` | The width of the hidden layer. ~~int~~ |
|
||||
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
|
||||
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
|
||||
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `state_type` | Which task to extract features for. Possible values are "ner" and "parser". ~~str~~ |
|
||||
| `extra_state_tokens` | Whether to use an expanded feature set when extracting the state tokens. Slightly slower, but sometimes improves accuracy slightly. Defaults to `False`. ~~bool~~ |
|
||||
| `hidden_width` | The width of the hidden layer. ~~int~~ |
|
||||
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
|
||||
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
|
||||
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
|
||||
|
||||
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
|
||||
|
||||
|
|
|
@@ -275,7 +275,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 > entity label, prefixed by the BILUO marker. For example `"B-ORG"` describes
 > the first token of a multi-token `ORG` entity and `"U-PERSON"` a single token
 > representing a `PERSON` entity. The
-> [`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets) function
+> [`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) function
 > can help you convert entity offsets to the right format.
 
 ```python
||||
|
|
|
@ -145,17 +145,16 @@ examples, see the
|
|||
> )
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | The name of the component factory. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
|
||||
| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
| Name | Description |
|
||||
| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | The name of the component factory. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
|
||||
| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
|
||||
## Language.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -1036,12 +1035,12 @@ provided by the [`@Language.component`](/api/language#component) or
|
|||
component is defined and stored on the `Language` class for each component
|
||||
instance and factory instance.
|
||||
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `factory` | The name of the registered component factory. ~~str~~ |
|
||||
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
|
||||
| Name | Description |
|
||||
| ----------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory` | The name of the registered component factory. ~~str~~ |
|
||||
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
|
||||
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Based on the `default_score_weights` and used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
|
|
|
@ -619,7 +619,7 @@ sequences in the batch.
|
|||
|
||||
## Training data and alignment {#gold source="spacy/training"}
|
||||
|
||||
### training.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"}
|
||||
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
|
||||
|
||||
Encode labelled spans into per-token tags, using the
|
||||
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
|
||||
|
@ -632,14 +632,20 @@ the beginning of a multi-token entity, `I` the inside of an entity of three or
|
|||
more tokens, and `L` the end of an entity of two or more tokens. `U` denotes a
|
||||
single-token entity.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning" id="biluo_tags_from_offsets">
|
||||
|
||||
This method was previously available as `spacy.gold.biluo_tags_from_offsets`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.training import biluo_tags_from_offsets
|
||||
> from spacy.training import offsets_to_biluo_tags
|
||||
>
|
||||
> doc = nlp("I like London.")
|
||||
> entities = [(7, 13, "LOC")]
|
||||
> tags = biluo_tags_from_offsets(doc, entities)
|
||||
> tags = offsets_to_biluo_tags(doc, entities)
|
||||
> assert tags == ["O", "O", "U-LOC", "O"]
|
||||
> ```
|
||||
|
||||
|
@ -647,21 +653,28 @@ single-token entity.
|
|||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ |
|
||||
| `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
|
||||
| `missing` | The label used for missing values, e.g. if tokenization doesn't align with the entity offsets. Defaults to `"O"`. ~~str~~ |
|
||||
| **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
|
||||
|
||||
### training.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
|
||||
### training.biluo_tags_to_offsets {#biluo_tags_to_offsets tag="function"}
|
||||
|
||||
Encode per-token tags following the
|
||||
[BILUO scheme](/usage/linguistic-features#accessing-ner) into entity offsets.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning" id="offsets_from_biluo_tags">
|
||||
|
||||
This method was previously available as `spacy.gold.offsets_from_biluo_tags`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.training import offsets_from_biluo_tags
|
||||
> from spacy.training import biluo_tags_to_offsets
|
||||
>
|
||||
> doc = nlp("I like London.")
|
||||
> tags = ["O", "O", "U-LOC", "O"]
|
||||
> entities = offsets_from_biluo_tags(doc, tags)
|
||||
> entities = biluo_tags_to_offsets(doc, tags)
|
||||
> assert entities == [(7, 13, "LOC")]
|
||||
> ```
|
||||
|
||||
|
@ -671,21 +684,27 @@ Encode per-token tags following the
|
|||
| `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
|
||||
| **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |
|
||||
|
||||
### training.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
|
||||
### training.biluo_tags_to_spans {#biluo_tags_to_spans tag="function" new="2.1"}
|
||||
|
||||
Encode per-token tags following the
|
||||
[BILUO scheme](/usage/linguistic-features#accessing-ner) into
|
||||
[`Span`](/api/span) objects. This can be used to create entity spans from
|
||||
token-based tags, e.g. to overwrite the `doc.ents`.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning" id="spans_from_biluo_tags">
|
||||
|
||||
This method was previously available as `spacy.gold.spans_from_biluo_tags`.
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.training import spans_from_biluo_tags
|
||||
> from spacy.training import biluo_tags_to_spans
|
||||
>
|
||||
> doc = nlp("I like London.")
|
||||
> tags = ["O", "O", "U-LOC", "O"]
|
||||
> doc.ents = spans_from_biluo_tags(doc, tags)
|
||||
> doc.ents = biluo_tags_to_spans(doc, tags)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
|
|
|
@ -1,24 +1,19 @@
|
|||
import { Help } from 'components/typography'; import Link from 'components/link'
|
||||
|
||||
<!-- TODO: update, add project template -->
|
||||
<!-- TODO: update numbers -->
|
||||
|
||||
<figure>
|
||||
|
||||
| System | Parser | Tagger | NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
|
||||
| ------------------------------------------------------------------------- | ----------------: | ----------------: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
|
||||
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
|
||||
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
|
||||
| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
|
||||
| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | _n/a_<sup>2</sup> | _n/a_<sup>2</sup> | 88.8 | 234 | 2k |
|
||||
| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link> | - | 97.9 | 89.3 | | |
|
||||
| Pipeline | Parser | Tagger | NER | WPS<br />CPU <Help>words per second on CPU, higher is better</Help> | WPS<br/>GPU <Help>words per second on GPU, higher is better</Help> |
|
||||
| ---------------------------------------------------------- | -----: | -----: | ---: | ------------------------------------------------------------------: | -----------------------------------------------------------------: |
|
||||
| [`en_core_web_trf`](/models/en#en_core_web_trf) (spaCy v3) | | | | | 6k |
|
||||
| [`en_core_web_lg`](/models/en#en_core_web_lg) (spaCy v3) | | | | | |
|
||||
| `en_core_web_lg` (spaCy v2) | 91.9 | 97.2 | 85.9 | 10k | |
|
||||
|
||||
<figcaption class="caption">
|
||||
|
||||
**Accuracy and speed on the
|
||||
[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.**<br />**1. **
|
||||
[Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf). **2. ** _Coming soon_:
|
||||
Qi et al. don't report parsing and tagging results on OntoNotes. We're working
|
||||
on training Stanza on this corpus to allow direct comparison.
|
||||
**Full pipeline accuracy and speed** on the
|
||||
[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) corpus.
|
||||
|
||||
</figcaption>
|
||||
|
||||
|
@ -26,18 +21,24 @@ on training Stanza on this corpus to allow direct comparison.
|
|||
|
||||
<figure>
|
||||
|
||||
| System | POS | UAS | LAS |
|
||||
| ------------------------------------------------------------------------------ | ---: | ---: | ---: |
|
||||
| spaCy RoBERTa (2020) | | | |
|
||||
| spaCy CNN (2020) | | | |
|
||||
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.3 | 97.4 | 96.3 |
|
||||
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.3 | 97.2 | 95.7 |
|
||||
| Named Entity Recognition System | OntoNotes | CoNLL '03 |
|
||||
| ------------------------------------------------------------------------------ | --------: | --------: |
|
||||
| spaCy RoBERTa (2020) | | 92.2 |
|
||||
| spaCy CNN (2020) | 85.3 | 88.4 |
|
||||
| spaCy CNN (2017) | 86.4 | |
|
||||
| [Stanza](https://stanfordnlp.github.io/stanza/) (StanfordNLP)<sup>1</sup> | 88.8 | 92.1 |
|
||||
| <Link to="https://github.com/flairNLP/flair" hideIcon>Flair</Link><sup>2</sup> | 89.7 | 93.1 |
|
||||
| BERT Base<sup>3</sup> | - | 92.4 |
|
||||
|
||||
<figcaption class="caption">
|
||||
|
||||
**Accuracy on the Penn Treebank.** See
|
||||
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
|
||||
results.
|
||||
**Named entity recognition accuracy** on the
|
||||
[OntoNotes 5.0](https://catalog.ldc.upenn.edu/LDC2013T19) and
|
||||
[CoNLL-2003](https://www.aclweb.org/anthology/W03-0419.pdf) corpora. See
|
||||
[NLP-progress](http://nlpprogress.com/english/named_entity_recognition.html) for
|
||||
more results. **1. ** [Qi et al. (2020)](https://arxiv.org/pdf/2003.07082.pdf).
|
||||
**2. ** [Akbik et al. (2018)](https://www.aclweb.org/anthology/C18-1139/). **3.
|
||||
** [Devlin et al. (2018)](https://arxiv.org/abs/1810.04805).
|
||||
|
||||
</figcaption>
|
||||
|
||||
|
|
|
@ -235,8 +235,6 @@ The `Transformer` component sets the
|
|||
[`Doc._.trf_data`](/api/transformer#custom_attributes) extension attribute,
|
||||
which lets you access the transformers outputs at runtime.
|
||||
|
||||
<!-- TODO: update/confirm once we have final models trained -->
|
||||
|
||||
```cli
|
||||
$ python -m spacy download en_core_trf_lg
|
||||
```
|
||||
|
@ -448,7 +446,8 @@ factory = "ner"
|
|||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
state_type = "ner"
|
||||
extra_state_tokens = false
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
|
|
@ -61,12 +61,25 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
|
|||
|
||||
<Benchmarks />
|
||||
|
||||
<Project id="benchmarks/parsing_penn_treebank">
|
||||
<figure>
|
||||
|
||||
The easiest way to reproduce spaCy's benchmarks on the Penn Treebank is to clone
|
||||
our project template.
|
||||
| Dependency Parsing System | UAS | LAS |
|
||||
| ------------------------------------------------------------------------------ | ---: | ---: |
|
||||
| spaCy RoBERTa (2020)<sup>1</sup> | 96.8 | 95.0 |
|
||||
| spaCy CNN (2020)<sup>1</sup> | 93.7 | 91.8 |
|
||||
| [Mrini et al.](https://khalilmrini.github.io/Label_Attention_Layer.pdf) (2019) | 97.4 | 96.3 |
|
||||
| [Zhou and Zhao](https://www.aclweb.org/anthology/P19-1230/) (2019) | 97.2 | 95.7 |
|
||||
|
||||
</Project>
|
||||
<figcaption class="caption">
|
||||
|
||||
**Dependency parsing accuracy** on the Penn Treebank. See
|
||||
[NLP-progress](http://nlpprogress.com/english/dependency_parsing.html) for more
|
||||
results. **1. ** Project template:
|
||||
[`benchmarks/parsing_penn_treebank`](%%GITHUB_PROJECTS/benchmarks/parsing_penn_treebank).
|
||||
|
||||
</figcaption>
|
||||
|
||||
</figure>
|
||||
|
||||
<!-- TODO: ## Citing spaCy {#citation}
|
||||
|
||||
|
|
|
@@ -1654,9 +1654,12 @@ The [`SentenceRecognizer`](/api/sentencerecognizer) is a simple statistical
 component that only provides sentence boundaries. Along with being faster and
 smaller than the parser, its primary advantage is that it's easier to train
 because it only requires annotated sentence boundaries rather than full
-dependency parses.
-
-<!-- TODO: update/confirm usage once we have final models trained -->
+dependency parses. spaCy's [trained pipelines](/models) include both a parser
+and a trained sentence segmenter, which is
+[disabled](/usage/processing-pipelines#disabling) by default. If you only need
+sentence boundaries and no parser, you can use the `enable` and `disable`
+arguments on [`spacy.load`](/api/top-level#spacy.load) to enable the senter and
+disable the parser.
 
 > #### senter vs. parser
 >
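A minimal sketch of the workflow described in the added text. The pipeline name `en_core_web_sm` is only an illustrative assumption; any trained pipeline that ships both a parser and a senter would work the same way.

```python
import spacy

# Keep only sentence boundaries: drop the parser and switch on the
# statistical sentence recognizer, which ships disabled by default.
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
nlp.enable_pipe("senter")
doc = nlp("This is a sentence. This is another one.")
print([sent.text for sent in doc.sents])
```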
@@ -253,8 +253,6 @@ different mechanisms you can use:
 Disabled and excluded component names can be provided to
 [`spacy.load`](/api/top-level#spacy.load) as a list.
 
-<!-- TODO: update with info on our models shipped with optional components -->
-
 > #### 💡 Optional pipeline components
 >
 > The `disable` mechanism makes it easy to distribute pipeline packages with
@@ -262,6 +260,11 @@ Disabled and excluded component names can be provided to
 > your pipeline may include a statistical _and_ a rule-based component for
 > sentence segmentation, and you can choose which one to run depending on your
 > use case.
+>
+> For example, spaCy's [trained pipelines](/models) like
+> [`en_core_web_sm`](/models/en#en_core_web_sm) contain both a `parser` and
+> `senter` that perform sentence segmentation, but the `senter` is disabled by
+> default.
 
 ```python
 # Load the pipeline without the entity recognizer
@@ -1501,7 +1504,7 @@ add those entities to the `doc.ents`, you can wrap it in a custom pipeline
 component function and pass it the token texts from the `Doc` object received by
 the component.
 
-The [`training.spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags) is very
+The [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans) is very
 helpful here, because it takes a `Doc` object and token-based BILUO tags and
 returns a sequence of `Span` objects in the `Doc` with added labels. So all your
 wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
@@ -1516,14 +1519,14 @@ wrapper has to do is compute the entity spans and overwrite the `doc.ents`.
 ```python
 ### {highlight="1,8-9"}
 import your_custom_entity_recognizer
-from spacy.training import offsets_from_biluo_tags
+from spacy.training import biluo_tags_to_spans
 from spacy.language import Language
 
 @Language.component("custom_ner_wrapper")
 def custom_ner_wrapper(doc):
     words = [token.text for token in doc]
     custom_entities = your_custom_entity_recognizer(words)
-    doc.ents = spans_from_biluo_tags(doc, custom_entities)
+    doc.ents = biluo_tags_to_spans(doc, custom_entities)
     return doc
 ```
 
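For context, a small self-contained sketch of the `disable`/`enable` mechanism referenced in the hunk above. The pipeline name is an illustrative assumption, not part of the commit.

```python
import spacy

# Load the pipeline with "ner" present but switched off.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
print(nlp.pipe_names)       # enabled components only -- no "ner" here
print(nlp.component_names)  # all components, including the disabled "ner"
nlp.enable_pipe("ner")      # switch it back on when needed
```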
@@ -213,7 +213,7 @@ a quick web demo. It looks pretty similar to a config file used to define CI
 pipelines.
 
 ```yaml
-https://github.com/explosion/projects/tree/v3/pipelines/tagger_parser_ud/project.yml
+%%GITHUB_PROJECTS/pipelines/tagger_parser_ud/project.yml
 ```
 
 | Section | Description |
@@ -733,7 +733,10 @@ workflows, but only one can be tracked by DVC.
 <Infobox title="This section is still under construction" emoji="🚧" variant="warning">
 
 The Prodigy integration will require a nightly version of Prodigy that supports
-spaCy v3+.
+spaCy v3+. You can already use annotations created with Prodigy in spaCy v3 by
+exporting your data with
+[`data-to-spacy`](https://prodi.gy/docs/recipes#data-to-spacy) and running
+[`spacy convert`](/api/cli#convert) to convert it to the binary format.
 
 </Infobox>
 
||||
|
|
|
@@ -470,6 +470,7 @@ score.
 ```ini
 [training.score_weights]
 dep_las = 0.4
+dep_uas = null
 ents_f = 0.4
 tag_acc = 0.2
 token_acc = 0.0
@@ -481,9 +482,9 @@ you generate a config for a given pipeline, the score weights are generated by
 combining and normalizing the default score weights of the pipeline components.
 The default score weights are defined by each pipeline component via the
 `default_score_weights` setting on the
 [`@Language.component`](/api/language#component) or
-[`@Language.factory`](/api/language#factory). By default, all pipeline
-components are weighted equally.
+[`@Language.factory`](/api/language#factory) decorator. By default, all pipeline
+components are weighted equally. If a score weight is set to `null`, it will be
+excluded from the logs and the score won't be weighted.
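A hedged sketch of how a component factory might declare such weights, including a `None` entry that is excluded from logging and weighting. The factory name and score names here are made up for illustration and are not part of the commit.

```python
from spacy.language import Language

@Language.factory(
    "my_scorer_component",
    default_score_weights={"my_score": 1.0, "my_debug_score": None},
)
def create_my_scorer_component(nlp, name):
    def my_scorer_component(doc):
        # A no-op component; only the score weight declaration matters here.
        return doc
    return my_scorer_component
```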
|
||||
|
||||
<Accordion title="Understanding the training output and score types" spaced>
|
||||
|
||||
|
|
|
@ -88,7 +88,10 @@ import Benchmarks from 'usage/\_benchmarks-models.md'
|
|||
- **Architectures: ** [TransformerModel](/api/architectures#TransformerModel),
|
||||
[TransformerListener](/api/architectures#TransformerListener),
|
||||
[Tok2VecTransformer](/api/architectures#Tok2VecTransformer)
|
||||
- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf)
|
||||
- **Trained Pipelines:** [`en_core_web_trf`](/models/en#en_core_web_trf),
|
||||
[`de_dep_news_trf`](/models/de#de_dep_news_trf),
|
||||
[`es_dep_news_trf`](/models/es#es_dep_news_trf),
|
||||
[`fr_dep_news_trf`](/models/fr#fr_dep_news_trf)
|
||||
- **Implementation:**
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers)
|
||||
|
||||
|
@ -548,17 +551,19 @@ Note that spaCy v3.0 now requires **Python 3.6+**.
|
|||
|
||||
### Removed or renamed API {#incompat-removed}
|
||||
|
||||
| Removed | Replacement |
|
||||
| -------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------ |
|
||||
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
|
||||
| `GoldParse` | [`Example`](/api/example) |
|
||||
| `GoldCorpus` | [`Corpus`](/api/corpus) |
|
||||
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
|
||||
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
|
||||
| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) |
|
||||
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
|
||||
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
|
||||
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated |
|
||||
| Removed | Replacement |
|
||||
| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `Language.disable_pipes` | [`Language.select_pipes`](/api/language#select_pipes), [`Language.disable_pipe`](/api/language#disable_pipe) |
|
||||
| `Doc.is_tagged`, `Doc.is_parsed`, ... | [`Doc.has_annotation`](/api/doc#has_annotation) |
|
||||
| `GoldParse` | [`Example`](/api/example) |
|
||||
| `GoldCorpus` | [`Corpus`](/api/corpus) |
|
||||
| `KnowledgeBase.load_bulk`, `KnowledgeBase.dump` | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk) |
|
||||
| `Matcher.pipe`, `PhraseMatcher.pipe` | not needed |
|
||||
| `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) |
|
||||
| `spacy init-model` | [`spacy init vocab`](/api/cli#init-vocab) |
|
||||
| `spacy debug-data` | [`spacy debug data`](/api/cli#debug-data) |
|
||||
| `spacy profile` | [`spacy debug profile`](/api/cli#debug-profile) |
|
||||
| `spacy link`, `util.set_data_path`, `util.get_data_path` | not needed, symlinks are deprecated |
|
||||
|
||||
The following deprecated methods, attributes and arguments were removed in v3.0.
|
||||
Most of them have been **deprecated for a while** and many would previously
|
||||
|
@ -968,16 +973,17 @@ python -m spacy package ./output ./packages
|
|||
|
||||
#### Data utilities and gold module {#migrating-gold}
|
||||
|
||||
The `spacy.gold` module has been renamed to `spacy.training`. This mostly
|
||||
affects internals, but if you've been using the span offset conversion utilities
|
||||
[`biluo_tags_from_offsets`](/api/top-level#biluo_tags_from_offsets),
|
||||
[`offsets_from_biluo_tags`](/api/top-level#offsets_from_biluo_tags) or
|
||||
[`spans_from_biluo_tags`](/api/top-level#spans_from_biluo_tags), you'll have to
|
||||
change your imports:
|
||||
The `spacy.gold` module has been renamed to `spacy.training` and the conversion
|
||||
utilities now follow the naming format of `x_to_y`. This mostly affects
|
||||
internals, but if you've been using the span offset conversion utilities
|
||||
[`offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags),
|
||||
[`biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets) or
|
||||
[`biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), you'll have to
|
||||
change your names and imports:
|
||||
|
||||
```diff
|
||||
- from spacy.gold import biluo_tags_from_offsets, spans_from_biluo_tags
|
||||
+ from spacy.training import biluo_tags_from_offsets, spans_from_biluo_tags
|
||||
- from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, spans_from_biluo_tags
|
||||
+ from spacy.training import offsets_to_biluo_tags, biluo_tags_to_offsets, biluo_tags_to_spans
|
||||
```
|
||||
|
||||
#### Migration notes for plugin maintainers {#migrating-plugins}
|
||||
|
|
|
@ -24,6 +24,7 @@ const branch = isNightly ? 'develop' : 'master'
|
|||
// Those variables are going to be replaced in the Markdown, e.g. %%GITHUB_SPACY
|
||||
const replacements = {
|
||||
GITHUB_SPACY: `https://github.com/explosion/spaCy/tree/${branch}`,
|
||||
GITHUB_PROJECTS: `https://github.com/${site.projectsRepo}`,
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|

@@ -1,21 +1,11 @@
{
"languages": [
{
"code": "zh",
"name": "Chinese",
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
"dependencies": [
{
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
},
{ "code": "af", "name": "Afrikaans" },
{ "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "bn", "name": "Bengali", "has_examples": true },
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ "code": "cs", "name": "Czech", "has_examples": true },
{
"code": "da",
"name": "Danish",

@@ -23,39 +13,10 @@
"has_examples": true,
"models": ["da_core_news_sm", "da_core_news_md", "da_core_news_lg"]
},
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg"],
"starters": [
"en_vectors_web_lg",
"en_trf_bertbaseuncased_lg",
"en_trf_robertabase_lg",
"en_trf_distilbertbaseuncased_lg",
"en_trf_xlnetbasecased_lg"
],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg"],
"example": "C'est une phrase.",
"has_examples": true
},
{
"code": "de",
"name": "German",
"models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg"],
"starters": ["de_trf_bertbasecased_lg"],
"models": ["de_core_news_sm", "de_core_news_md", "de_core_news_lg", "de_dep_news_trf"],
"example": "Dies ist ein Satz.",
"has_examples": true
},

@@ -66,6 +27,46 @@
"example": "Αυτή είναι μια πρόταση.",
"has_examples": true
},
{
"code": "en",
"name": "English",
"models": ["en_core_web_sm", "en_core_web_md", "en_core_web_lg", "en_core_web_trf"],
"starters": ["en_vectors_web_lg"],
"example": "This is a sentence.",
"has_examples": true
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg", "es_dep_news_trf"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "et", "name": "Estonian" },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "fa", "name": "Persian", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{
"code": "fr",
"name": "French",
"models": ["fr_core_news_sm", "fr_core_news_md", "fr_core_news_lg", "fr_dep_news_trf"],
"example": "C'est une phrase.",
"has_examples": true
},
{ "code": "ga", "name": "Irish" },
{ "code": "gu", "name": "Gujarati", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
{ "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{ "code": "hy", "name": "Armenian", "has_examples": true },
{
"code": "id",
"name": "Indonesian",
"example": "Ini adalah sebuah kalimat.",
"has_examples": true
},
{ "code": "is", "name": "Icelandic" },
{
"code": "it",
"name": "Italian",

@@ -88,12 +89,37 @@
"example": "これは文章です。",
"has_examples": true
},
{ "code": "kn", "name": "Kannada", "has_examples": true },
{
"code": "ko",
"name": "Korean",
"dependencies": [
{
"name": "mecab-ko",
"url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
},
{ "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
{ "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
],
"example": "이것은 문장입니다.",
"has_examples": true
},
{ "code": "lb", "name": "Luxembourgish", "has_examples": true },
{
"code": "lij",
"name": "Ligurian",
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{
"code": "lt",
"name": "Lithuanian",
"has_examples": true,
"models": ["lt_core_news_sm", "lt_core_news_md", "lt_core_news_lg"]
},
{ "code": "lv", "name": "Latvian" },
{ "code": "ml", "name": "Malayalam", "has_examples": true },
{ "code": "mr", "name": "Marathi" },
{
"code": "nb",
"name": "Norwegian Bokmål",

@@ -101,6 +127,14 @@
"has_examples": true,
"models": ["nb_core_news_sm", "nb_core_news_md", "nb_core_news_lg"]
},
{ "code": "ne", "name": "Nepali", "has_examples": true },
{
"code": "nl",
"name": "Dutch",
"models": ["nl_core_news_sm", "nl_core_news_md", "nl_core_news_lg"],
"example": "Dit is een zin.",
"has_examples": true
},
{
"code": "pl",
"name": "Polish",

@@ -122,69 +156,26 @@
"has_examples": true,
"models": ["ro_core_news_sm", "ro_core_news_md", "ro_core_news_lg"]
},
{
"code": "es",
"name": "Spanish",
"models": ["es_core_news_sm", "es_core_news_md", "es_core_news_lg"],
"example": "Esto es una frase.",
"has_examples": true
},
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "fi", "name": "Finnish", "has_examples": true },
{ "code": "hu", "name": "Hungarian", "example": "Ez egy mondat.", "has_examples": true },
{
"code": "ru",
"name": "Russian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{
"code": "uk",
"name": "Ukrainian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{ "code": "hr", "name": "Croatian", "has_examples": true },
{ "code": "eu", "name": "Basque", "has_examples": true },
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "ca", "name": "Catalan", "example": "Això és una frase.", "has_examples": true },
{ "code": "he", "name": "Hebrew", "example": "זהו משפט.", "has_examples": true },
{ "code": "ar", "name": "Arabic", "example": "هذه جملة", "has_examples": true },
{ "code": "fa", "name": "Persian", "has_examples": true },
{ "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
{ "code": "tt", "name": "Tatar", "has_examples": true },
{ "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
{ "code": "sa", "name": "Sanskrit", "has_examples": true },
{ "code": "si", "name": "Sinhala", "example": "මෙය වාක්යයකි.", "has_examples": true },
{ "code": "ga", "name": "Irish" },
{ "code": "bn", "name": "Bengali", "has_examples": true },
{ "code": "hi", "name": "Hindi", "example": "यह एक वाक्य है।", "has_examples": true },
{ "code": "mr", "name": "Marathi" },
{ "code": "kn", "name": "Kannada" },
{ "code": "ta", "name": "Tamil", "has_examples": true },
{
"code": "id",
"name": "Indonesian",
"example": "Ini adalah sebuah kalimat.",
"has_examples": true
},
{ "code": "tl", "name": "Tagalog" },
{ "code": "af", "name": "Afrikaans" },
{ "code": "bg", "name": "Bulgarian", "example": "Това е изречение", "has_examples": true },
{ "code": "cs", "name": "Czech" },
{ "code": "is", "name": "Icelandic" },
{ "code": "lv", "name": "Latvian" },
{ "code": "sr", "name": "Serbian" },
{ "code": "sk", "name": "Slovak" },
{ "code": "sk", "name": "Slovak", "has_examples": true },
{ "code": "sl", "name": "Slovenian" },
{ "code": "lb", "name": "Luxembourgish" },
{
"code": "sq",
"name": "Albanian",
"example": "Kjo është një fjali.",
"has_examples": true
},
{ "code": "et", "name": "Estonian" },
{ "code": "sr", "name": "Serbian", "has_examples": true },
{ "code": "sv", "name": "Swedish", "has_examples": true },
{ "code": "ta", "name": "Tamil", "has_examples": true },
{ "code": "te", "name": "Telugu", "example": "ఇది ఒక వాక్యం.", "has_examples": true },
{
"code": "th",
"name": "Thai",

@@ -194,51 +185,43 @@
"example": "นี่คือประโยค",
"has_examples": true
},
{ "code": "tl", "name": "Tagalog" },
{ "code": "tr", "name": "Turkish", "example": "Bu bir cümledir.", "has_examples": true },
{ "code": "tt", "name": "Tatar", "has_examples": true },
{
"code": "ko",
"name": "Korean",
"dependencies": [
{
"name": "mecab-ko",
"url": "https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md"
},
{ "name": "mecab-ko-dic", "url": "https://bitbucket.org/eunjeon/mecab-ko-dic" },
{ "name": "natto-py", "url": "https://github.com/buruzaemon/natto-py" }
],
"example": "이것은 문장입니다.",
"has_examples": true
"code": "uk",
"name": "Ukrainian",
"has_examples": true,
"dependencies": [{ "name": "pymorphy2", "url": "https://github.com/kmike/pymorphy2" }]
},
{ "code": "ur", "name": "Urdu", "example": "یہ ایک جملہ ہے", "has_examples": true },
{
"code": "vi",
"name": "Vietnamese",
"dependencies": [{ "name": "Pyvi", "url": "https://github.com/trungtv/pyvi" }]
},
{
"code": "lij",
"name": "Ligurian",
"example": "Sta chì a l'é unna fraxe.",
"has_examples": true
},
{
"code": "hy",
"name": "Armenian",
"has_examples": true
},
{
"code": "gu",
"name": "Gujarati",
"has_examples": true
},
{
"code": "ml",
"name": "Malayalam",
"has_examples": true
},
{
"code": "xx",
"name": "Multi-language",
"models": ["xx_ent_wiki_sm"],
"example": "This is a sentence about Facebook."
},
{ "code": "yo", "name": "Yoruba", "has_examples": true },
{
"code": "zh",
"name": "Chinese",
"models": ["zh_core_web_sm", "zh_core_web_md", "zh_core_web_lg"],
"dependencies": [
{
"name": "Jieba",
"url": "https://github.com/fxsjy/jieba"
},
{
"name": "PKUSeg",
"url": "https://github.com/lancopku/PKUSeg-python"
}
],
"has_examples": true
}
],
"licenses": [

@@ -1,4 +1,4 @@
import React from 'react'
import React, { Fragment } from 'react'
import PropTypes from 'prop-types'
import classNames from 'classnames'

@@ -14,30 +14,34 @@ export default function Infobox({
className,
children,
}) {
const Wrapper = id ? 'div' : Fragment
const infoboxClassNames = classNames(classes.root, className, {
[classes.list]: !!list,
[classes.warning]: variant === 'warning',
[classes.danger]: variant === 'danger',
})
return (
<aside className={infoboxClassNames} id={id}>
{title && (
<h4 className={classes.title}>
{variant !== 'default' && !emoji && (
<Icon width={18} name={variant} inline className={classes.icon} />
)}
<span className={classes.titleText}>
{emoji && (
<span className={classes.emoji} aria-hidden="true">
{emoji}
</span>
<Wrapper>
{id && <a id={id} />}
<aside className={infoboxClassNames}>
{title && (
<h4 className={classes.title}>
{variant !== 'default' && !emoji && (
<Icon width={18} name={variant} inline className={classes.icon} />
)}
{title}
</span>
</h4>
)}
{children}
</aside>
<span className={classes.titleText}>
{emoji && (
<span className={classes.emoji} aria-hidden="true">
{emoji}
</span>
)}
{title}
</span>
</h4>
)}
{children}
</aside>
</Wrapper>
)
}

@@ -12,7 +12,6 @@ import Tag from '../components/tag'
import { H2, Label } from '../components/typography'
import Icon from '../components/icon'
import Link from '../components/link'
import Grid from '../components/grid'
import Infobox from '../components/infobox'
import Accordion from '../components/accordion'
import { join, arrayToObj, abbrNum, markdownToReact } from '../components/util'

@@ -31,10 +30,16 @@ const MODEL_META = {
wiki: 'Wikipedia',
uas: 'Unlabelled dependencies',
las: 'Labelled dependencies',
token_acc: 'Tokenization',
tok: 'Tokenization',
tags_acc: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_f: 'Entities (F-score)',
ents_p: 'Entities (precision)',
ents_r: 'Entities (recall)',
tag: 'Part-of-speech tags (fine grained tags, Token.tag)',
ents_f: 'Named entities (F-score)',
ents_p: 'Named entities (precision)',
ents_r: 'Named entities (recall)',
sent_f: 'Sentence segmentation (F-score)',
sent_p: 'Sentence segmentation (precision)',
sent_r: 'Sentence segmentation (recall)',
cpu: 'words per second on CPU',
gpu: 'words per second on GPU',
pipeline: 'Active processing pipeline components in order',

@@ -83,25 +88,19 @@ function formatVectors(data) {
}

function formatAccuracy(data) {
if (!data) return null
const labels = {
las: 'LAS',
uas: 'UAS',
tags_acc: 'TAG',
ents_f: 'NER F',
ents_p: 'NER P',
ents_r: 'NER R',
}
const isSyntax = key => ['tags_acc', 'las', 'uas'].includes(key)
const isNer = key => key.startsWith('ents_')
if (!data) return []
return Object.keys(data)
.filter(key => labels[key])
.map(key => ({
label: labels[key],
value: data[key].toFixed(2),
help: MODEL_META[key],
type: isNer(key) ? 'ner' : isSyntax(key) ? 'syntax' : null,
}))
.map(label => {
const value = data[label]
return isNaN(value)
? null
: {
label,
value: value.toFixed(2),
help: MODEL_META[label],
}
})
.filter(item => item)
}

function formatModelMeta(data) {

@@ -188,16 +187,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
{ label: 'Author', content: author },
{ label: 'License', content: license },
]
const accuracy = [
{
label: 'Syntax Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'syntax') : null,
},
{
label: 'NER Accuracy',
items: meta.accuracy ? meta.accuracy.filter(a => a.type === 'ner') : null,
},
]

const error = (
<Infobox title="Unable to load model details from GitHub" variant="danger">

@@ -209,7 +198,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
</p>
</Infobox>
)

return (
<Section id={name}>
<H2

@@ -254,33 +242,6 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
)}
</tbody>
</Table>
<Grid cols={2} gutterBottom={hasInteractiveCode || !!labels}>
{accuracy &&
accuracy.map(({ label, items }, i) =>
!items ? null : (
<Table fixed key={i}>
<thead>
<Tr>
<Th colSpan={2}>{label}</Th>
</Tr>
</thead>
<tbody>
{items.map((item, i) => (
<Tr key={i}>
<Td>
<Label>
{item.label}{' '}
{item.help && <Help>{item.help}</Help>}
</Label>
</Td>
<Td num>{item.value}</Td>
</Tr>
))}
</tbody>
</Table>
)
)}
</Grid>
{meta.notes && markdownToReact(meta.notes, MARKDOWN_COMPONENTS)}
{hasInteractiveCode && (
<CodeBlock title="Try out the model" lang="python" executable={true}>

@@ -288,7 +249,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
`import spacy`,
`from spacy.lang.${langId}.examples import sentences `,
``,
`nlp = spacy.load('${name}')`,
`nlp = spacy.load("${name}")`,
`doc = nlp(sentences[0])`,
`print(doc.text)`,
`for token in doc:`,

@@ -296,6 +257,25 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
].join('\n')}
</CodeBlock>
)}
{meta.accuracy && (
<Accordion id={`${name}-accuracy`} title="Accuracy Evaluation">
<Table>
<tbody>
{meta.accuracy.map(({ label, value, help }) => (
<Tr key={`${name}-${label}`}>
<Td nowrap>
<InlineCode>{label.toUpperCase()}</InlineCode>
</Td>
<Td>{help}</Td>
<Td num style={{ textAlign: 'right' }}>
{value}
</Td>
</Tr>
))}
</tbody>
</Table>
</Accordion>
)}
{labels && (
<Accordion id={`${name}-labels`} title="Label Scheme">
<p>

@@ -313,7 +293,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const labelNames = labels[pipe] || []
const help = LABEL_SCHEME_META[pipe]
return (
<Tr key={pipe} evenodd={false} key={pipe}>
<Tr key={`${name}-${pipe}`} evenodd={false} key={pipe}>
<Td style={{ width: '20%' }}>
<Label>
{pipe} {help && <Help>{help}</Help>}

@@ -343,7 +323,7 @@ const Model = ({ name, langId, langName, baseUrl, repo, compatibility, hasExampl
const Models = ({ pageContext, repo, children }) => {
const [initialized, setInitialized] = useState(false)
const [compatibility, setCompatibility] = useState({})
const { id, title, meta } = pageContext
const { id, title, meta, hasExamples } = pageContext
const { models, isStarters } = meta
const baseUrl = `https://raw.githubusercontent.com/${repo}/master`

@@ -360,7 +340,6 @@ const Models = ({ pageContext, repo, children }) => {

const modelTitle = title
const modelTeaser = `Available trained pipelines for ${title}`

const starterTitle = `${title} starters`
const starterTeaser = `Available transfer learning starter packs for ${title}`

@@ -392,6 +371,7 @@ const Models = ({ pageContext, repo, children }) => {
baseUrl={baseUrl}
repo={repo}
licenses={arrayToObj(site.siteMetadata.licenses, 'id')}
hasExamples={meta.hasExamples}
/>
))
}

@@ -297,7 +297,7 @@ const Landing = ({ data }) => {
to run.
</p>
<p>
<Button to="/usage/facts-figures#benchmarks">See details</Button>
<Button to="/usage/facts-figures#benchmarks">More results</Button>
</p>
</LandingCol>

@@ -22,7 +22,7 @@ const Language = ({ name, code, models }) => (
<Td>
{models && models.length ? (
<Link to={`/models/${code}`}>
{models.length} {models.length === 1 ? 'model' : 'models'}
{models.length} {models.length === 1 ? 'package' : 'packages'}
</Link>
) : (
<em>none yet</em>

@@ -51,7 +51,7 @@ const Languages = () => (
<Th>Language</Th>
<Th>Code</Th>
<Th>Language Data</Th>
<Th>Models</Th>
<Th>Pipelines</Th>
</Tr>
</thead>
<tbody>

@@ -16,7 +16,8 @@ export default function Project({
}) {
const repoArg = repo ? ` --repo ${repo}` : ''
const text = `${COMMAND} ${id}${repoArg}`
const url = `${repo || projectsRepo}/${id}`
const defaultRepo = `https://github.com/${projectsRepo}`
const url = `${repo || defaultRepo}/${id}`
const header = (
<>
{title}:{' '}