Merge branch 'develop' into nightly.spacy.io

Ines Montani 2020-09-12 17:55:45 +02:00
commit 472b9b4fa3
180 changed files with 2008 additions and 1902 deletions

View File

@@ -1,7 +1,7 @@
 SHELL := /bin/bash
 ifndef SPACY_EXTRAS
-override SPACY_EXTRAS = spacy-lookups-data jieba pkuseg==0.0.25 sudachipy sudachidict_core
+override SPACY_EXTRAS = spacy-lookups-data==0.4.0.dev0 jieba pkuseg==0.0.25 sudachipy sudachidict_core
 endif
 ifndef PYVER

View File

@@ -1,7 +1,7 @@
 from pathlib import Path
 import plac
 import spacy
-from spacy.gold import docs_to_json
+from spacy.training import docs_to_json
 import srsly
 import sys
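Note: the spacy.gold package is renamed to spacy.training throughout this commit. A minimal usage sketch of the renamed helper (assumes spacy-nightly v3 is installed; the example text is illustrative):

    import spacy
    from spacy.training import docs_to_json

    nlp = spacy.blank("en")
    doc = nlp.make_doc("This is a sentence.")
    json_data = docs_to_json([doc])  # JSON-serializable training data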

View File

@@ -31,10 +31,13 @@ lang = "en"
 vectors = null
 [nlp.pipeline.ner]
-factory = "simple_ner"
+factory = "ner"
 [nlp.pipeline.ner.model]
-@architectures = "spacy.BiluoTagger.v1"
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 6
+hidden_width = 64
+maxout_pieces = 2
 [nlp.pipeline.ner.model.tok2vec]
 @architectures = "spacy.HashEmbedCNN.v1"

View File

@@ -6,7 +6,7 @@ requires = [
 "cymem>=2.0.2,<2.1.0",
 "preshed>=3.0.2,<3.1.0",
 "murmurhash>=0.28.0,<1.1.0",
-"thinc>=8.0.0a30,<8.0.0a40",
+"thinc>=8.0.0a31,<8.0.0a40",
 "blis>=0.4.0,<0.5.0",
 "pytokenizations",
 "pathy"

View File

@@ -1,7 +1,7 @@
 # Our libraries
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 ml_datasets>=0.1.1
 murmurhash>=0.28.0,<1.1.0

View File

@@ -34,13 +34,13 @@ setup_requires =
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
 murmurhash>=0.28.0,<1.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 install_requires =
 # Our libraries
 murmurhash>=0.28.0,<1.1.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.0a30,<8.0.0a40
+thinc>=8.0.0a31,<8.0.0a40
 blis>=0.4.0,<0.5.0
 wasabi>=0.8.0,<1.1.0
 srsly>=2.1.0,<3.0.0
@@ -64,7 +64,7 @@ console_scripts =
 [options.extras_require]
 lookups =
-spacy_lookups_data>=0.3.2,<0.4.0
+spacy_lookups_data==0.4.0.dev0
 cuda =
 cupy>=5.0.0b4,<9.0.0
 cuda80 =

View File

@@ -23,7 +23,7 @@ Options.docstrings = True
 PACKAGES = find_packages()
 MOD_NAMES = [
-    "spacy.gold.example",
+    "spacy.training.example",
     "spacy.parts_of_speech",
     "spacy.strings",
     "spacy.lexeme",
@@ -48,7 +48,7 @@ MOD_NAMES = [
     "spacy.pipeline._parser_internals.stateclass",
     "spacy.pipeline._parser_internals.transition_system",
     "spacy.tokenizer",
-    "spacy.gold.gold_io",
+    "spacy.training.gold_io",
     "spacy.tokens.doc",
     "spacy.tokens.span",
     "spacy.tokens.token",

View File

@@ -1,7 +1,8 @@
 # fmt: off
 __title__ = "spacy-nightly"
-__version__ = "3.0.0a14"
+__version__ = "3.0.0a16"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
-__projects__ = "https://github.com/explosion/spacy-boilerplates"
+__projects__ = "https://github.com/explosion/projects"
+__projects_branch__ = "v3"

View File

@@ -1,4 +1,4 @@
-from typing import Dict, Any, Union, List, Optional, TYPE_CHECKING
+from typing import Dict, Any, Union, List, Optional, Tuple, TYPE_CHECKING
 import sys
 import shutil
 from pathlib import Path
@@ -6,6 +6,7 @@ from wasabi import msg
 import srsly
 import hashlib
 import typer
+import subprocess
 from click import NoSuchOption
 from typer.main import get_command
 from contextlib import contextmanager
@@ -13,7 +14,7 @@ from thinc.config import Config, ConfigValidationError
 from configparser import InterpolationError
 from ..schemas import ProjectConfigSchema, validate
-from ..util import import_file, run_command, make_tempdir
+from ..util import import_file, run_command, make_tempdir, registry
 if TYPE_CHECKING:
     from pathy import Pathy  # noqa: F401
@@ -54,6 +55,8 @@ app.add_typer(init_cli)
 def setup_cli() -> None:
+    # Make sure the entry-point for CLI runs, so that they get imported.
+    registry.cli.get_all()
     # Ensure that the help messages always display the correct prompt
     command = get_command(app)
     command(prog_name=COMMAND)
@@ -318,33 +321,87 @@ def git_sparse_checkout(repo: str, subpath: str, dest: Path, *, branch: str = "m
     # *that* we can do by path.
     # We're using Git and sparse checkout to only clone the files we need
     with make_tempdir() as tmp_dir:
+        git_version = get_git_version()
+        supports_sparse = git_version >= (2, 22)
         # This is the "clone, but don't download anything" part.
-        cmd = (
-            f"git clone {repo} {tmp_dir} --no-checkout --depth 1 "
-            f"--filter=blob:none "  # <-- The key bit
-            f"-b {branch}"
-        )
-        run_command(cmd, capture=True)
+        cmd = f"git clone {repo} {tmp_dir} --no-checkout --depth 1 " f"-b {branch} "
+        if supports_sparse:
+            cmd += f"--filter=blob:none"  # <-- The key bit
+        else:
+            msg.warn(
+                f"You're running an old version of Git (v{git_version[0]}.{git_version[1]}) "
+                f"that doesn't fully support sparse checkout yet. This means that "
+                f"more files than necessary may be downloaded temporarily. To "
+                f"only download the files needed, upgrade to Git v2.22 or above."
+            )
+        _attempt_run_command(cmd)
         # Now we need to find the missing filenames for the subpath we want.
         # Looking for this 'rev-list' command in the git --help? Hah.
-        cmd = f"git -C {tmp_dir} rev-list --objects --all --missing=print -- {subpath}"
-        ret = run_command(cmd, capture=True)
-        repo = _from_http_to_git(repo)
+        cmd = f"git -C {tmp_dir} rev-list --objects --all {'--missing=print ' if supports_sparse else ''} -- {subpath}"
+        ret = _attempt_run_command(cmd)
+        git_repo = _from_http_to_git(repo)
         # Now pass those missings into another bit of git internals
         missings = " ".join([x[1:] for x in ret.stdout.split() if x.startswith("?")])
-        cmd = f"git -C {tmp_dir} fetch-pack {repo} {missings}"
-        run_command(cmd, capture=True)
+        if supports_sparse and not missings:
+            err = (
+                f"Could not find any relevant files for '{subpath}'. "
+                f"Did you specify a correct and complete path within repo '{repo}' "
+                f"and branch {branch}?"
+            )
+            msg.fail(err, exits=1)
+        if supports_sparse:
+            cmd = f"git -C {tmp_dir} fetch-pack {git_repo} {missings}"
+            _attempt_run_command(cmd)
         # And finally, we can checkout our subpath
         cmd = f"git -C {tmp_dir} checkout {branch} {subpath}"
-        run_command(cmd)
+        _attempt_run_command(cmd)
         # We need Path(name) to make sure we also support subdirectories
         shutil.move(str(tmp_dir / Path(subpath)), str(dest))

-def _from_http_to_git(repo):
+def get_git_version() -> Tuple[int, int]:
+    ret = _attempt_run_command(["git", "--version"])
+    # TODO: this seems kinda brittle?
+    version = ret.stdout[11:].strip().split(".")
+    return (int(version[0]), int(version[1]))
+
+
+def _attempt_run_command(cmd: Union[str, List[str]]):
+    try:
+        return run_command(cmd, capture=True)
+    except subprocess.CalledProcessError as e:
+        err = f"Could not run command"
+        msg.fail(err)
+        print(cmd)
+        sys.exit(1)
+
+
+def _from_http_to_git(repo: str) -> str:
     if repo.startswith("http://"):
         repo = repo.replace(r"http://", r"https://")
     if repo.startswith(r"https://"):
         repo = repo.replace("https://", "git@").replace("/", ":", 1)
+    if repo.endswith("/"):
+        repo = repo[:-1]
     repo = f"{repo}.git"
     return repo
+
+
+def string_to_list(value, intify=False):
+    """Parse a comma-separated string to a list"""
+    if not value:
+        return []
+    if value.startswith("[") and value.endswith("]"):
+        value = value[1:-1]
+    result = []
+    for p in value.split(","):
+        p = p.strip()
+        if p.startswith("'") and p.endswith("'"):
+            p = p[1:-1]
+        if p.startswith('"') and p.endswith('"'):
+            p = p[1:-1]
+        p = p.strip()
+        if intify:
+            p = int(p)
+        result.append(p)
+    return result
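A quick usage sketch of the new string_to_list helper (inputs are illustrative):

    string_to_list("tagger,parser,ner")       # ["tagger", "parser", "ner"]
    string_to_list("[0, 1, 2]", intify=True)  # [0, 1, 2]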

View File

@@ -7,9 +7,9 @@ import re
 import sys
 from ._util import app, Arg, Opt
-from ..gold import docs_to_json
+from ..training import docs_to_json
 from ..tokens import DocBin
-from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
+from ..training.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
 # Converters are matched by file extension except for ner/iob, which are

View File

@@ -8,7 +8,7 @@ import typer
 from ._util import app, Arg, Opt, show_validation_error, parse_config_overrides
 from ._util import import_code, debug_cli, get_sourced_components
-from ..gold import Corpus, Example
+from ..training import Corpus, Example
 from ..pipeline._parser_internals import nonproj
 from ..language import Language
 from .. import util

View File

@@ -5,7 +5,7 @@ from thinc.api import require_gpu, fix_random_seed, set_dropout_rate, Adam
 from thinc.api import Model, data_validation
 import typer
-from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides
+from ._util import Arg, Opt, debug_cli, show_validation_error, parse_config_overrides, string_to_list
 from .. import util
@@ -38,12 +38,13 @@ def debug_model_cli(
         require_gpu(use_gpu)
     else:
         msg.info("Using CPU")
+    layers = string_to_list(layers, intify=True)
     print_settings = {
         "dimensions": dimensions,
         "parameters": parameters,
         "gradients": gradients,
         "attributes": attributes,
-        "layers": [int(x.strip()) for x in layers.split(",")] if layers else [],
+        "layers": layers,
         "print_before_training": P0,
         "print_after_init": P1,
         "print_after_training": P2,
@@ -84,11 +85,11 @@ def debug_model(model: Model, *, print_settings: Optional[Dict[str, Any]] = None
        _print_model(model, print_settings)
    # STEP 1: Initializing the model and printing again
+    X = _get_docs()
     Y = _get_output(model.ops.xp)
-    _set_output_dim(nO=Y.shape[-1], model=model)
     # The output vector might differ from the official type of the output layer
     with data_validation(False):
-        model.initialize(X=_get_docs(), Y=Y)
+        model.initialize(X=X, Y=Y)
     if print_settings.get("print_after_init"):
         msg.divider(f"STEP 1 - after initialization")
         _print_model(model, print_settings)
@@ -135,15 +136,6 @@ def _get_output(xp):
     return xp.asarray([i + 10 for i, _ in enumerate(_get_docs())], dtype="float32")

-def _set_output_dim(model, nO):
-    # the dim inference doesn't always work 100%, we need this hack like we have it in pipe.pyx
-    if model.has_dim("nO") is None:
-        model.set_dim("nO", nO)
-    if model.has_ref("output_layer"):
-        if model.get_ref("output_layer").has_dim("nO") is None:
-            model.get_ref("output_layer").set_dim("nO", nO)

 def _print_model(model, print_settings):
     layers = print_settings.get("layers", "")
     parameters = print_settings.get("parameters", False)

View File

@@ -5,7 +5,7 @@ import re
 import srsly
 from thinc.api import require_gpu, fix_random_seed
-from ..gold import Corpus
+from ..training import Corpus
 from ..tokens import Doc
 from ._util import app, Arg, Opt
 from ..scorer import Scorer

View File

@@ -9,7 +9,7 @@ import re
 from .. import util
 from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
 from ..schemas import RecommendationSchema
-from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
+from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND, string_to_list
 ROOT = Path(__file__).parent / "templates"
@@ -42,7 +42,7 @@ def init_config_cli(
     """
     if isinstance(optimize, Optimizations):  # instance of enum from the CLI
         optimize = optimize.value
-    pipeline = [p.strip() for p in pipeline.split(",")]
+    pipeline = string_to_list(pipeline)
     init_config(output_file, lang=lang, pipeline=pipeline, optimize=optimize, cpu=cpu)
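Since the value now goes through string_to_list, --pipeline accepts both a plain comma-separated string and a bracketed list. An illustrative invocation (the output file name is a placeholder):

    python -m spacy init config config.cfg --lang en --pipeline "tagger,parser,ner"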

View File

@@ -256,6 +256,7 @@ def add_vectors(
 def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     f = open_file(vectors_loc)
+    f = ensure_shape(f)
     shape = tuple(int(size) for size in next(f).split())
     if truncate_vectors >= 1:
         shape = (truncate_vectors, shape[1])
@@ -274,6 +275,31 @@ def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
     return vectors_data, vectors_keys

+def ensure_shape(lines):
+    """Ensure that the first line of the data is the vectors shape.
+    If it's not, we read in the data and output the shape as the first result,
+    so that the reader doesn't have to deal with the problem.
+    """
+    first_line = next(lines)
+    try:
+        shape = tuple(int(size) for size in first_line.split())
+    except ValueError:
+        shape = None
+    if shape is not None:
+        # All good, give the data
+        yield first_line
+        yield from lines
+    else:
+        # Figure out the shape, make it the first value, and then give the
+        # rest of the data.
+        width = len(first_line.split()) - 1
+        captured = [first_line] + list(lines)
+        length = len(captured)
+        yield f"{length} {width}"
+        yield from captured

 def read_freqs(
     freqs_loc: Path, max_length: int = 100, min_doc_freq: int = 5, min_freq: int = 50
 ):
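A hedged sketch of what the new ensure_shape generator does for a vectors file that has no header line (the toy vectors are illustrative):

    lines = iter(["king 0.1 0.2 0.3", "queen 0.2 0.1 0.4"])
    fixed = ensure_shape(lines)
    print(next(fixed))  # "2 3", the inferred rows/width header
    print(list(fixed))  # the original two vector lines, unchanged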

View File

@@ -18,6 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
     create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     no_sdist: bool = Opt(False, "--no-sdist", "-NS", help="Don't build .tar.gz sdist, can be set if you want to run this step manually"),
     force: bool = Opt(False, "--force", "-f", "-F", help="Force overwriting existing data in output directory"),
@@ -38,6 +39,7 @@ def package_cli(
         input_dir,
         output_dir,
         meta_path=meta_path,
+        name=name,
         version=version,
         create_meta=create_meta,
         create_sdist=not no_sdist,
@@ -50,6 +52,7 @@ def package(
     input_dir: Path,
     output_dir: Path,
     meta_path: Optional[Path] = None,
+    name: Optional[str] = None,
     version: Optional[str] = None,
     create_meta: bool = False,
     create_sdist: bool = True,
@@ -71,6 +74,8 @@ def package(
         msg.fail("Can't load pipeline meta.json", meta_path, exits=1)
     meta = srsly.read_json(meta_path)
     meta = get_meta(input_dir, meta)
+    if name is not None:
+        meta["name"] = name
     if version is not None:
         meta["version"] = version
     if not create_meta:  # only print if user doesn't want to overwrite
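An illustrative invocation of the new --name override (paths, names and versions are placeholders):

    python -m spacy package ./training/model-best ./packages --name my_pipeline --version 0.0.1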

View File

@@ -38,16 +38,21 @@ def project_assets(project_dir: Path) -> None:
         msg.warn(f"No assets specified in {PROJECT_FILE}", exits=0)
     msg.info(f"Fetching {len(assets)} asset(s)")
     for asset in assets:
-        dest = Path(asset["dest"])
+        dest = (project_dir / asset["dest"]).resolve()
         checksum = asset.get("checksum")
         if "git" in asset:
             if dest.exists():
                 # If there's already a file, check for checksum
                 if checksum and checksum == get_checksum(dest):
-                    msg.good(f"Skipping download with matching checksum: {dest}")
+                    msg.good(
+                        f"Skipping download with matching checksum: {asset['dest']}"
+                    )
                     continue
                 else:
+                    if dest.is_dir():
                         shutil.rmtree(dest)
+                    else:
+                        dest.unlink()
             git_sparse_checkout(
                 asset["git"]["repo"],
                 asset["git"]["path"],
@@ -67,14 +72,16 @@ def check_private_asset(dest: Path, checksum: Optional[str] = None) -> None:
     """Check and validate assets without a URL (private assets that the user
     has to provide themselves) and give feedback about the checksum.
-    dest (Path): Desintation path of the asset.
+    dest (Path): Destination path of the asset.
     checksum (Optional[str]): Optional checksum of the expected file.
     """
     if not Path(dest).exists():
         err = f"No URL provided for asset. You need to add this file yourself: {dest}"
         msg.warn(err)
     else:
-        if checksum and checksum == get_checksum(dest):
+        if not checksum:
+            msg.good(f"Asset already exists: {dest}")
+        elif checksum == get_checksum(dest):
             msg.good(f"Asset exists with matching checksum: {dest}")
         else:
             msg.fail(f"Asset available but with incorrect checksum: {dest}")

View File

@@ -16,6 +16,7 @@ def project_clone_cli(
     name: str = Arg(..., help="The name of the template to clone"),
     dest: Optional[Path] = Arg(None, help="Where to clone the project. Defaults to current working directory", exists=False),
     repo: str = Opt(about.__projects__, "--repo", "-r", help="The repository to clone from"),
+    branch: str = Opt(about.__projects_branch__, "--branch", "-b", help="The branch to clone from")
     # fmt: on
 ):
     """Clone a project template from a repository. Calls into "git" and will
@@ -26,23 +27,30 @@ def project_clone_cli(
     DOCS: https://nightly.spacy.io/api/cli#project-clone
     """
     if dest is None:
-        dest = Path.cwd() / name
-    project_clone(name, dest, repo=repo)
+        dest = Path.cwd() / Path(name).parts[-1]
+    project_clone(name, dest, repo=repo, branch=branch)

-def project_clone(name: str, dest: Path, *, repo: str = about.__projects__) -> None:
+def project_clone(
+    name: str,
+    dest: Path,
+    *,
+    repo: str = about.__projects__,
+    branch: str = about.__projects_branch__,
+) -> None:
     """Clone a project template from a repository.
     name (str): Name of subdirectory to clone.
     dest (Path): Destination path of cloned project.
     repo (str): URL of Git repo containing project templates.
+    branch (str): The branch to clone from
     """
     dest = ensure_path(dest)
     check_clone(name, dest, repo)
     project_dir = dest.resolve()
     repo_name = re.sub(r"(http(s?)):\/\/github.com/", "", repo)
     try:
-        git_sparse_checkout(repo, name, dest)
+        git_sparse_checkout(repo, name, dest, branch=branch)
     except subprocess.CalledProcessError:
         err = f"Could not clone '{name}' from repo '{repo_name}'"
         msg.fail(err, exits=1)
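With the new --branch option and the Path(name).parts[-1] default, templates that live in subdirectories of the repo clone into a directory named after the last path component. An illustrative invocation (the template path is a placeholder):

    python -m spacy project clone pipelines/tagger_parser_ud --branch v3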

View File

@@ -1,4 +1,5 @@
 from typing import Optional, Dict, Any, Tuple, Union, Callable, List
+from timeit import default_timer as timer
 import srsly
 import tqdm
 from pathlib import Path
@@ -15,7 +16,7 @@ from ._util import app, Arg, Opt, parse_config_overrides, show_validation_error
 from ._util import import_code, get_sourced_components
 from ..language import Language
 from .. import util
-from ..gold.example import Example
+from ..training.example import Example
 from ..errors import Errors
@@ -286,9 +287,12 @@ def train_while_improving(
     ]
     raw_batches = util.minibatch(raw_examples, size=8)

+    words_seen = 0
+    start_time = timer()
     for step, (epoch, batch) in enumerate(train_data):
         dropout = next(dropouts)
         for subbatch in subdivide_batch(batch, accumulate_gradient):
             nlp.update(
                 subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
             )
@@ -317,6 +321,7 @@
         else:
             score, other_scores = (None, None)
             is_best_checkpoint = None
+        words_seen += sum(len(eg) for eg in batch)
         info = {
             "epoch": epoch,
             "step": step,
@@ -324,6 +329,8 @@
             "other_scores": other_scores,
             "losses": losses,
             "checkpoints": results,
+            "seconds": int(timer() - start_time),
+            "words": words_seen,
         }
         yield batch, info, is_best_checkpoint
         if is_best_checkpoint is not None:
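The new "seconds" and "words" fields make it straightforward to derive throughput from the info dict, for example in a custom logger. A minimal sketch (the variable name is illustrative):

    words_per_second = info["words"] / max(info["seconds"], 1)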

View File

@@ -52,7 +52,7 @@ path = ${paths.train}
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length
-max_length = 2000
+max_length = 0
 # Limitation on number of training examples
 limit = 0
@@ -64,7 +64,7 @@ path = ${paths.dev}
 # data is passed in sentence-by-sentence via some prior preprocessing.
 gold_preproc = false
 # Limitations on training document length
-max_length = 2000
+max_length = 0
 # Limitation on number of training examples
 limit = 0
@@ -88,9 +88,4 @@ L2 = 0.01
 grad_clip = 1.0
 use_averages = false
 eps = 1e-8
+learn_rate = 0.001
-
-[training.optimizer.learn_rate]
-@schedules = "warmup_linear.v1"
-warmup_steps = 250
-total_steps = 20000
-initial_rate = 0.001
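The default config now uses a constant learn_rate instead of a warmup schedule. If the old behaviour is wanted, the removed block can be added back in a user config under the optimizer section, e.g.:

    [training.optimizer.learn_rate]
    @schedules = "warmup_linear.v1"
    warmup_steps = 250
    total_steps = 20000
    initial_rate = 0.001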

View File

@@ -66,7 +66,7 @@ class Warnings:
            "in problems with the vocab further on in the pipeline.")
    W030 = ("Some entities could not be aligned in the text \"{text}\" with "
            "entities \"{entities}\". Use "
-           "`spacy.gold.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
+           "`spacy.training.biluo_tags_from_offsets(nlp.make_doc(text), entities)`"
            " to check the alignment. Misaligned entities ('-') will be "
            "ignored during training.")
    W033 = ("Training a new {model} using a model with no lexeme normalization "
@@ -247,8 +247,8 @@ class Errors:
            "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
    E065 = ("Only one of the vector table's width and shape can be specified. "
            "Got width {width} and shape {shape}.")
-   E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
-           "an entity) without a preceding 'B' (beginning of an entity). "
+   E067 = ("Invalid BILUO tag sequence: Got a tag starting with {start} "
+           "without a preceding 'B' (beginning of an entity). "
            "Tag sequence:\n{tags}")
    E068 = ("Invalid BILUO tag: '{tag}'.")
    E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
@@ -320,10 +320,6 @@
            "So instead of pickling the span, pickle the Doc it belongs to or "
            "use Span.as_doc to convert the span to a standalone Doc object.")
    E115 = ("All subtokens must have associated heads.")
-   E116 = ("Cannot currently add labels to pretrained text classifier. Add "
-           "labels before training begins. This functionality was available "
-           "in previous versions, but had significant bugs that led to poor "
-           "performance.")
    E117 = ("The newly split tokens must match the text of the original token. "
            "New orths: {new}. Old text: {old}.")
    E118 = ("The custom extension attribute '{attr}' is not registered on the "
@@ -378,8 +374,9 @@
            "should be of equal length.")
    E141 = ("Entity vectors should be of length {required} instead of the "
            "provided {found}.")
-   E143 = ("Labels for component '{name}' not initialized. Did you forget to "
-           "call add_label()?")
+   E143 = ("Labels for component '{name}' not initialized. This can be fixed "
+           "by calling add_label, or by providing a representative batch of "
+           "examples to the component's begin_training method.")
    E145 = ("Error reading `{param}` from input file.")
    E146 = ("Could not access `{path}`.")
    E147 = ("Unexpected error in the {method} functionality of the "
@@ -483,6 +480,16 @@
    E201 = ("Span index out of range.")
    # TODO: fix numbering after merging develop into master
+   E921 = ("The method 'set_output' can only be called on components that have "
+           "a Model with a 'resize_output' attribute. Otherwise, the output "
+           "layer can not be dynamically changed.")
+   E922 = ("Component '{name}' has been initialized with an output dimension of "
+           "{nO} - cannot add any more labels.")
+   E923 = ("It looks like there is no proper sample data to initialize the "
+           "Model of component '{name}'. "
+           "This is likely a bug in spaCy, so feel free to open an issue.")
+   E924 = ("The '{name}' component does not seem to be initialized properly. "
+           "This is likely a bug in spaCy, so feel free to open an issue.")
    E925 = ("Invalid color values for displaCy visualizer: expected dictionary "
            "mapping label names to colors but got: {obj}")
    E926 = ("It looks like you're trying to modify nlp.{attr} directly. This "
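As the updated W030 message suggests, entity alignment can be checked with the renamed helper. A minimal sketch (text and offsets are illustrative):

    import spacy
    from spacy.training import biluo_tags_from_offsets

    nlp = spacy.blank("en")
    doc = nlp.make_doc("I like London.")
    tags = biluo_tags_from_offsets(doc, [(7, 13, "LOC")])
    # ['O', 'O', 'U-LOC', 'O']; misaligned entities would show up as '-'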

View File

@@ -17,7 +17,7 @@ from timeit import default_timer as timer
 from .tokens.underscore import Underscore
 from .vocab import Vocab, create_vocab
 from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
-from .gold import Example, validate_examples
+from .training import Example, validate_examples
 from .scorer import Scorer
 from .util import create_default_optimizer, registry, SimpleFrozenList
 from .util import SimpleFrozenDict, combine_score_weights, CONFIG_SECTION_ORDER
@@ -243,6 +243,7 @@ class Language:
         self._config["nlp"]["pipeline"] = list(self.component_names)
         self._config["nlp"]["disabled"] = list(self.disabled)
         self._config["components"] = pipeline
+        if not self._config["training"].get("score_weights"):
             self._config["training"]["score_weights"] = combine_score_weights(score_weights)
         if not srsly.is_json_serializable(self._config):
             raise ValueError(Errors.E961.format(config=self._config))
@@ -656,7 +657,7 @@ class Language:
         return resolved[factory_name]

     def create_pipe_from_source(
-        self, source_name: str, source: "Language", *, name: str,
+        self, source_name: str, source: "Language", *, name: str
     ) -> Tuple[Callable[[Doc], Doc], str]:
         """Create a pipeline component by copying it from an existing model.
@@ -1155,10 +1156,13 @@ class Language:
         DOCS: https://nightly.spacy.io/api/language#begin_training
         """
-        # TODO: throw warning when get_gold_tuples is provided instead of get_examples
         if get_examples is None:
-            get_examples = lambda: []
-        else:  # Populate vocab
+            util.logger.debug(
+                "No 'get_examples' callback provided to 'Language.begin_training', creating dummy examples"
+            )
+            doc = Doc(self.vocab, words=["x", "y", "z"])
+            get_examples = lambda: [Example.from_dict(doc, {})]
+        # Populate vocab
         if not hasattr(get_examples, "__call__"):
             err = Errors.E930.format(name="Language", obj=type(get_examples))
             raise ValueError(err)
@@ -1187,7 +1191,7 @@ class Language:
         return self._optimizer

     def resume_training(
-        self, *, sgd: Optional[Optimizer] = None, device: int = -1,
+        self, *, sgd: Optional[Optimizer] = None, device: int = -1
     ) -> Optimizer:
         """Continue training a pretrained model.

View File

@@ -1,105 +0,0 @@
-"""Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import Dict, Optional
-import numpy
-from thinc.api import Model
-from thinc.types import Padded, Floats3d
-
-
-def BILUO() -> Model[Padded, Padded]:
-    return Model(
-        "biluo",
-        forward,
-        init=init,
-        dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions},
-    )
-
-
-def init(model, X: Optional[Padded] = None, Y: Optional[Padded] = None):
-    if X is not None and Y is not None:
-        if X.data.shape != Y.data.shape:
-            # TODO: Fix error
-            raise ValueError("Mismatched shapes (TODO: Fix message)")
-        model.set_dim("nO", X.data.shape[2])
-    elif X is not None:
-        model.set_dim("nO", X.data.shape[2])
-    elif Y is not None:
-        model.set_dim("nO", Y.data.shape[2])
-    elif model.get_dim("nO") is None:
-        raise ValueError("Dimension unset for BILUO: nO")
-
-
-def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
-    n_labels = (model.get_dim("nO") - 1) // 4
-    n_tokens, n_docs, n_actions = Xp.data.shape
-    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
-    # to indicate which actions are valid next for each sequence. To construct
-    # the mask, we have a state of shape (2, n_actions) and a validity table of
-    # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
-    # whether it's the last token, the second dimension indicates the previous
-    # action, plus a special 'null action' for the first entry.
-    valid_transitions = model.ops.asarray(_get_transition_table(n_labels))
-    prev_actions = model.ops.alloc1i(n_docs)
-    # Initialize as though prev action was O
-    prev_actions.fill(n_actions - 1)
-    Y = model.ops.alloc3f(*Xp.data.shape)
-    masks = model.ops.alloc3f(*Y.shape)
-    max_value = Xp.data.max()
-    for t in range(Xp.data.shape[0]):
-        is_last = (Xp.lengths < (t + 2)).astype("i")
-        masks[t] = valid_transitions[is_last, prev_actions]
-        # Don't train the out-of-bounds sequences.
-        masks[t, Xp.size_at_t[t] :] = 0
-        # Valid actions get 0*10e8, invalid get large negative value
-        Y[t] = Xp.data[t] + ((masks[t] - 1) * max_value * 10)
-        prev_actions = Y[t].argmax(axis=-1)
-
-    def backprop_biluo(dY: Padded) -> Padded:
-        dY.data *= masks
-        return dY
-
-    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
-
-
-def get_num_actions(n_labels: int) -> int:
-    # One BEGIN action per label
-    # One IN action per label
-    # One LAST action per label
-    # One UNIT action per label
-    # One OUT action
-    return n_labels + n_labels + n_labels + n_labels + 1
-
-
-def _get_transition_table(
-    n_labels: int, *, _cache: Dict[int, Floats3d] = {}
-) -> Floats3d:
-    n_actions = get_num_actions(n_labels)
-    if n_actions in _cache:
-        return _cache[n_actions]
-    table = numpy.zeros((2, n_actions, n_actions), dtype="f")
-    B_start, B_end = (0, n_labels)
-    I_start, I_end = (B_end, B_end + n_labels)
-    L_start, L_end = (I_end, I_end + n_labels)
-    U_start, _ = (L_end, L_end + n_labels)  # noqa: F841
-    # Using ranges allows us to set specific cells, which is necessary to express
-    # that only actions of the same label are valid continuations.
-    B_range = numpy.arange(B_start, B_end)
-    I_range = numpy.arange(I_start, I_end)
-    L_range = numpy.arange(L_start, L_end)
-    # If this is the last token and the previous action was B or I, only L
-    # of that label is valid
-    table[1, B_range, L_range] = 1
-    table[1, I_range, L_range] = 1
-    # If this isn't the last token and the previous action was B or I, only I or
-    # L of that label are valid.
-    table[0, B_range, I_range] = 1
-    table[0, B_range, L_range] = 1
-    table[0, I_range, I_range] = 1
-    table[0, I_range, L_range] = 1
-    # If this isn't the last token and the previous was L, U or O, B is valid
-    table[0, L_start:, :B_end] = 1
-    # Regardless of whether this is the last token, if the previous action was
-    # {L, U, O}, U and O are valid.
-    table[:, L_start:, U_start:] = 1
-    _cache[n_actions] = table
-    return table

View File

@@ -1,90 +0,0 @@
-"""Thinc layer to do simpler transition-based parsing, NER, etc."""
-from typing import Dict, Optional
-from thinc.api import Ops, Model
-from thinc.types import Padded, Floats3d
-
-
-def IOB() -> Model[Padded, Padded]:
-    return Model(
-        "biluo",
-        forward,
-        init=init,
-        dims={"nO": None},
-        attrs={"get_num_actions": get_num_actions},
-    )
-
-
-def init(model: Model, X: Optional[Padded] = None, Y: Optional[Padded] = None) -> None:
-    if X is not None and Y is not None:
-        if X.data.shape != Y.data.shape:
-            # TODO: Fix error
-            raise ValueError("Mismatched shapes (TODO: Fix message)")
-        model.set_dim("nO", X.data.shape[2])
-    elif X is not None:
-        model.set_dim("nO", X.data.shape[2])
-    elif Y is not None:
-        model.set_dim("nO", Y.data.shape[2])
-    elif model.get_dim("nO") is None:
-        raise ValueError("Dimension unset for BILUO: nO")
-
-
-def forward(model: Model[Padded, Padded], Xp: Padded, is_train: bool):
-    n_labels = (model.get_dim("nO") - 1) // 2
-    n_tokens, n_docs, n_actions = Xp.data.shape
-    # At each timestep, we make a validity mask of shape (n_docs, n_actions)
-    # to indicate which actions are valid next for each sequence. To construct
-    # the mask, we have a state of shape (2, n_actions) and a validity table of
-    # shape (2, n_actions+1, n_actions). The first dimension of the state indicates
-    # whether it's the last token, the second dimension indicates the previous
-    # action, plus a special 'null action' for the first entry.
-    valid_transitions = _get_transition_table(model.ops, n_labels)
-    prev_actions = model.ops.alloc1i(n_docs)
-    # Initialize as though prev action was O
-    prev_actions.fill(n_actions - 1)
-    Y = model.ops.alloc3f(*Xp.data.shape)
-    masks = model.ops.alloc3f(*Y.shape)
-    for t in range(Xp.data.shape[0]):
-        masks[t] = valid_transitions[prev_actions]
-        # Don't train the out-of-bounds sequences.
-        masks[t, Xp.size_at_t[t] :] = 0
-        # Valid actions get 0*10e8, invalid get -1*10e8
-        Y[t] = Xp.data[t] + ((masks[t] - 1) * 10e8)
-        prev_actions = Y[t].argmax(axis=-1)
-
-    def backprop_biluo(dY: Padded) -> Padded:
-        # Masking the gradient seems to do poorly here. But why?
-        # dY.data *= masks
-        return dY
-
-    return Padded(Y, Xp.size_at_t, Xp.lengths, Xp.indices), backprop_biluo
-
-
-def get_num_actions(n_labels: int) -> int:
-    # One BEGIN action per label
-    # One IN action per label
-    # One LAST action per label
-    # One UNIT action per label
-    # One OUT action
-    return n_labels * 2 + 1
-
-
-def _get_transition_table(
-    ops: Ops, n_labels: int, _cache: Dict[int, Floats3d] = {}
-) -> Floats3d:
-    n_actions = get_num_actions(n_labels)
-    if n_actions in _cache:
-        return ops.asarray(_cache[n_actions])
-    table = ops.alloc2f(n_actions, n_actions)
-    B_start, B_end = (0, n_labels)
-    I_start, I_end = (B_end, B_end + n_labels)
-    O_action = I_end
-    B_range = ops.xp.arange(B_start, B_end)
-    I_range = ops.xp.arange(I_start, I_end)
-    # B and O are always valid
-    table[:, B_start:B_end] = 1
-    table[:, O_action] = 1
-    # I can only follow a matching B
-    table[B_range, I_range] = 1
-    _cache[n_actions] = table
-    return table

View File

@@ -1,6 +1,5 @@
 from .entity_linker import *  # noqa
 from .parser import *  # noqa
-from .simple_ner import *  # noqa
 from .tagger import *  # noqa
 from .textcat import *  # noqa
 from .tok2vec import *  # noqa

View File

@@ -1,104 +0,0 @@
-from typing import List
-from thinc.api import Model, Linear, with_array, softmax_activation, padded2list
-from thinc.api import chain, list2padded, configure_normal_init
-from thinc.api import Dropout
-from thinc.types import Floats2d
-
-from ...tokens import Doc
-from .._biluo import BILUO
-from .._iob import IOB
-from ...util import registry
-
-
-@registry.architectures.register("spacy.BILUOTagger.v1")
-def BiluoTagger(
-    tok2vec: Model[List[Doc], List[Floats2d]]
-) -> Model[List[Doc], List[Floats2d]]:
-    """Construct a simple NER tagger, that predicts BILUO tag scores for each
-    token and uses greedy decoding with transition-constraints to return a valid
-    BILUO tag sequence.
-
-    A BILUO tag sequence encodes a sequence of non-overlapping labelled spans
-    into tags assigned to each token. The first token of a span is given the
-    tag B-LABEL, the last token of the span is given the tag L-LABEL, and tokens
-    within the span are given the tag U-LABEL. Single-token spans are given
-    the tag U-LABEL. All other tokens are assigned the tag O.
-
-    The BILUO tag scheme generally results in better linear separation between
-    classes, especially for non-CRF models, because there are more distinct classes
-    for the different situations (Ratinov et al., 2009).
-    """
-    biluo = BILUO()
-    linear = Linear(
-        nO=None, nI=tok2vec.get_dim("nO"), init_W=configure_normal_init(mean=0.02)
-    )
-    model = chain(
-        tok2vec,
-        list2padded(),
-        with_array(chain(Dropout(0.1), linear)),
-        biluo,
-        with_array(softmax_activation()),
-        padded2list(),
-    )
-    return Model(
-        "biluo-tagger",
-        forward,
-        init=init,
-        layers=[model, linear],
-        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
-        dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
-    )
-
-
-@registry.architectures.register("spacy.IOBTagger.v1")
-def IOBTagger(
-    tok2vec: Model[List[Doc], List[Floats2d]]
-) -> Model[List[Doc], List[Floats2d]]:
-    """Construct a simple NER tagger, that predicts IOB tag scores for each
-    token and uses greedy decoding with transition-constraints to return a valid
-    IOB tag sequence.
-
-    An IOB tag sequence encodes a sequence of non-overlapping labelled spans
-    into tags assigned to each token. The first token of a span is given the
-    tag B-LABEL, and subsequent tokens are given the tag I-LABEL.
-    All other tokens are assigned the tag O.
-    """
-    biluo = IOB()
-    linear = Linear(nO=None, nI=tok2vec.get_dim("nO"))
-    model = chain(
-        tok2vec,
-        list2padded(),
-        with_array(linear),
-        biluo,
-        with_array(softmax_activation()),
-        padded2list(),
-    )
-    return Model(
-        "iob-tagger",
-        forward,
-        init=init,
-        layers=[model],
-        refs={"tok2vec": tok2vec, "linear": linear, "biluo": biluo},
-        dims={"nO": None},
-        attrs={"get_num_actions": biluo.attrs["get_num_actions"]},
-    )
-
-
-def init(model: Model[List[Doc], List[Floats2d]], X=None, Y=None) -> None:
-    if model.get_dim("nO") is None and Y:
-        model.set_dim("nO", Y[0].shape[1])
-    nO = model.get_dim("nO")
-    biluo = model.get_ref("biluo")
-    linear = model.get_ref("linear")
-    biluo.set_dim("nO", nO)
-    if linear.has_dim("nO") is None:
-        linear.set_dim("nO", nO)
-    model.layers[0].initialize(X=X, Y=Y)
-
-
-def forward(model: Model, X: List[Doc], is_train: bool):
-    return model.layers[0](X, is_train)
-
-
-__all__ = ["BiluoTagger"]

View File

@@ -165,7 +165,7 @@ def MultiHashEmbed(
 @registry.architectures.register("spacy.CharacterEmbed.v1")
 def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
-    """Construct an embedded representations based on character embeddings, using
+    """Construct an embedded representation based on character embeddings, using
     a feed-forward network. A fixed number of UTF-8 byte characters are used for
     each word, taken from the beginning and end of the word equally. Padding is
     used in the centre for words that are too short.
@@ -176,8 +176,8 @@ def CharacterEmbed(width: int, rows: int, nM: int, nC: int):
     ensures that the final character is always in the last position, instead
     of being in an arbitrary position depending on the word length.
-    The characters are embedded in a embedding table with 256 rows, and the
-    vectors concatenated. A hash-embedded vector of the NORM of the word is
+    The characters are embedded in a embedding table with a given number of rows,
+    and the vectors concatenated. A hash-embedded vector of the NORM of the word is
     also concatenated on, and the result is then passed through a feed-forward
     network to construct a single vector to represent the information.

View File

@@ -8,7 +8,6 @@ from .morphologizer import Morphologizer
 from .pipe import Pipe
 from .senter import SentenceRecognizer
 from .sentencizer import Sentencizer
-from .simple_ner import SimpleNER
 from .tagger import Tagger
 from .textcat import TextCategorizer
 from .tok2vec import Tok2Vec
@@ -25,7 +24,6 @@ __all__ = [
     "Pipe",
     "SentenceRecognizer",
     "Sentencizer",
-    "SimpleNER",
     "Tagger",
     "TextCategorizer",
     "Tok2Vec",

View File

@@ -8,7 +8,7 @@ from ...typedefs cimport hash_t, attr_t
 from ...strings cimport hash_string
 from ...structs cimport TokenC
 from ...tokens.doc cimport Doc, set_children_from_heads
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

View File

@@ -5,7 +5,7 @@ from cymem.cymem cimport Pool
 from ...typedefs cimport weight_t, attr_t
 from ...lexeme cimport Lexeme
 from ...attrs cimport IS_SPACE
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from ...errors import Errors
 from .stateclass cimport StateClass
 from ._state cimport StateC

View File

@@ -3,7 +3,7 @@ from cymem.cymem cimport Pool
 from ...typedefs cimport attr_t, weight_t
 from ...structs cimport TokenC
 from ...strings cimport StringStore
-from ...gold.example cimport Example
+from ...training.example cimport Example
 from .stateclass cimport StateClass
 from ._state cimport StateC

View File

@@ -4,7 +4,7 @@ from pathlib import Path
 from .pipe import Pipe
 from ..errors import Errors
-from ..gold import validate_examples
+from ..training import validate_examples
 from ..language import Language
 from ..matcher import Matcher
 from ..scorer import Scorer

View File

@@ -9,7 +9,7 @@ from .functions import merge_subtokens
 from ..language import Language
 from ._parser_internals import nonproj
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 default_model_config = """

View File

@@ -1,3 +1,4 @@
+from itertools import islice
 from typing import Optional, Iterable, Callable, Dict, Iterator, Union, List, Tuple
 from pathlib import Path
 import srsly
@@ -11,7 +12,7 @@ from ..tokens import Doc
 from .pipe import Pipe, deserialize_config
 from ..language import Language
 from ..vocab import Vocab
-from ..gold import Example, validate_examples
+from ..training import Example, validate_examples
 from ..errors import Errors, Warnings
 from ..util import SimpleFrozenList
 from .. import util
@@ -128,7 +129,7 @@ class EntityLinker(Pipe):
         # how many neightbour sentences to take into account
         self.n_sents = cfg.get("n_sents", 0)

-    def require_kb(self) -> None:
+    def _require_kb(self) -> None:
         # Raise an error if the knowledge base is not initialized.
         if len(self.kb) == 0:
             raise ValueError(Errors.E139.format(name=self.name))
@@ -140,10 +141,11 @@ class EntityLinker(Pipe):
         pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
         sgd: Optional[Optimizer] = None,
     ) -> Optimizer:
-        """Initialize the pipe for training, using data examples if available.
+        """Initialize the pipe for training, using a representative set
+        of data examples.
-        get_examples (Callable[[], Iterable[Example]]): Optional function that
-            returns gold-standard Example objects.
+        get_examples (Callable[[], Iterable[Example]]): Function that
+            returns a representative sample of gold-standard Example objects.
         pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
             components that this component is part of. Corresponds to
             nlp.pipeline.
@@ -153,10 +155,19 @@ class EntityLinker(Pipe):
         DOCS: https://nightly.spacy.io/api/entitylinker#begin_training
         """
-        self.require_kb()
+        self._ensure_examples(get_examples)
+        self._require_kb()
         nO = self.kb.entity_vector_length
-        self.set_output(nO)
-        self.model.initialize()
+        doc_sample = []
+        vector_sample = []
+        for example in islice(get_examples(), 10):
+            doc_sample.append(example.x)
+            vector_sample.append(self.model.ops.alloc1f(nO))
+        assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
+        assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
+        self.model.initialize(
+            X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
+        )
         if sgd is None:
             sgd = self.create_optimizer()
         return sgd
@@ -184,7 +195,7 @@
         DOCS: https://nightly.spacy.io/api/entitylinker#update
         """
-        self.require_kb()
+        self._require_kb()
         if losses is None:
             losses = {}
         losses.setdefault(self.name, 0.0)
@@ -296,7 +307,7 @@
         DOCS: https://nightly.spacy.io/api/entitylinker#predict
         """
-        self.require_kb()
+        self._require_kb()
         entity_count = 0
         final_kb_ids = []
         if not docs:
@@ -405,7 +416,7 @@
         token.ent_kb_id_ = kb_id

     def to_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> None:
         """Serialize the pipe to disk.
@@ -422,7 +433,7 @@
         util.to_disk(path, serialize, exclude)

     def from_disk(
-        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList(),
+        self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
     ) -> "EntityLinker":
         """Load the pipe from disk. Modifies the object in place and returns it.

View File

@@ -9,7 +9,7 @@ from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList
 from ..tokens import Doc, Span
 from ..matcher import Matcher, PhraseMatcher
 from ..scorer import Scorer
-from ..gold import validate_examples
+from ..training import validate_examples
 DEFAULT_ENT_ID_SEP = "||"

View File

@@ -8,7 +8,7 @@ from ..lookups import Lookups, load_lookups
 from ..scorer import Scorer
 from ..tokens import Doc, Token
 from ..vocab import Vocab
-from ..gold import validate_examples
+from ..training import validate_examples
 from .. import util

View File

@ -2,6 +2,7 @@
from typing import Optional from typing import Optional
import srsly import srsly
from thinc.api import SequenceCategoricalCrossentropy, Model, Config from thinc.api import SequenceCategoricalCrossentropy, Model, Config
from itertools import islice
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..vocab cimport Vocab from ..vocab cimport Vocab
@ -15,7 +16,7 @@ from .pipe import deserialize_config
from .tagger import Tagger from .tagger import Tagger
from .. import util from .. import util
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
default_model_config = """ default_model_config = """
@ -112,6 +113,7 @@ class Morphologizer(Tagger):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
self._allow_extra_label()
# normalize label # normalize label
norm_label = self.vocab.morphology.normalize_features(label) norm_label = self.vocab.morphology.normalize_features(label)
# extract separate POS and morph tags # extract separate POS and morph tags
@ -128,10 +130,11 @@ class Morphologizer(Tagger):
return 1 return 1
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -141,9 +144,8 @@ class Morphologizer(Tagger):
DOCS: https://nightly.spacy.io/api/morphologizer#begin_training DOCS: https://nightly.spacy.io/api/morphologizer#begin_training
""" """
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="Morphologizer", obj=type(get_examples)) # First, fetch all labels from the data
raise ValueError(err)
for example in get_examples(): for example in get_examples():
for i, token in enumerate(example.reference): for i, token in enumerate(example.reference):
pos = token.pos_ pos = token.pos_
@ -157,8 +159,25 @@ class Morphologizer(Tagger):
if norm_label not in self.cfg["labels_morph"]: if norm_label not in self.cfg["labels_morph"]:
self.cfg["labels_morph"][norm_label] = morph self.cfg["labels_morph"][norm_label] = morph
self.cfg["labels_pos"][norm_label] = POS_IDS[pos] self.cfg["labels_pos"][norm_label] = POS_IDS[pos]
self.set_output(len(self.labels)) if len(self.labels) <= 1:
self.model.initialize() raise ValueError(Errors.E143.format(name=self.name))
doc_sample = []
label_sample = []
for example in islice(get_examples(), 10):
gold_array = []
for i, token in enumerate(example.reference):
pos = token.pos_
morph = token.morph_
morph_dict = Morphology.feats_to_dict(morph)
if pos:
morph_dict[self.POS_FEAT] = pos
norm_label = self.vocab.strings[self.vocab.morphology.add(morph_dict)]
gold_array.append([1.0 if label == norm_label else 0.0 for label in self.labels])
doc_sample.append(example.x)
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
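With this change the morphologizer collects its labels from the training data itself, so a typical caller no longer needs `add_label` before training. A short usage sketch matching the updated morphologizer tests further down in this diff; `TRAIN_DATA` stands for the morph/POS examples defined there:
from spacy.lang.en import English
from spacy.training import Example
nlp = English()
nlp.add_pipe("morphologizer")
# TRAIN_DATA as defined in the morphologizer tests later in this diff
train_examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA]
# labels are read off the examples; the model is initialized with real docs and one-hot targets
optimizer = nlp.begin_training(get_examples=lambda: train_examples)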

View File

@ -8,7 +8,7 @@ from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from .tagger import Tagger from .tagger import Tagger
from ..gold import validate_examples from ..training import validate_examples
from ..language import Language from ..language import Language
from ._parser_internals import nonproj from ._parser_internals import nonproj
from ..attrs import POS, ID from ..attrs import POS, ID
@ -90,7 +90,7 @@ class MultitaskObjective(Tagger):
label = self.make_label(token) label = self.make_label(token)
if label is not None and label not in self.labels: if label is not None and label not in self.labels:
self.labels[label] = len(self.labels) self.labels[label] = len(self.labels)
self.model.initialize() self.model.initialize() # TODO: fix initialization by defining X and Y
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
@ -178,7 +178,7 @@ class ClozeMultitask(Pipe):
pass pass
def begin_training(self, get_examples, pipeline=None, sgd=None): def begin_training(self, get_examples, pipeline=None, sgd=None):
self.model.initialize() self.model.initialize() # TODO: fix initialization by defining X and Y
X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO"))) X = self.model.ops.alloc((5, self.model.get_ref("tok2vec").get_dim("nO")))
self.model.output_layer.begin_training(X) self.model.output_layer.begin_training(X)
if sgd is None: if sgd is None:

View File

@ -7,7 +7,7 @@ from ._parser_internals.ner cimport BiluoPushDown
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
default_model_config = """ default_model_config = """

View File

@ -4,7 +4,7 @@ from thinc.api import set_dropout_rate, Model
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold import validate_examples from ..training import validate_examples
from ..errors import Errors from ..errors import Errors
from .. import util from .. import util
@ -160,6 +160,20 @@ cdef class Pipe:
""" """
raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name)) raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
def _require_labels(self) -> None:
"""Raise an error if the component's model has no labels defined."""
if not self.labels or list(self.labels) == [""]:
raise ValueError(Errors.E143.format(name=self.name))
def _allow_extra_label(self) -> None:
"""Raise an error if the component can not add any more labels."""
if self.model.has_dim("nO") and self.model.get_dim("nO") == len(self.labels):
if not self.is_resizable():
raise ValueError(Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")))
def create_optimizer(self): def create_optimizer(self):
"""Create an optimizer for the pipeline component. """Create an optimizer for the pipeline component.
@ -171,9 +185,12 @@ cdef class Pipe:
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using data examples if available.
This method needs to be implemented by each Pipe component,
ensuring the internal model (if available) is initialized properly
using the provided sample of Example objects.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -183,16 +200,24 @@ cdef class Pipe:
DOCS: https://nightly.spacy.io/api/pipe#begin_training DOCS: https://nightly.spacy.io/api/pipe#begin_training
""" """
self.model.initialize() raise NotImplementedError(Errors.E931.format(method="add_label", name=self.name))
if sgd is None:
sgd = self.create_optimizer() def _ensure_examples(self, get_examples):
return sgd if get_examples is None or not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name=self.name, obj=type(get_examples))
raise ValueError(err)
if not get_examples():
err = Errors.E930.format(name=self.name, obj=get_examples())
raise ValueError(err)
def is_resizable(self):
return hasattr(self, "model") and "resize_output" in self.model.attrs
def set_output(self, nO): def set_output(self, nO):
if self.model.has_dim("nO") is not False: if self.is_resizable():
self.model.set_dim("nO", nO) self.model.attrs["resize_output"](self.model, nO)
if self.model.has_ref("output_layer"): else:
self.model.get_ref("output_layer").set_dim("nO", nO) raise NotImplementedError(Errors.E921)
def use_params(self, params): def use_params(self, params):
"""Modify the pipe's model, to use the given parameter values. At the """Modify the pipe's model, to use the given parameter values. At the

View File

@ -7,7 +7,7 @@ from ..tokens.doc cimport Doc
from .pipe import Pipe from .pipe import Pipe
from ..language import Language from ..language import Language
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
from .. import util from .. import util

View File

@ -1,4 +1,6 @@
# cython: infer_types=True, profile=True, binding=True # cython: infer_types=True, profile=True, binding=True
from itertools import islice
import srsly import srsly
from thinc.api import Model, SequenceCategoricalCrossentropy, Config from thinc.api import Model, SequenceCategoricalCrossentropy, Config
@ -9,7 +11,7 @@ from .tagger import Tagger
from ..language import Language from ..language import Language
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
from .. import util from .. import util
@ -124,10 +126,11 @@ class SentenceRecognizer(Tagger):
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -137,8 +140,18 @@ class SentenceRecognizer(Tagger):
DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training DOCS: https://nightly.spacy.io/api/sentencerecognizer#begin_training
""" """
self.set_output(len(self.labels)) self._ensure_examples(get_examples)
self.model.initialize() doc_sample = []
label_sample = []
assert self.labels, Errors.E924.format(name=self.name)
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
gold_tags = example.get_aligned("SENT_START")
gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample, Y=label_sample)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
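The senter now builds one-hot target rows from the aligned SENT_START values of up to ten examples. The list comprehension in the hunk above amounts to the following small helper; the name `one_hot_rows` is hypothetical and shown only to make the construction explicit:
def one_hot_rows(labels, gold_tags):
    # one row per token: 1.0 in the column of the gold tag, 0.0 elsewhere
    return [[1.0 if tag == gold_tag else 0.0 for tag in labels] for gold_tag in gold_tags]
# gold_tags = example.get_aligned("SENT_START")
# label_sample.append(model.ops.asarray(one_hot_rows(senter.labels, gold_tags), dtype="float32"))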

View File

@ -1,211 +0,0 @@
from typing import List, Iterable, Optional, Dict, Tuple, Callable, Set
from thinc.types import Floats2d
from thinc.api import SequenceCategoricalCrossentropy, set_dropout_rate, Model
from thinc.api import Optimizer, Config
from thinc.util import to_numpy
from ..errors import Errors
from ..gold import Example, spans_from_biluo_tags, iob_to_biluo, biluo_to_iob
from ..gold import validate_examples
from ..tokens import Doc
from ..language import Language
from ..vocab import Vocab
from ..scorer import Scorer
from .pipe import Pipe
default_model_config = """
[model]
@architectures = "spacy.BILUOTagger.v1"
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 128
depth = 4
embed_size = 7000
window_size = 1
maxout_pieces = 3
subword_features = true
"""
DEFAULT_SIMPLE_NER_MODEL = Config().from_str(default_model_config)["model"]
@Language.factory(
"simple_ner",
assigns=["doc.ents"],
default_config={"labels": [], "model": DEFAULT_SIMPLE_NER_MODEL},
scores=["ents_p", "ents_r", "ents_f", "ents_per_type"],
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0},
)
def make_simple_ner(
nlp: Language, name: str, model: Model, labels: Iterable[str]
) -> "SimpleNER":
return SimpleNER(nlp.vocab, model, name, labels=labels)
class SimpleNER(Pipe):
"""Named entity recognition with a tagging model. The model should include
validity constraints to ensure that only valid tag sequences are returned."""
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "simple_ner",
*,
labels: Iterable[str],
) -> None:
self.vocab = vocab
self.model = model
self.name = name
self.cfg = {"labels": []}
for label in labels:
self.add_label(label)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(), normalize=True, missing_value=None
)
assert self.model is not None
@property
def is_biluo(self) -> bool:
return self.model.name.startswith("biluo")
@property
def labels(self) -> Tuple[str]:
return tuple(self.cfg["labels"])
def add_label(self, label: str) -> None:
"""Add a new label to the pipe.
label (str): The label to add.
DOCS: https://nightly.spacy.io/api/simplener#add_label
"""
if not isinstance(label, str):
raise ValueError(Errors.E187)
if label not in self.labels:
self.cfg["labels"].append(label)
self.vocab.strings.add(label)
def get_tag_names(self) -> List[str]:
if self.is_biluo:
return (
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ [f"L-{label}" for label in self.labels]
+ [f"U-{label}" for label in self.labels]
+ ["O"]
)
else:
return (
[f"B-{label}" for label in self.labels]
+ [f"I-{label}" for label in self.labels]
+ ["O"]
)
def predict(self, docs: List[Doc]) -> List[Floats2d]:
scores = self.model.predict(docs)
return scores
def set_annotations(self, docs: List[Doc], scores: List[Floats2d]) -> None:
"""Set entities on a batch of documents from a batch of scores."""
tag_names = self.get_tag_names()
for i, doc in enumerate(docs):
actions = to_numpy(scores[i].argmax(axis=1))
tags = [tag_names[actions[j]] for j in range(len(doc))]
if not self.is_biluo:
tags = iob_to_biluo(tags)
doc.ents = spans_from_biluo_tags(doc, tags)
def update(
self,
examples: List[Example],
*,
set_annotations: bool = False,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
if losses is None:
losses = {}
losses.setdefault("ner", 0.0)
validate_examples(examples, "SimpleNER.update")
if not any(_has_ner(eg) for eg in examples):
return losses
docs = [eg.predicted for eg in examples]
set_dropout_rate(self.model, drop)
scores, bp_scores = self.model.begin_update(docs)
loss, d_scores = self.get_loss(examples, scores)
bp_scores(d_scores)
if set_annotations:
self.set_annotations(docs, scores)
if sgd is not None:
self.model.finish_update(sgd)
losses["ner"] += loss
return losses
def get_loss(self, examples: List[Example], scores) -> Tuple[List[Floats2d], float]:
validate_examples(examples, "SimpleNER.get_loss")
truths = []
for eg in examples:
tags = eg.get_aligned_ner()
gold_tags = [(tag if tag != "-" else None) for tag in tags]
if not self.is_biluo:
gold_tags = biluo_to_iob(gold_tags)
truths.append(gold_tags)
for i in range(len(scores)):
if len(scores[i]) != len(truths[i]):
raise ValueError(
f"Mismatched output and gold sizes.\n"
f"Output: {len(scores[i])}, gold: {len(truths[i])}."
f"Input: {len(examples[i].doc)}"
)
d_scores, loss = self.loss_func(scores, truths)
return loss, d_scores
def begin_training(
self,
get_examples: Callable[[], Iterable[Example]],
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None,
):
all_labels = set()
if not hasattr(get_examples, "__call__"):
err = Errors.E930.format(name="SimpleNER", obj=type(get_examples))
raise ValueError(err)
for example in get_examples():
all_labels.update(_get_labels(example))
for label in sorted(all_labels):
self.add_label(label)
labels = self.labels
n_actions = self.model.attrs["get_num_actions"](len(labels))
self.model.set_dim("nO", n_actions)
self.model.initialize()
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
self.loss_func = SequenceCategoricalCrossentropy(
names=self.get_tag_names(), normalize=True, missing_value=None
)
return sgd
def init_multitask_objectives(self, *args, **kwargs):
pass
def score(self, examples, **kwargs):
validate_examples(examples, "SimpleNER.score")
return Scorer.score_spans(examples, "ents", **kwargs)
def _has_ner(example: Example) -> bool:
for ner_tag in example.get_aligned_ner():
if ner_tag != "-" and ner_tag is not None:
return True
else:
return False
def _get_labels(example: Example) -> Set[str]:
labels = set()
for ner_tag in example.get_aligned("ENT_TYPE", as_string=True):
if ner_tag != "O" and ner_tag != "-":
labels.add(ner_tag)
return labels

View File

@ -5,6 +5,7 @@ import srsly
from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config from thinc.api import Model, set_dropout_rate, SequenceCategoricalCrossentropy, Config
from thinc.types import Floats2d from thinc.types import Floats2d
import warnings import warnings
from itertools import islice
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..morphology cimport Morphology from ..morphology cimport Morphology
@ -16,7 +17,7 @@ from ..attrs import POS, ID
from ..parts_of_speech import X from ..parts_of_speech import X
from ..errors import Errors, TempErrors, Warnings from ..errors import Errors, TempErrors, Warnings
from ..scorer import Scorer from ..scorer import Scorer
from ..gold import validate_examples from ..training import validate_examples
from .. import util from .. import util
@ -258,10 +259,11 @@ class Tagger(Pipe):
return float(loss), d_scores return float(loss), d_scores
def begin_training(self, get_examples, *, pipeline=None, sgd=None): def begin_training(self, get_examples, *, pipeline=None, sgd=None):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -271,32 +273,24 @@ class Tagger(Pipe):
DOCS: https://nightly.spacy.io/api/tagger#begin_training DOCS: https://nightly.spacy.io/api/tagger#begin_training
""" """
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="Tagger", obj=type(get_examples))
raise ValueError(err)
tags = set()
doc_sample = [] doc_sample = []
label_sample = []
tags = set()
for example in get_examples(): for example in get_examples():
for token in example.y: for token in example.y:
if token.tag_:
tags.add(token.tag_) tags.add(token.tag_)
if len(doc_sample) < 10:
doc_sample.append(example.x)
if not doc_sample:
doc_sample.append(Doc(self.vocab, words=["hello"]))
for tag in sorted(tags): for tag in sorted(tags):
self.add_label(tag) self.add_label(tag)
if len(self.labels) == 0: for example in islice(get_examples(), 10):
err = Errors.E1006.format(name="Tagger") doc_sample.append(example.x)
raise ValueError(err) gold_tags = example.get_aligned("TAG", as_string=True)
self.set_output(len(self.labels)) gold_array = [[1.0 if tag == gold_tag else 0.0 for tag in self.labels] for gold_tag in gold_tags]
if doc_sample: label_sample.append(self.model.ops.asarray(gold_array, dtype="float32"))
label_sample = [ assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.ops.alloc2f(len(doc), len(self.labels)) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
for doc in doc_sample
]
self.model.initialize(X=doc_sample, Y=label_sample) self.model.initialize(X=doc_sample, Y=label_sample)
else:
self.model.initialize()
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
@ -313,6 +307,7 @@ class Tagger(Pipe):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
self._allow_extra_label()
self.cfg["labels"].append(label) self.cfg["labels"].append(label)
self.vocab.strings.add(label) self.vocab.strings.add(label)
return 1 return 1
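Taken together, the tagger changes fix the label set at initialization time: labels are gathered from the data (or added beforehand), the output dimension is set once, and later `add_label` calls hit `_allow_extra_label()`. A usage sketch matching the new tagger tests below:
from spacy.language import Language
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")
tagger.add_label("V")
nlp.begin_training()
assert tagger.model.get_dim("nO") == 2
# adding a label after initialization now raises, because the tagger model is not resizable:
# tagger.add_label("J")  # ValueError via _allow_extra_label()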

View File

@ -1,3 +1,4 @@
from itertools import islice
from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any from typing import Iterable, Tuple, Optional, Dict, List, Callable, Iterator, Any
from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config from thinc.api import get_array_module, Model, Optimizer, set_dropout_rate, Config
from thinc.types import Floats2d from thinc.types import Floats2d
@ -5,7 +6,7 @@ import numpy
from .pipe import Pipe from .pipe import Pipe
from ..language import Language from ..language import Language
from ..gold import Example, validate_examples from ..training import Example, validate_examples
from ..errors import Errors from ..errors import Errors
from ..scorer import Scorer from ..scorer import Scorer
from .. import util from .. import util
@ -128,11 +129,6 @@ class TextCategorizer(Pipe):
""" """
return tuple(self.cfg.setdefault("labels", [])) return tuple(self.cfg.setdefault("labels", []))
def require_labels(self) -> None:
"""Raise an error if the component's model has no labels defined."""
if not self.labels:
raise ValueError(Errors.E143.format(name=self.name))
@labels.setter @labels.setter
def labels(self, value: Iterable[str]) -> None: def labels(self, value: Iterable[str]) -> None:
self.cfg["labels"] = tuple(value) self.cfg["labels"] = tuple(value)
@ -311,17 +307,7 @@ class TextCategorizer(Pipe):
raise ValueError(Errors.E187) raise ValueError(Errors.E187)
if label in self.labels: if label in self.labels:
return 0 return 0
if self.model.has_dim("nO"): self._allow_extra_label()
# This functionality was available previously, but was broken.
# The problem is that we resize the last layer, but the last layer
# is actually just an ensemble. We're not resizing the child layers
# - a huge problem.
raise ValueError(Errors.E116)
# smaller = self.model._layers[-1]
# larger = Linear(len(self.labels)+1, smaller.nI)
# copy_array(larger.W[:smaller.nO], smaller.W)
# copy_array(larger.b[:smaller.nO], smaller.b)
# self.model._layers[-1] = larger
self.labels = tuple(list(self.labels) + [label]) self.labels = tuple(list(self.labels) + [label])
return 1 return 1
@ -332,10 +318,11 @@ class TextCategorizer(Pipe):
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
) -> Optimizer: ) -> Optimizer:
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -345,22 +332,19 @@ class TextCategorizer(Pipe):
DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training DOCS: https://nightly.spacy.io/api/textcategorizer#begin_training
""" """
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="TextCategorizer", obj=type(get_examples))
raise ValueError(err)
subbatch = [] # Select a subbatch of examples to initialize the model subbatch = [] # Select a subbatch of examples to initialize the model
for example in get_examples(): for example in islice(get_examples(), 10):
if len(subbatch) < 2: if len(subbatch) < 2:
subbatch.append(example) subbatch.append(example)
for cat in example.y.cats: for cat in example.y.cats:
self.add_label(cat) self.add_label(cat)
self.require_labels() doc_sample = [eg.reference for eg in subbatch]
docs = [eg.reference for eg in subbatch] label_sample, _ = self._examples_to_truth(subbatch)
if not docs: # need at least one doc self._require_labels()
docs = [Doc(self.vocab, words=["hello"])] assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
truths, _ = self._examples_to_truth(subbatch) assert len(label_sample) > 0, Errors.E923.format(name=self.name)
self.set_output(len(self.labels)) self.model.initialize(X=doc_sample, Y=label_sample)
self.model.initialize(X=docs, Y=truths)
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
return sgd return sgd
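The textcat now derives its labels from `example.y.cats` in the first ten examples and initializes on real docs plus the truth matrix from `_examples_to_truth`, instead of a placeholder "hello" doc. Caller-side, that matches the updated overfitting test further down; `nlp` and `TRAIN_DATA` are assumed from that test:
textcat = nlp.add_pipe("textcat")
train_examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in TRAIN_DATA]
optimizer = nlp.begin_training(get_examples=lambda: train_examples)
assert textcat.model.get_dim("nO") == 2   # one output per category present in TRAIN_DATA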

View File

@ -1,8 +1,9 @@
from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple from typing import Iterator, Sequence, Iterable, Optional, Dict, Callable, List, Tuple
from thinc.api import Model, set_dropout_rate, Optimizer, Config from thinc.api import Model, set_dropout_rate, Optimizer, Config
from itertools import islice
from .pipe import Pipe from .pipe import Pipe
from ..gold import Example, validate_examples from ..training import Example, validate_examples
from ..tokens import Doc from ..tokens import Doc
from ..vocab import Vocab from ..vocab import Vocab
from ..language import Language from ..language import Language
@ -209,10 +210,11 @@ class Tok2Vec(Pipe):
pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None, pipeline: Optional[List[Tuple[str, Callable[[Doc], Doc]]]] = None,
sgd: Optional[Optimizer] = None, sgd: Optional[Optimizer] = None,
): ):
"""Initialize the pipe for training, using data examples if available. """Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Optional function that get_examples (Callable[[], Iterable[Example]]): Function that
returns gold-standard Example objects. returns a representative sample of gold-standard Example objects.
pipeline (List[Tuple[str, Callable]]): Optional list of pipeline pipeline (List[Tuple[str, Callable]]): Optional list of pipeline
components that this component is part of. Corresponds to components that this component is part of. Corresponds to
nlp.pipeline. nlp.pipeline.
@ -222,8 +224,12 @@ class Tok2Vec(Pipe):
DOCS: https://nightly.spacy.io/api/tok2vec#begin_training DOCS: https://nightly.spacy.io/api/tok2vec#begin_training
""" """
docs = [Doc(self.vocab, words=["hello"])] self._ensure_examples(get_examples)
self.model.initialize(X=docs) doc_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
assert doc_sample, Errors.E923.format(name=self.name)
self.model.initialize(X=doc_sample)
def add_label(self, label): def add_label(self, label):
raise NotImplementedError raise NotImplementedError
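Tok2Vec follows the same pattern: instead of initializing on a synthetic single-word doc, it now takes up to ten real docs from `get_examples` and asserts that at least one was provided, so an empty example source fails fast. A sketch of the new call, assuming `tok2vec` and `get_examples` are already bound:
from itertools import islice
doc_sample = [eg.x for eg in islice(get_examples(), 10)]
assert doc_sample   # Errors.E923: no sample docs to initialize from
tok2vec.model.initialize(X=doc_sample)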

View File

@ -21,7 +21,7 @@ from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes from ..ml.parser_model cimport get_c_weights, get_c_sizes
from ..tokens.doc cimport Doc from ..tokens.doc cimport Doc
from ..gold import validate_examples from ..training import validate_examples
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
from .. import util from .. import util
@ -244,7 +244,7 @@ cdef class Parser(Pipe):
int nr_class, int batch_size) nogil: int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil: with gil:
assert self.moves.n_moves > 0 assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess cdef int i, guess
cdef Transition action cdef Transition action
@ -378,7 +378,7 @@ cdef class Parser(Pipe):
cdef int i cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc # n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0 assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int)) is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float)) costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
@ -406,9 +406,7 @@ cdef class Parser(Pipe):
self.model.attrs["resize_output"](self.model, nO) self.model.attrs["resize_output"](self.model, nO)
def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs): def begin_training(self, get_examples, pipeline=None, sgd=None, **kwargs):
if not hasattr(get_examples, "__call__"): self._ensure_examples(get_examples)
err = Errors.E930.format(name="DependencyParser/EntityRecognizer", obj=type(get_examples))
raise ValueError(err)
self.cfg.update(kwargs) self.cfg.update(kwargs)
lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {}) lexeme_norms = self.vocab.lookups.get_table("lexeme_norm", {})
if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS: if len(lexeme_norms) == 0 and self.vocab.lang in util.LEXEME_NORM_LANGS:
@ -430,9 +428,6 @@ cdef class Parser(Pipe):
if sgd is None: if sgd is None:
sgd = self.create_optimizer() sgd = self.create_optimizer()
doc_sample = [] doc_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.predicted)
if pipeline is not None: if pipeline is not None:
for name, component in pipeline: for name, component in pipeline:
if component is self: if component is self:
@ -441,10 +436,11 @@ cdef class Parser(Pipe):
doc_sample = list(component.pipe(doc_sample, batch_size=8)) doc_sample = list(component.pipe(doc_sample, batch_size=8))
else: else:
doc_sample = [component(doc) for doc in doc_sample] doc_sample = [component(doc) for doc in doc_sample]
if doc_sample: if not doc_sample:
for example in islice(get_examples(), 10):
doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample) self.model.initialize(doc_sample)
else:
self.model.initialize()
if pipeline is not None: if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
return sgd return sgd
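For the transition parser, the hunk above only falls back to pulling sample docs from the examples when the pipeline pass left `doc_sample` empty, and an assertion replaces the old bare `self.model.initialize()` branch. The reconstructed new control flow, paraphrased as a sketch:
if not doc_sample:
    doc_sample = [eg.predicted for eg in islice(get_examples(), 10)]
assert len(doc_sample) > 0   # Errors.E923: nothing to initialize the parser model with
self.model.initialize(doc_sample)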

View File

@ -12,7 +12,7 @@ from .attrs import NAMES
if TYPE_CHECKING: if TYPE_CHECKING:
# This lets us add type hints for mypy etc. without causing circular imports # This lets us add type hints for mypy etc. without causing circular imports
from .language import Language # noqa: F401 from .language import Language # noqa: F401
from .gold import Example # noqa: F401 from .training import Example # noqa: F401
ItemT = TypeVar("ItemT") ItemT = TypeVar("ItemT")
@ -180,7 +180,7 @@ class ModelMetaSchema(BaseModel):
url: StrictStr = Field("", title="Model author URL") url: StrictStr = Field("", title="Model author URL")
sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources") sources: Optional[Union[List[StrictStr], List[Dict[str, str]]]] = Field(None, title="Training data sources")
vectors: Dict[str, Any] = Field({}, title="Included word vectors") vectors: Dict[str, Any] = Field({}, title="Included word vectors")
labels: Dict[str, Dict[str, List[str]]] = Field({}, title="Component labels, keyed by component name") labels: Dict[str, List[str]] = Field({}, title="Component labels, keyed by component name")
accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers") accuracy: Dict[str, Union[float, Dict[str, float]]] = Field({}, title="Accuracy numbers")
speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers") speed: Dict[str, Union[float, int]] = Field({}, title="Speed evaluation numbers")
spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used") spacy_git_version: StrictStr = Field("", title="Commit of spaCy version used")
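The meta schema change above flattens `labels` from a nested mapping to one flat list of label strings per component. A sketch of a value that validates against the new field, written as a Python dict; the component names and labels here are hypothetical:
# component name -> flat list of label strings (values are illustrative)
labels = {
    "tagger": ["N", "V"],
    "ner": ["PERSON", "LOC"],
    "textcat": ["POSITIVE", "NEGATIVE"],
}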

View File

@ -1,7 +1,7 @@
from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING from typing import Optional, Iterable, Dict, Any, Callable, TYPE_CHECKING
import numpy as np import numpy as np
from .gold import Example from .training import Example
from .tokens import Token, Doc, Span from .tokens import Token, Doc, Span
from .errors import Errors from .errors import Errors
from .util import get_lang_class, SimpleFrozenList from .util import get_lang_class, SimpleFrozenList

View File

@ -1,5 +1,6 @@
from spacy.training import Example
from spacy.pipeline import EntityRecognizer from spacy.pipeline import EntityRecognizer
from spacy.tokens import Span from spacy.tokens import Span, Doc
from spacy import registry from spacy import registry
import pytest import pytest
@ -7,6 +8,12 @@ from ..util import get_doc
from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline.ner import DEFAULT_NER_MODEL
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)
def test_doc_add_entities_set_ents_iob(en_vocab): def test_doc_add_entities_set_ents_iob(en_vocab):
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
@ -18,10 +25,8 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL} cfg = {"model": DEFAULT_NER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"] model = registry.make_from_config(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config) ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: []) ner.begin_training(lambda: [_ner_example(ner)])
ner(doc) ner(doc)
assert len(list(doc.ents)) == 0
assert [w.ent_iob_ for w in doc] == (["O"] * len(doc))
doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)] doc.ents = [(doc.vocab.strings["ANIMAL"], 3, 4)]
assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"] assert [w.ent_iob_ for w in doc] == ["O", "O", "O", "B"]
@ -31,6 +36,7 @@ def test_doc_add_entities_set_ents_iob(en_vocab):
def test_ents_reset(en_vocab): def test_ents_reset(en_vocab):
"""Ensure that resetting doc.ents does not change anything"""
text = ["This", "is", "a", "lion"] text = ["This", "is", "a", "lion"]
doc = get_doc(en_vocab, text) doc = get_doc(en_vocab, text)
config = { config = {
@ -41,11 +47,11 @@ def test_ents_reset(en_vocab):
cfg = {"model": DEFAULT_NER_MODEL} cfg = {"model": DEFAULT_NER_MODEL}
model = registry.make_from_config(cfg, validate=True)["model"] model = registry.make_from_config(cfg, validate=True)["model"]
ner = EntityRecognizer(en_vocab, model, **config) ner = EntityRecognizer(en_vocab, model, **config)
ner.begin_training(lambda: []) ner.begin_training(lambda: [_ner_example(ner)])
ner(doc) ner(doc)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) orig_iobs = [t.ent_iob_ for t in doc]
doc.ents = list(doc.ents) doc.ents = list(doc.ents)
assert [t.ent_iob_ for t in doc] == (["O"] * len(doc)) assert [t.ent_iob_ for t in doc] == orig_iobs
def test_add_overlapping_entities(en_vocab): def test_add_overlapping_entities(en_vocab):

View File

@ -3,7 +3,7 @@ from thinc.api import Adam, fix_random_seed
from spacy import registry from spacy import registry
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.gold import Example from spacy.training import Example
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline import DependencyParser, EntityRecognizer from spacy.pipeline import DependencyParser, EntityRecognizer
from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline.ner import DEFAULT_NER_MODEL
@ -35,7 +35,7 @@ def test_init_parser(parser):
def _train_parser(parser): def _train_parser(parser):
fix_random_seed(1) fix_random_seed(1)
parser.add_label("left") parser.add_label("left")
parser.begin_training(lambda: [], **parser.cfg) parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(5): for i in range(5):
@ -47,16 +47,25 @@ def _train_parser(parser):
return parser return parser
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
def _ner_example(ner):
doc = Doc(ner.vocab, words=["Joe", "loves", "visiting", "London", "during", "the", "weekend"])
gold = {"entities": [(0, 3, "PERSON"), (19, 25, "LOC")]}
return Example.from_dict(doc, gold)
def test_add_label(parser): def test_add_label(parser):
parser = _train_parser(parser) parser = _train_parser(parser)
parser.add_label("right") parser.add_label("right")
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(100): for i in range(100):
losses = {} losses = {}
doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) parser.update([_parser_example(parser)], sgd=sgd, losses=losses)
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
example = Example.from_dict(doc, gold)
parser.update([example], sgd=sgd, losses=losses)
doc = Doc(parser.vocab, words=["a", "b", "c", "d"]) doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
doc = parser(doc) doc = parser(doc)
assert doc[0].dep_ == "right" assert doc[0].dep_ == "right"
@ -75,7 +84,7 @@ def test_add_label_deserializes_correctly():
ner1.add_label("C") ner1.add_label("C")
ner1.add_label("B") ner1.add_label("B")
ner1.add_label("A") ner1.add_label("A")
ner1.begin_training(lambda: []) ner1.begin_training(lambda: [_ner_example(ner1)])
ner2 = EntityRecognizer(Vocab(), model, **config) ner2 = EntityRecognizer(Vocab(), model, **config)
# the second model needs to be resized before we can call from_bytes # the second model needs to be resized before we can call from_bytes

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy import registry from spacy import registry
from spacy.gold import Example from spacy.training import Example
from spacy.pipeline import DependencyParser from spacy.pipeline import DependencyParser
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline._parser_internals.nonproj import projectivize from spacy.pipeline._parser_internals.nonproj import projectivize

View File

@ -4,7 +4,7 @@ from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.lookups import Lookups from spacy.lookups import Lookups
from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.gold import Example from spacy.training import Example
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab
import logging import logging

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy import registry from spacy import registry
from spacy.gold import Example from spacy.training import Example
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.pipeline._parser_internals.arc_eager import ArcEager from spacy.pipeline._parser_internals.arc_eager import ArcEager
from spacy.pipeline.transition_parser import Parser from spacy.pipeline.transition_parser import Parser

View File

@ -3,7 +3,7 @@ import pytest
from spacy.lang.en import English from spacy.lang.en import English
from ..util import get_doc, apply_transition_sequence, make_tempdir from ..util import get_doc, apply_transition_sequence, make_tempdir
from ... import util from ... import util
from ...gold import Example from ...training import Example
TRAIN_DATA = [ TRAIN_DATA = [
( (
@ -85,7 +85,7 @@ def test_parser_merge_pp(en_tokenizer):
pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"] pos = ["DET", "NOUN", "ADP", "DET", "NOUN", "VERB"]
tokens = en_tokenizer(text) tokens = en_tokenizer(text)
doc = get_doc( doc = get_doc(
tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos, tokens.vocab, words=[t.text for t in tokens], deps=deps, heads=heads, pos=pos
) )
with doc.retokenize() as retokenizer: with doc.retokenize() as retokenizer:
for np in doc.noun_chunks: for np in doc.noun_chunks:

View File

@ -3,7 +3,7 @@ from thinc.api import Adam
from spacy.attrs import NORM from spacy.attrs import NORM
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy import registry from spacy import registry
from spacy.gold import Example from spacy.training import Example
from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.pipeline import DependencyParser from spacy.pipeline import DependencyParser
@ -14,6 +14,12 @@ def vocab():
return Vocab(lex_attr_getters={NORM: lambda s: s}) return Vocab(lex_attr_getters={NORM: lambda s: s})
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
@pytest.fixture @pytest.fixture
def parser(vocab): def parser(vocab):
config = { config = {
@ -28,7 +34,7 @@ def parser(vocab):
parser.cfg["hidden_width"] = 32 parser.cfg["hidden_width"] = 32
# parser.add_label('right') # parser.add_label('right')
parser.add_label("left") parser.add_label("left")
parser.begin_training(lambda: [], **parser.cfg) parser.begin_training(lambda: [_parser_example(parser)], **parser.cfg)
sgd = Adam(0.001) sgd = Adam(0.001)
for i in range(10): for i in range(10):

View File

@ -1,6 +1,6 @@
import pytest import pytest
import numpy import numpy
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.pipeline import AttributeRuler from spacy.pipeline import AttributeRuler
from spacy import util, registry from spacy import util, registry

View File

@ -4,7 +4,7 @@ import pytest
from spacy.kb import KnowledgeBase, get_candidates, Candidate from spacy.kb import KnowledgeBase, get_candidates, Candidate
from spacy import util, registry from spacy import util, registry
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
from spacy.tokens import Span from spacy.tokens import Span
@ -281,11 +281,12 @@ def test_append_invalid_alias(nlp):
def test_preserving_links_asdoc(nlp): def test_preserving_links_asdoc(nlp):
"""Test that Span.as_doc preserves the existing entity links""" """Test that Span.as_doc preserves the existing entity links"""
vector_length = 1
@registry.misc.register("myLocationsKB.v1") @registry.misc.register("myLocationsKB.v1")
def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]: def dummy_kb() -> Callable[["Vocab"], KnowledgeBase]:
def create_kb(vocab): def create_kb(vocab):
mykb = KnowledgeBase(vocab, entity_vector_length=1) mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
# adding entities # adding entities
mykb.add_entity(entity="Q1", freq=19, entity_vector=[1]) mykb.add_entity(entity="Q1", freq=19, entity_vector=[1])
mykb.add_entity(entity="Q2", freq=8, entity_vector=[1]) mykb.add_entity(entity="Q2", freq=8, entity_vector=[1])
@ -305,10 +306,9 @@ def test_preserving_links_asdoc(nlp):
ruler = nlp.add_pipe("entity_ruler") ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns(patterns) ruler.add_patterns(patterns)
el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False} el_config = {"kb_loader": {"@misc": "myLocationsKB.v1"}, "incl_prior": False}
el_pipe = nlp.add_pipe("entity_linker", config=el_config, last=True) entity_linker = nlp.add_pipe("entity_linker", config=el_config, last=True)
el_pipe.begin_training(lambda: []) nlp.begin_training()
el_pipe.incl_context = False assert entity_linker.model.get_dim("nO") == vector_length
el_pipe.incl_prior = True
# test whether the entity links are preserved by the `as_doc()` function # test whether the entity links are preserved by the `as_doc()` function
text = "She lives in Boston. He lives in Denver." text = "She lives in Boston. He lives in Denver."
@ -373,6 +373,7 @@ def test_overfitting_IO():
# Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
nlp = English() nlp = English()
nlp.add_pipe("sentencizer") nlp.add_pipe("sentencizer")
vector_length = 3
# Add a custom component to recognize "Russ Cochran" as an entity for the example training data # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
patterns = [ patterns = [
@ -393,7 +394,7 @@ def test_overfitting_IO():
# create artificial KB - assign same prior weight to the two russ cochran's # create artificial KB - assign same prior weight to the two russ cochran's
# Q2146908 (Russ Cochran): American golfer # Q2146908 (Russ Cochran): American golfer
# Q7381115 (Russ Cochran): publisher # Q7381115 (Russ Cochran): publisher
mykb = KnowledgeBase(vocab, entity_vector_length=3) mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3]) mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7]) mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
mykb.add_alias( mykb.add_alias(
@ -406,14 +407,17 @@ def test_overfitting_IO():
return create_kb return create_kb
# Create the Entity Linker component and add it to the pipeline # Create the Entity Linker component and add it to the pipeline
nlp.add_pipe( entity_linker = nlp.add_pipe(
"entity_linker", "entity_linker",
config={"kb_loader": {"@misc": "myOverfittingKB.v1"}}, config={"kb_loader": {"@misc": "myOverfittingKB.v1"}},
last=True, last=True,
) )
# train the NEL pipe # train the NEL pipe
optimizer = nlp.begin_training() optimizer = nlp.begin_training(get_examples=lambda: train_examples)
assert entity_linker.model.get_dim("nO") == vector_length
assert entity_linker.model.get_dim("nO") == entity_linker.kb.entity_vector_length
for i in range(50): for i in range(50):
losses = {} losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses) nlp.update(train_examples, sgd=optimizer, losses=losses)

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
@ -25,27 +25,61 @@ TRAIN_DATA = [
}, },
), ),
# test combinations of morph+POS # test combinations of morph+POS
("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]},), ("Eat blue ham", {"morphs": ["Feat=V", "", ""], "pos": ["", "ADJ", ""]}),
] ]
def test_no_label():
nlp = Language()
nlp.add_pipe("morphologizer")
with pytest.raises(ValueError):
nlp.begin_training()
def test_implicit_label():
nlp = Language()
nlp.add_pipe("morphologizer")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
def test_no_resize():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "VERB")
nlp.begin_training()
# this throws an error because the morphologizer can't be resized after initialization
with pytest.raises(ValueError):
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "ADJ")
def test_begin_training_examples():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")
morphologizer.add_label("POS" + Morphology.FIELD_SEP + "NOUN")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly # Simple test to try and quickly overfit the morphologizer - ensuring the ML models work correctly
nlp = English() nlp = English()
morphologizer = nlp.add_pipe("morphologizer") nlp.add_pipe("morphologizer")
train_examples = [] train_examples = []
for inst in TRAIN_DATA: for inst in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1])) train_examples.append(Example.from_dict(nlp.make_doc(inst[0]), inst[1]))
for morph, pos in zip(inst[1]["morphs"], inst[1]["pos"]): optimizer = nlp.begin_training(get_examples=lambda: train_examples)
if morph and pos:
morphologizer.add_label(
morph + Morphology.FEATURE_SEP + "POS" + Morphology.FIELD_SEP + pos
)
elif pos:
morphologizer.add_label("POS" + Morphology.FIELD_SEP + pos)
elif morph:
morphologizer.add_label(morph)
optimizer = nlp.begin_training()
for i in range(50): for i in range(50):
losses = {} losses = {}
@ -55,18 +89,8 @@ def test_overfitting_IO():
# test the trained model # test the trained model
test_text = "I like blue ham" test_text = "I like blue ham"
doc = nlp(test_text) doc = nlp(test_text)
gold_morphs = [ gold_morphs = ["Feat=N", "Feat=V", "", ""]
"Feat=N", gold_pos_tags = ["NOUN", "VERB", "ADJ", ""]
"Feat=V",
"",
"",
]
gold_pos_tags = [
"NOUN",
"VERB",
"ADJ",
"",
]
assert [t.morph_ for t in doc] == gold_morphs assert [t.morph_ for t in doc] == gold_morphs
assert [t.pos_ for t in doc] == gold_pos_tags assert [t.pos_ for t in doc] == gold_pos_tags

View File

@ -1,7 +1,7 @@
import pytest import pytest
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
from spacy.tests.util import make_tempdir from spacy.tests.util import make_tempdir
@ -30,6 +30,20 @@ TRAIN_DATA = [
), ),
] ]
def test_begin_training_examples():
nlp = Language()
senter = nlp.add_pipe("senter")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the senter - ensuring the ML models work correctly # Simple test to try and quickly overfit the senter - ensuring the ML models work correctly

View File

@ -1,45 +0,0 @@
from spacy.lang.en import English
from spacy.gold import Example
from spacy import util
from ..util import make_tempdir
TRAIN_DATA = [
("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
def test_overfitting_IO():
# Simple test to try and quickly overfit the SimpleNER component - ensuring the ML models work correctly
nlp = English()
ner = nlp.add_pipe("simple_ner")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for ent in annotations.get("entities"):
ner.add_label(ent[2])
optimizer = nlp.begin_training()
for i in range(50):
losses = {}
nlp.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.0001
# test the trained model
test_text = "I like London."
doc = nlp(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "London"
assert ents[0].label_ == "LOC"
# Also test the results are still the same after IO
with make_tempdir() as tmp_dir:
nlp.to_disk(tmp_dir)
nlp2 = util.load_model_from_path(tmp_dir)
doc2 = nlp2(test_text)
ents2 = doc2.ents
assert len(ents2) == 1
assert ents2[0].text == "London"
assert ents2[0].label_ == "LOC"

View File

@ -1,6 +1,6 @@
import pytest import pytest
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.language import Language from spacy.language import Language
@ -34,6 +34,56 @@ TRAIN_DATA = [
] ]
def test_no_label():
nlp = Language()
nlp.add_pipe("tagger")
with pytest.raises(ValueError):
nlp.begin_training()
def test_no_resize():
nlp = Language()
tagger = nlp.add_pipe("tagger")
tagger.add_label("N")
tagger.add_label("V")
assert tagger.labels == ("N", "V")
nlp.begin_training()
assert tagger.model.get_dim("nO") == 2
# this throws an error because the tagger can't be resized after initialization
with pytest.raises(ValueError):
tagger.add_label("J")
def test_implicit_label():
nlp = Language()
nlp.add_pipe("tagger")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
def test_begin_training_examples():
nlp = Language()
tagger = nlp.add_pipe("tagger")
train_examples = []
for tag in TAGS:
tagger.add_label(tag)
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: train_examples[0])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=lambda: [])
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly # Simple test to try and quickly overfit the tagger - ensuring the ML models work correctly
nlp = English() nlp = English()
@ -41,9 +91,8 @@ def test_overfitting_IO():
train_examples = [] train_examples = []
for t in TRAIN_DATA: for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1])) train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
for tag in TAGS: optimizer = nlp.begin_training(get_examples=lambda: train_examples)
tagger.add_label(tag) assert tagger.model.get_dim("nO") == len(TAGS)
optimizer = nlp.begin_training()
for i in range(50): for i in range(50):
losses = {} losses = {}

View File

@ -10,7 +10,7 @@ from spacy.tokens import Doc
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from ..util import make_tempdir from ..util import make_tempdir
from ...gold import Example from ...training import Example
TRAIN_DATA = [ TRAIN_DATA = [
@ -80,6 +80,51 @@ def test_label_types():
textcat.add_label(9) textcat.add_label(9)
def test_no_label():
nlp = Language()
nlp.add_pipe("textcat")
with pytest.raises(ValueError):
nlp.begin_training()
def test_implicit_label():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
nlp.begin_training(get_examples=lambda: train_examples)
def test_no_resize():
nlp = Language()
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.begin_training()
assert textcat.model.get_dim("nO") == 2
# this throws an error because the textcat can't be resized after initialization
with pytest.raises(ValueError):
textcat.add_label("NEUTRAL")
def test_begin_training_examples():
nlp = Language()
textcat = nlp.add_pipe("textcat")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items():
textcat.add_label(label)
# you shouldn't really call this more than once, but for testing it should be fine
nlp.begin_training()
nlp.begin_training(get_examples=lambda: train_examples)
with pytest.raises(TypeError):
nlp.begin_training(get_examples=lambda: None)
with pytest.raises(ValueError):
nlp.begin_training(get_examples=train_examples)
def test_overfitting_IO(): def test_overfitting_IO():
# Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly # Simple test to try and quickly overfit the textcat component - ensuring the ML models work correctly
fix_random_seed(0) fix_random_seed(0)
@ -89,9 +134,8 @@ def test_overfitting_IO():
train_examples = [] train_examples = []
for text, annotations in TRAIN_DATA: for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
for label, value in annotations.get("cats").items(): optimizer = nlp.begin_training(get_examples=lambda: train_examples)
textcat.add_label(label) assert textcat.model.get_dim("nO") == 2
optimizer = nlp.begin_training()
for i in range(50): for i in range(50):
losses = {} losses = {}

View File

@ -1,7 +1,7 @@
import pytest import pytest
import random import random
from spacy import util from spacy import util
from spacy.gold import Example from spacy.training import Example
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.attrs import IS_PUNCT, ORTH, LOWER from spacy.attrs import IS_PUNCT, ORTH, LOWER
from spacy.vocab import Vocab from spacy.vocab import Vocab

View File

@ -3,7 +3,7 @@ import gc
import numpy import numpy
import copy import copy
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.lex_attrs import is_stop from spacy.lang.lex_attrs import is_stop

View File

@ -3,7 +3,7 @@ import numpy
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.displacy import render from spacy.displacy import render
from spacy.gold import iob_to_biluo from spacy.training import iob_to_biluo
from spacy.lang.it import Italian from spacy.lang.it import Italian
from spacy.lang.en import English from spacy.lang.en import English

View File

@ -1,6 +1,6 @@
import pytest import pytest
from spacy import displacy from spacy import displacy
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.lang.ja import Japanese from spacy.lang.ja import Japanese
from spacy.lang.xx import MultiLanguage from spacy.lang.xx import MultiLanguage
@ -20,7 +20,7 @@ def test_issue2564():
nlp = Language() nlp = Language()
tagger = nlp.add_pipe("tagger") tagger = nlp.add_pipe("tagger")
tagger.add_label("A") tagger.add_label("A")
tagger.begin_training(lambda: []) nlp.begin_training()
doc = nlp("hello world") doc = nlp("hello world")
assert doc.is_tagged assert doc.is_tagged
docs = nlp.pipe(["hello", "world"]) docs = nlp.pipe(["hello", "world"])

View File

@ -9,7 +9,7 @@ from spacy.tokens import Doc, Token
from spacy.matcher import Matcher, PhraseMatcher from spacy.matcher import Matcher, PhraseMatcher
from spacy.errors import MatchPatternError from spacy.errors import MatchPatternError
from spacy.util import minibatch from spacy.util import minibatch
from spacy.gold import Example from spacy.training import Example
from spacy.lang.hi import Hindi from spacy.lang.hi import Hindi
from spacy.lang.es import Spanish from spacy.lang.es import Spanish
from spacy.lang.en import English from spacy.lang.en import English
@ -251,6 +251,12 @@ def test_issue3803():
assert [t.like_num for t in doc] == [True, True, True, True, True, True] assert [t.like_num for t in doc] == [True, True, True, True, True, True]
def _parser_example(parser):
doc = Doc(parser.vocab, words=["a", "b", "c", "d"])
gold = {"heads": [1, 1, 3, 3], "deps": ["right", "ROOT", "left", "ROOT"]}
return Example.from_dict(doc, gold)
def test_issue3830_no_subtok(): def test_issue3830_no_subtok():
"""Test that the parser doesn't have subtok label if not learn_tokens""" """Test that the parser doesn't have subtok label if not learn_tokens"""
config = { config = {
@ -264,7 +270,7 @@ def test_issue3830_no_subtok():
parser = DependencyParser(Vocab(), model, **config) parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
parser.begin_training(lambda: []) parser.begin_training(lambda: [_parser_example(parser)])
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
@ -281,7 +287,7 @@ def test_issue3830_with_subtok():
parser = DependencyParser(Vocab(), model, **config) parser = DependencyParser(Vocab(), model, **config)
parser.add_label("nsubj") parser.add_label("nsubj")
assert "subtok" not in parser.labels assert "subtok" not in parser.labels
parser.begin_training(lambda: []) parser.begin_training(lambda: [_parser_example(parser)])
assert "subtok" in parser.labels assert "subtok" in parser.labels

View File

@ -2,8 +2,8 @@ import pytest
from spacy.pipeline import Pipe from spacy.pipeline import Pipe
from spacy.matcher import PhraseMatcher, Matcher from spacy.matcher import PhraseMatcher, Matcher
from spacy.tokens import Doc, Span, DocBin from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example, Corpus from spacy.training import Example, Corpus
from spacy.gold.converters import json2docs from spacy.training.converters import json2docs
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.lang.en import English from spacy.lang.en import English
from spacy.util import minibatch, ensure_path, load_model from spacy.util import minibatch, ensure_path, load_model

View File

@ -1,9 +1,7 @@
import pytest import pytest
from mock import Mock
from spacy.matcher import DependencyMatcher
from spacy.tokens import Doc, Span, DocBin from spacy.tokens import Doc, Span, DocBin
from spacy.gold import Example from spacy.training import Example
from spacy.gold.converters.conllu2docs import conllu2docs from spacy.training.converters.conllu2docs import conllu2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.kb import KnowledgeBase from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab from spacy.vocab import Vocab
@ -12,7 +10,7 @@ from spacy.util import ensure_path, load_model_from_path
import numpy import numpy
import pickle import pickle
from ..util import get_doc, make_tempdir from ..util import make_tempdir
def test_issue4528(en_vocab): def test_issue4528(en_vocab):

View File

@ -64,7 +64,7 @@ def tagger():
# 1. no model leads to error in serialization, # 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization # 2. the affected line is the one for model serialization
tagger.add_label("A") tagger.add_label("A")
tagger.begin_training(lambda: [], pipeline=nlp.pipeline) nlp.begin_training()
return tagger return tagger
@ -85,7 +85,7 @@ def entity_linker():
# need to add model for two reasons: # need to add model for two reasons:
# 1. no model leads to error in serialization, # 1. no model leads to error in serialization,
# 2. the affected line is the one for model serialization # 2. the affected line is the one for model serialization
entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) nlp.begin_training()
return entity_linker return entity_linker

View File

@ -1,14 +1,15 @@
import pytest import pytest
from click import NoSuchOption from click import NoSuchOption
from spacy.gold import docs_to_json, biluo_tags_from_offsets from spacy.training import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs from spacy.training.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.lang.en import English from spacy.lang.en import English
from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate from spacy.schemas import ProjectConfigSchema, RecommendationSchema, validate
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
from spacy.cli.init_config import init_config, RECOMMENDATIONS from spacy.cli.init_config import init_config, RECOMMENDATIONS
from spacy.cli._util import validate_project_commands, parse_config_overrides from spacy.cli._util import validate_project_commands, parse_config_overrides
from spacy.cli._util import load_project_config, substitute_project_variables from spacy.cli._util import load_project_config, substitute_project_variables
from spacy.cli._util import string_to_list
from thinc.config import ConfigValidationError from thinc.config import ConfigValidationError
import srsly import srsly
@ -372,17 +373,13 @@ def test_parse_config_overrides(args, expected):
assert parse_config_overrides(args) == expected assert parse_config_overrides(args) == expected
@pytest.mark.parametrize( @pytest.mark.parametrize("args", [["--foo"], ["--x.foo", "bar", "--baz"]])
"args", [["--foo"], ["--x.foo", "bar", "--baz"]],
)
def test_parse_config_overrides_invalid(args): def test_parse_config_overrides_invalid(args):
with pytest.raises(NoSuchOption): with pytest.raises(NoSuchOption):
parse_config_overrides(args) parse_config_overrides(args)
@pytest.mark.parametrize( @pytest.mark.parametrize("args", [["--x.foo", "bar", "baz"], ["x.foo"]])
"args", [["--x.foo", "bar", "baz"], ["x.foo"]],
)
def test_parse_config_overrides_invalid_2(args): def test_parse_config_overrides_invalid_2(args):
with pytest.raises(SystemExit): with pytest.raises(SystemExit):
parse_config_overrides(args) parse_config_overrides(args)
@ -401,3 +398,44 @@ def test_init_config(lang, pipeline, optimize):
def test_model_recommendations(): def test_model_recommendations():
for lang, data in RECOMMENDATIONS.items(): for lang, data in RECOMMENDATIONS.items():
assert RecommendationSchema(**data) assert RecommendationSchema(**data)
@pytest.mark.parametrize(
"value",
[
# fmt: off
"parser,textcat,tagger",
" parser, textcat ,tagger ",
'parser,textcat,tagger',
' parser, textcat ,tagger ',
' "parser"," textcat " ,"tagger "',
" 'parser',' textcat ' ,'tagger '",
'[parser,textcat,tagger]',
'["parser","textcat","tagger"]',
'[" parser" ,"textcat ", " tagger " ]',
"[parser,textcat,tagger]",
"[ parser, textcat , tagger]",
"['parser','textcat','tagger']",
"[' parser' , 'textcat', ' tagger ' ]",
# fmt: on
],
)
def test_string_to_list(value):
assert string_to_list(value, intify=False) == ["parser", "textcat", "tagger"]
@pytest.mark.parametrize(
"value",
[
# fmt: off
"1,2,3",
'[1,2,3]',
'["1","2","3"]',
'[" 1" ,"2 ", " 3 " ]',
"[' 1' , '2', ' 3 ' ]",
# fmt: on
],
)
def test_string_to_list_intify(value):
assert string_to_list(value, intify=False) == ["1", "2", "3"]
assert string_to_list(value, intify=True) == [1, 2, 3]

View File

@ -3,7 +3,7 @@ import pytest
from spacy.language import Language from spacy.language import Language
from spacy.tokens import Doc, Span from spacy.tokens import Doc, Span
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.gold import Example from spacy.training import Example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.util import registry from spacy.util import registry

View File

@ -1,5 +1,5 @@
import pytest import pytest
from spacy.gold.example import Example from spacy.training.example import Example
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.vocab import Vocab from spacy.vocab import Vocab

View File

@ -1,8 +1,8 @@
from numpy.testing import assert_almost_equal, assert_array_almost_equal from numpy.testing import assert_almost_equal, assert_array_almost_equal
import pytest import pytest
from pytest import approx from pytest import approx
from spacy.gold import Example from spacy.training import Example
from spacy.gold.iob_utils import biluo_tags_from_offsets from spacy.training.iob_utils import biluo_tags_from_offsets
from spacy.scorer import Scorer, ROCAUCScore from spacy.scorer import Scorer, ROCAUCScore
from spacy.scorer import _roc_auc_score, _roc_curve from spacy.scorer import _roc_auc_score, _roc_curve
from .util import get_doc from .util import get_doc

View File

@ -6,7 +6,7 @@ from spacy.ml.models.tok2vec import MishWindowEncoder, MaxoutWindowEncoder
from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener from spacy.pipeline.tok2vec import Tok2Vec, Tok2VecListener
from spacy.vocab import Vocab from spacy.vocab import Vocab
from spacy.tokens import Doc from spacy.tokens import Doc
from spacy.gold import Example from spacy.training import Example
from spacy import util from spacy import util
from spacy.lang.en import English from spacy.lang.en import English
from .util import get_batch from .util import get_batch
@ -89,6 +89,7 @@ def test_init_tok2vec():
tok2vec = nlp.add_pipe("tok2vec") tok2vec = nlp.add_pipe("tok2vec")
assert tok2vec.listeners == [] assert tok2vec.listeners == []
nlp.begin_training() nlp.begin_training()
assert tok2vec.model.get_dim("nO")
cfg_string = """ cfg_string = """

View File

@ -1,9 +1,10 @@
import numpy import numpy
from spacy.gold import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment from spacy.training import biluo_tags_from_offsets, offsets_from_biluo_tags, Alignment
from spacy.gold import spans_from_biluo_tags, iob_to_biluo from spacy.training import spans_from_biluo_tags, iob_to_biluo
from spacy.gold import Corpus, docs_to_json from spacy.training import Corpus, docs_to_json
from spacy.gold.example import Example from spacy.training.example import Example
from spacy.gold.converters import json2docs from spacy.training.converters import json2docs
from spacy.training.augment import make_orth_variants_example
from spacy.lang.en import English from spacy.lang.en import English
from spacy.tokens import Doc, DocBin from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch from spacy.util import get_words_and_spaces, minibatch
@ -12,7 +13,6 @@ import pytest
import srsly import srsly
from .util import make_tempdir from .util import make_tempdir
from ..gold.augment import make_orth_variants_example
@pytest.fixture @pytest.fixture

View File

@ -5,7 +5,7 @@ from .util import get_random_doc
from spacy import util from spacy import util
from spacy.util import dot_to_object, SimpleFrozenList from spacy.util import dot_to_object, SimpleFrozenList
from thinc.api import Config, Optimizer from thinc.api import Config, Optimizer
from spacy.gold.batchers import minibatch_by_words from spacy.training.batchers import minibatch_by_words
from ..lang.en import English from ..lang.en import English
from ..lang.nl import Dutch from ..lang.nl import Dutch
from ..language import DEFAULT_CONFIG_PATH from ..language import DEFAULT_CONFIG_PATH

View File

@ -24,7 +24,7 @@ from .util import registry
from .attrs import intify_attrs from .attrs import intify_attrs
from .symbols import ORTH from .symbols import ORTH
from .scorer import Scorer from .scorer import Scorer
from .gold import validate_examples from .training import validate_examples
cdef class Tokenizer: cdef class Tokenizer:

View File

@ -576,7 +576,7 @@ cdef class Doc:
entity_type = 0 entity_type = 0
kb_id = 0 kb_id = 0
# Set ent_iob to Missing (0) bij default unless this token was nered before # Set ent_iob to Missing (0) by default unless this token was nered before
ent_iob = 0 ent_iob = 0
if self.c[i].ent_iob != 0: if self.c[i].ent_iob != 0:
ent_iob = 2 ent_iob = 2

View File

@ -1,7 +1,7 @@
from wasabi import Printer from wasabi import Printer
from .. import tags_to_entities from .. import tags_to_entities
from ...gold import iob_to_biluo from ...training import iob_to_biluo
from ...lang.xx import MultiLanguage from ...lang.xx import MultiLanguage
from ...tokens import Doc, Span from ...tokens import Doc, Span
from ...util import load_model from ...util import load_model

View File

@ -1,7 +1,7 @@
import re import re
from .conll_ner2docs import n_sents_info from .conll_ner2docs import n_sents_info
from ...gold import iob_to_biluo, spans_from_biluo_tags from ...training import iob_to_biluo, spans_from_biluo_tags
from ...tokens import Doc, Token, Span from ...tokens import Doc, Token, Span
from ...vocab import Vocab from ...vocab import Vocab
from wasabi import Printer from wasabi import Printer

View File

@ -1,7 +1,7 @@
from wasabi import Printer from wasabi import Printer
from .conll_ner2docs import n_sents_info from .conll_ner2docs import n_sents_info
from ...gold import iob_to_biluo, tags_to_entities from ...training import iob_to_biluo, tags_to_entities
from ...tokens import Doc, Span from ...tokens import Doc, Span
from ...util import minibatch from ...util import minibatch

View File

@ -195,13 +195,15 @@ def tags_to_entities(tags):
continue continue
elif tag.startswith("I"): elif tag.startswith("I"):
if start is None: if start is None:
raise ValueError(Errors.E067.format(tags=tags[: i + 1])) raise ValueError(Errors.E067.format(start="I", tags=tags[: i + 1]))
continue continue
if tag.startswith("U"): if tag.startswith("U"):
entities.append((tag[2:], i, i)) entities.append((tag[2:], i, i))
elif tag.startswith("B"): elif tag.startswith("B"):
start = i start = i
elif tag.startswith("L"): elif tag.startswith("L"):
if start is None:
raise ValueError(Errors.E067.format(start="L", tags=tags[: i + 1]))
entities.append((tag[2:], start, i)) entities.append((tag[2:], start, i))
start = None start = None
else: else:
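For reference, a minimal sketch of how the function above is typically called — assuming it takes a flat list of BILUO tag strings and returns `(label, start, end)` token triples, as the appended branches suggest:

```python
# Minimal sketch: tags_to_entities turns BILUO tag strings into
# (label, start_token, end_token) triples; an "I" or "L" tag without a
# preceding "B" now raises E067 instead of failing silently.
from spacy.training import tags_to_entities

tags = ["O", "B-LOC", "L-LOC", "U-PERSON"]
print(tags_to_entities(tags))  # [('LOC', 1, 2), ('PERSON', 3, 3)]
```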

View File

@ -93,6 +93,7 @@ class registry(thinc.registry):
# environment. spaCy models packaged with `spacy package` will "advertise" # environment. spaCy models packaged with `spacy package` will "advertise"
# themselves via entry points. # themselves via entry points.
models = catalogue.create("spacy", "models", entry_points=True) models = catalogue.create("spacy", "models", entry_points=True)
cli = catalogue.create("spacy", "cli", entry_points=True)
class SimpleFrozenDict(dict): class SimpleFrozenDict(dict):
@ -647,7 +648,7 @@ def join_command(command: List[str]) -> str:
return " ".join(shlex.quote(cmd) for cmd in command) return " ".join(shlex.quote(cmd) for cmd in command)
def run_command(command: Union[str, List[str]], *, capture=False, stdin=None) -> None: def run_command(command: Union[str, List[str]], *, capture=False, stdin=None):
"""Run a command on the command line as a subprocess. If the subprocess """Run a command on the command line as a subprocess. If the subprocess
returns a non-zero exit code, a system exit is performed. returns a non-zero exit code, a system exit is performed.
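A hedged sketch of calling the function with its relaxed return annotation; whether `capture=True` returns the completed process is an assumption based on the annotation change above:

```python
# Hedged sketch: run_command executes a command as a subprocess and performs
# a system exit if it returns a non-zero exit code. The capture=True return
# value is an assumption, not confirmed by this diff.
from spacy.util import run_command

run_command(["python", "--version"])
```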

View File

@ -290,10 +290,10 @@ always be the **last element** in the row.
> ``` > ```
| Name | Description | | Name | Description |
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ | | `set_extra_annotations` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
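A hedged sketch of what a custom `set_extra_annotations` callback might look like, matching the signature in the table above; the `trf_batch` extension name is purely illustrative:

```python
# Illustrative callback matching Callable[[List[Doc], FullTransformerBatch], None].
# The "trf_batch" extension name is made up for this sketch.
from typing import List
from spacy.tokens import Doc

if not Doc.has_extension("trf_batch"):
    Doc.set_extension("trf_batch", default=None)

def set_extra_annotations(docs: List[Doc], trf_data) -> None:
    # stash the whole transformer output batch on each Doc for later inspection
    for doc in docs:
        doc._.trf_batch = trf_data
```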
### List {#list} ### List {#list}
@ -609,7 +609,6 @@ In addition to the native markdown elements, you can use the components
├── docs # the actual markdown content ├── docs # the actual markdown content
├── meta # JSON-formatted site metadata ├── meta # JSON-formatted site metadata
| ├── languages.json # supported languages and statistical models | ├── languages.json # supported languages and statistical models
| ├── logos.json # logos and links for landing page
| ├── sidebars.json # sidebar navigations for different sections | ├── sidebars.json # sidebar navigations for different sections
| ├── site.json # general site metadata | ├── site.json # general site metadata
| └── universe.json # data for the spaCy universe section | └── universe.json # data for the spaCy universe section

View File

@ -181,10 +181,10 @@ characters would be `"jumpping"`: 4 from the start, 4 from the end. This ensures
that the final character is always in the last position, instead of being in an that the final character is always in the last position, instead of being in an
arbitrary position depending on the word length. arbitrary position depending on the word length.
The characters are embedded in an embedding table with 256 rows, and the vectors The characters are embedded in an embedding table with a given number of rows,
concatenated. A hash-embedded vector of the `NORM` of the word is also and the vectors concatenated. A hash-embedded vector of the `NORM` of the word
concatenated on, and the result is then passed through a feed-forward network to is also concatenated on, and the result is then passed through a feed-forward
construct a single vector to represent the information. network to construct a single vector to represent the information.
| Name | Description | | Name | Description |
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
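To make the fixed-width character window concrete, a tiny sketch of the truncation described above; the helper name and the 4+4 split are illustrative, and the real layer presumably handles words shorter than the window by padding:

```python
# Illustrative only: take a fixed number of characters from the start and end
# of each word, so the final character always sits in the last position.
def char_window(word: str, n_start: int = 4, n_end: int = 4) -> str:
    return word[:n_start] + word[-n_end:]

assert char_window("jumping") == "jumpping"  # 4 from the start, 4 from the end
```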
@ -456,62 +456,6 @@ consists of either two or three subnetworks:
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ | | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Docs], List[List[Floats2d]]]~~ |
### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.BILUOTagger.v1 "
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[BILUO](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid BILUO tag
sequence. A BILUO tag sequence encodes a sequence of non-overlapping labelled
spans into tags assigned to each token. The first token of a span is given the
tag `B-LABEL`, the last token of the span is given the tag `L-LABEL`, and tokens
within the span are given the tag `I-LABEL`. Single-token spans are given the
tag `U-LABEL`. All other tokens are assigned the tag `O`. The BILUO tag scheme
generally results in better linear separation between classes, especially for
non-CRF models, because there are more distinct classes for the different
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
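For reference, the encoding described above can be reproduced with `biluo_tags_from_offsets`, which is imported from `spacy.training` elsewhere in this diff; the sentence and character offsets below are just an illustration:

```python
from spacy.lang.en import English
from spacy.training import biluo_tags_from_offsets

nlp = English()
doc = nlp("I like New York City")
# one labelled span covering "New York City" (character offsets 7-20)
tags = biluo_tags_from_offsets(doc, [(7, 20, "LOC")])
print(tags)  # ['O', 'O', 'B-LOC', 'I-LOC', 'L-LOC']
```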
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
> #### Example Config
>
> ```ini
> [model]
> @architectures = "spacy.IOBTagger.v1 "
>
> [model.tok2vec]
> @architectures = "spacy.HashEmbedCNN.v1"
> # etc.
> ```
Construct a simple NER tagger that predicts
[IOB](/usage/linguistic-features#accessing-ner) tag scores for each token and
uses greedy decoding with transition-constraints to return a valid IOB tag
sequence. An IOB tag sequence encodes a sequence of non-overlapping labeled
spans into tags assigned to each token. The first token of a span is given the
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
are assigned the tag O.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------ |
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], List[Floats2d]]~~ |
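The IOB scheme above maps onto BILUO via `iob_to_biluo`, also imported from `spacy.training` elsewhere in this diff; a minimal illustration:

```python
from spacy.training import iob_to_biluo

iob = ["O", "B-LOC", "I-LOC", "I-LOC", "O"]
print(iob_to_biluo(iob))  # ['O', 'B-LOC', 'I-LOC', 'L-LOC', 'O']
```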
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"} ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
### spacy.Tagger.v1 {#Tagger} ### spacy.Tagger.v1 {#Tagger}

View File

@ -38,7 +38,7 @@ how the component should be configured. You can override its settings via the
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ | | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py %%GITHUB_SPACY/spacy/pipeline/attributeruler.py
``` ```
## AttributeRuler.\_\_init\_\_ {#init tag="method"} ## AttributeRuler.\_\_init\_\_ {#init tag="method"}

Some files were not shown because too many files have changed in this diff