Merge branch 'v4' into feature/multiple-code-files

Paul O'Leary McCann 2023-02-02 12:34:59 +09:00
commit 5aff2b8204
225 changed files with 4265 additions and 2457 deletions

View File

@ -62,6 +62,11 @@ steps:
# - script: |
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
# displayName: 'Test no warnings on load (#11713)'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -m spacy download ca_core_news_sm 2>&1 | grep -q skipping
# displayName: 'Test skip re-download (#12188)'
# condition: eq(variables['python_version'], '3.8')
- script: |

.gitignore vendored (10 changes)
View File

@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
# Website
website/.cache/
website/public/
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js
# Cython / C extensions
cythonize.json
spacy/*.html

View File

@ -3,7 +3,7 @@ repos:
rev: 22.3.0
hooks:
- id: black
language_version: python3.7
language_version: python3.8
additional_dependencies: ['click==8.0.4']
- repo: https://github.com/pycqa/flake8
rev: 5.0.4

View File

@ -271,7 +271,7 @@ except: # noqa: E722
### Python conventions
All Python code must be written **compatible with Python 3.6+**. More detailed
All Python code must be written **compatible with Python 3.8+**. More detailed
code conventions can be found in the [developer docs](https://github.com/explosion/spaCy/blob/master/extra/DEVELOPER_DOCS/Code%20Conventions.md).
#### I/O and handling paths

View File

@ -5,7 +5,7 @@ override SPACY_EXTRAS = spacy-lookups-data==1.0.2 jieba spacy-pkuseg==0.0.28 sud
endif
ifndef PYVER
override PYVER = 3.6
override PYVER = 3.8
endif
VENV := ./env$(PYVER)

View File

@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.4 out now!**
💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
@ -105,7 +105,7 @@ For detailed installation instructions, see the
- **Operating system**: macOS / OS X · Linux · Windows (Cygwin, MinGW, Visual
Studio)
- **Python version**: Python 3.6+ (only 64 bit)
- **Python version**: Python 3.8+ (only 64 bit)
- **Package managers**: [pip] · [conda] (via `conda-forge`)
[pip]: https://pypi.org/project/spacy/

View File

@ -11,25 +11,39 @@ trigger:
exclude:
- "website/*"
- "*.md"
- "*.mdx"
- ".github/workflows/*"
pr:
paths:
exclude:
- "*.md"
- "*.mdx"
- "website/docs/*"
- "website/src/*"
- "website/meta/*.tsx"
- "website/meta/*.mjs"
- "website/meta/languages.json"
- "website/meta/site.json"
- "website/meta/sidebars.json"
- "website/meta/type-annotations.json"
- "website/pages/*"
- ".github/workflows/*"
jobs:
# Perform basic checks for most important errors (syntax etc.) Uses the config
# defined in .flake8 and overwrites the selected codes.
# Check formatting and linting. Perform basic checks for most important errors
# (syntax etc.) Uses the config defined in setup.cfg and overwrites the
# selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-latest"
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.7"
versionSpec: "3.8"
- script: |
pip install black==22.3.0
python -m black spacy --check
displayName: "black"
- script: |
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
@ -40,24 +54,6 @@ jobs:
strategy:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
# python.version: "3.6"
# Python36Mac:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
# imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
python.version: "3.7"
# Python37Mac:
# imageName: "macos-latest"
# python.version: "3.7"
# Python38Linux:
# imageName: "ubuntu-latest"
# python.version: "3.8"

View File

@ -5,7 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=9.0.0.dev1,<9.1.0",
"thinc>=9.0.0.dev2,<9.1.0",
"numpy>=1.15.0",
]
build-backend = "setuptools.build_meta"

View File

@ -1,9 +1,9 @@
# Our libraries
spacy-legacy>=3.0.11,<3.1.0
spacy-legacy>=4.0.0.dev0,<4.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev1,<9.1.0
thinc>=9.0.0.dev2,<9.1.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
wasabi>=0.9.1,<1.2.0
@ -22,7 +22,6 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
@ -31,8 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
mypy>=0.990,<0.1000; platform_machine != "aarch64"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests

View File

@ -17,8 +17,6 @@ classifiers =
Operating System :: Microsoft :: Windows
Programming Language :: Cython
Programming Language :: Python :: 3
Programming Language :: Python :: 3.6
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
@ -31,15 +29,15 @@ project_urls =
[options]
zip_safe = false
include_package_data = true
python_requires = >=3.6
python_requires = >=3.8
install_requires =
# Our libraries
spacy-legacy>=3.0.11,<3.1.0
spacy-legacy>=4.0.0.dev0,<4.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=9.0.0.dev1,<9.1.0
thinc>=9.0.0.dev2,<9.1.0
wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
@ -55,7 +53,6 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]

View File

@ -33,12 +33,10 @@ MOD_NAMES = [
"spacy.kb.candidate",
"spacy.kb.kb",
"spacy.kb.kb_in_memory",
"spacy.ml.parser_model",
"spacy.ml.tb_framework",
"spacy.morphology",
"spacy.pipeline.dep_parser",
"spacy.pipeline._edit_tree_internals.edit_trees",
"spacy.pipeline.morphologizer",
"spacy.pipeline.ner",
"spacy.pipeline.pipe",
"spacy.pipeline.trainable_pipe",
"spacy.pipeline.sentencizer",
@ -46,6 +44,7 @@ MOD_NAMES = [
"spacy.pipeline.tagger",
"spacy.pipeline.transition_parser",
"spacy.pipeline._parser_internals.arc_eager",
"spacy.pipeline._parser_internals.batch",
"spacy.pipeline._parser_internals.ner",
"spacy.pipeline._parser_internals.nonproj",
"spacy.pipeline._parser_internals.search",
@ -53,6 +52,7 @@ MOD_NAMES = [
"spacy.pipeline._parser_internals.stateclass",
"spacy.pipeline._parser_internals.transition_system",
"spacy.pipeline._parser_internals._beam_utils",
"spacy.pipeline._parser_internals._parser_utils",
"spacy.tokenizer",
"spacy.training.align",
"spacy.training.gold_io",

View File

@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
__version__ = "3.5.0"
__version__ = "4.0.0.dev0"
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
__projects__ = "https://github.com/explosion/projects"

View File

@ -4,6 +4,7 @@ from ._util import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401

View File

@ -1,4 +1,4 @@
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable
from typing import Dict, Any, Union, List, Optional, Tuple, Iterable, Literal
from typing import TYPE_CHECKING, overload
import sys
import shutil
@ -16,10 +16,10 @@ from thinc.util import gpu_is_available
from configparser import InterpolationError
import os
from ..compat import Literal
from ..schemas import ProjectConfigSchema, validate
from ..util import import_file, run_command, make_tempdir, registry, logger
from ..util import is_compatible_version, SimpleFrozenDict, ENV_VARS
from ..errors import RENAMED_LANGUAGE_CODES
from .. import about
if TYPE_CHECKING:
@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
# Wrappers for Typer's annotations. Initially created to set defaults and to
@ -54,12 +55,14 @@ Arg = typer.Argument
Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
app.add_typer(project_cli)
app.add_typer(debug_cli)
app.add_typer(benchmark_cli)
app.add_typer(init_cli)
@ -132,6 +135,16 @@ def _parse_override(value: Any) -> Any:
return str(value)
def _handle_renamed_language_codes(lang: Optional[str]) -> None:
# Throw error for renamed language codes in v4
if lang in RENAMED_LANGUAGE_CODES:
msg.fail(
title="Renamed language code",
text=f"Language code '{lang}' was replaced with '{RENAMED_LANGUAGE_CODES[lang]}' in spaCy v4. Update the language code from '{lang}' to '{RENAMED_LANGUAGE_CODES[lang]}'.",
exits=1,
)
def load_project_config(
path: Path, interpolate: bool = True, overrides: Dict[str, Any] = SimpleFrozenDict()
) -> Dict[str, Any]:

View File

@ -0,0 +1,174 @@
from typing import Iterable, List, Optional
import random
from itertools import islice
import numpy
from pathlib import Path
import time
from tqdm import tqdm
import typer
from wasabi import msg
from .. import util
from ..language import Language
from ..tokens import Doc
from ..training import Corpus
from ._util import Arg, Opt, benchmark_cli, setup_gpu
@benchmark_cli.command(
"speed",
context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def benchmark_speed_cli(
# fmt: off
ctx: typer.Context,
model: str = Arg(..., help="Model name or path"),
data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
# fmt: on
):
"""
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
data in the binary .spacy format.
"""
setup_gpu(use_gpu=use_gpu, silent=False)
nlp = util.load_model(model)
batch_size = batch_size if batch_size is not None else nlp.batch_size
corpus = Corpus(data_path)
docs = [eg.predicted for eg in corpus(nlp)]
if len(docs) == 0:
msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
print(f"Warming up for {warmup_epochs} epochs...")
warmup(nlp, docs, warmup_epochs, batch_size)
print()
print(f"Benchmarking {n_batches} batches...")
wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
print()
print_outliers(wps)
print_mean_with_ci(wps)
# Lowercased, behaves as a context manager function.
class time_context:
"""Register the running time of a context."""
def __enter__(self):
self.start = time.perf_counter()
return self
def __exit__(self, type, value, traceback):
self.elapsed = time.perf_counter() - self.start
class Quartiles:
"""Calculate the q1, q2, q3 quartiles and the inter-quartile range (iqr)
of a sample."""
q1: float
q2: float
q3: float
iqr: float
def __init__(self, sample: numpy.ndarray) -> None:
self.q1 = numpy.quantile(sample, 0.25)
self.q2 = numpy.quantile(sample, 0.5)
self.q3 = numpy.quantile(sample, 0.75)
self.iqr = self.q3 - self.q1
def annotate(
nlp: Language, docs: List[Doc], batch_size: Optional[int]
) -> numpy.ndarray:
docs = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
wps = []
while True:
with time_context() as elapsed:
batch_docs = list(
islice(docs, batch_size if batch_size else nlp.batch_size)
)
if len(batch_docs) == 0:
break
n_tokens = count_tokens(batch_docs)
wps.append(n_tokens / elapsed.elapsed)
return numpy.array(wps)
def benchmark(
nlp: Language,
docs: List[Doc],
n_batches: int,
batch_size: int,
shuffle: bool,
) -> numpy.ndarray:
if shuffle:
bench_docs = [
nlp.make_doc(random.choice(docs).text)
for _ in range(n_batches * batch_size)
]
else:
bench_docs = [
nlp.make_doc(docs[i % len(docs)].text)
for i in range(n_batches * batch_size)
]
return annotate(nlp, bench_docs, batch_size)
def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
"""Apply a statistic to repeated random samples of an array."""
return numpy.fromiter(
(
statistic(numpy.random.choice(x, len(x), replace=True))
for _ in range(iterations)
),
numpy.float64,
)
def count_tokens(docs: Iterable[Doc]) -> int:
return sum(len(doc) for doc in docs)
def print_mean_with_ci(sample: numpy.ndarray):
mean = numpy.mean(sample)
bootstrap_means = bootstrap(sample)
bootstrap_means.sort()
# 95% confidence interval
low = bootstrap_means[int(len(bootstrap_means) * 0.025)]
high = bootstrap_means[int(len(bootstrap_means) * 0.975)]
print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
def print_outliers(sample: numpy.ndarray):
quartiles = Quartiles(sample)
n_outliers = numpy.sum(
(sample < (quartiles.q1 - 1.5 * quartiles.iqr))
| (sample > (quartiles.q3 + 1.5 * quartiles.iqr))
)
n_extreme_outliers = numpy.sum(
(sample < (quartiles.q1 - 3.0 * quartiles.iqr))
| (sample > (quartiles.q3 + 3.0 * quartiles.iqr))
)
print(
f"Outliers: {(100 * n_outliers) / len(sample):.1f}%, extreme outliers: {(100 * n_extreme_outliers) / len(sample)}%"
)
def warmup(
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
) -> numpy.ndarray:
docs = warmup_epochs * docs
return annotate(nlp, docs, batch_size)
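For reference, a hedged sketch of driving the helpers above directly from Python; the model name and texts are placeholders, not part of this commit (the equivalent CLI call would be along the lines of `python -m spacy benchmark speed <model> <data.spacy>`):

import spacy
from spacy.cli.benchmark_speed import benchmark, print_mean_with_ci, print_outliers, warmup

# Hypothetical pipeline and documents; any loadable pipeline works.
nlp = spacy.load("en_core_web_sm")
docs = [nlp.make_doc(text) for text in ["A short benchmark text.", "Another short text."]]
warmup(nlp, docs, warmup_epochs=3, batch_size=32)
wps = benchmark(nlp, docs, n_batches=30, batch_size=32, shuffle=True)
print_outliers(wps)
print_mean_with_ci(wps)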

View File

@ -7,7 +7,7 @@ import re
import sys
import itertools
from ._util import app, Arg, Opt, walk_directory
from ._util import app, Arg, Opt, _handle_renamed_language_codes, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@ -112,6 +112,10 @@ def convert(
input_path = Path(input_path)
if not msg:
msg = Printer(no_print=silent)
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
ner_map = srsly.read_json(ner_map) if ner_map is not None else None
doc_files = []
for input_loc in walk_directory(input_path, converter):

View File

@ -1,5 +1,5 @@
from typing import Any, Dict, Iterable, List, Optional, Sequence, Set, Tuple, Union
from typing import cast, overload
from typing import Literal, cast, overload
from pathlib import Path
from collections import Counter
import sys
@ -17,10 +17,10 @@ from ..pipeline import TrainablePipe
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer
from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
from ..compat import Literal
from ..vectors import Mode as VectorsMode
from .. import util
@ -671,6 +671,59 @@ def debug_data(
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
)
if "trainable_lemmatizer" in factory_names:
msg.divider("Trainable Lemmatizer")
trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
# This is necessary context when someone is attempting to interpret whether the
# number of trees exclusively in the dev set is meaningful.
msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
dev_not_train = trees_dev - trees_train
if len(dev_not_train) != 0:
pct = len(dev_not_train) / len(trees_dev)
msg.info(
f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
" were found exclusively in the dev data."
)
else:
# Would we ever expect this case? It seems like it would be pretty rare,
# and we might actually want a warning?
msg.info("All trees in dev data present in training data.")
if gold_train_data["n_low_cardinality_lemmas"] > 0:
n = gold_train_data["n_low_cardinality_lemmas"]
msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
if gold_dev_data["n_low_cardinality_lemmas"] > 0:
n = gold_dev_data["n_low_cardinality_lemmas"]
msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
if gold_train_data["no_lemma_annotations"] > 0:
n = gold_train_data["no_lemma_annotations"]
msg.warn(f"{n} training docs with no lemma annotations.")
else:
msg.good("All training docs have lemma annotations.")
if gold_dev_data["no_lemma_annotations"] > 0:
n = gold_dev_data["no_lemma_annotations"]
msg.warn(f"{n} dev docs with no lemma annotations.")
else:
msg.good("All dev docs have lemma annotations.")
if gold_train_data["partial_lemma_annotations"] > 0:
n = gold_train_data["partial_lemma_annotations"]
msg.info(f"{n} training docs with partial lemma annotations.")
else:
msg.good("All training docs have complete lemma annotations.")
if gold_dev_data["partial_lemma_annotations"] > 0:
n = gold_dev_data["partial_lemma_annotations"]
msg.info(f"{n} dev docs with partial lemma annotations.")
else:
msg.good("All dev docs have complete lemma annotations.")
msg.divider("Summary")
good_counts = msg.counts[MESSAGES.GOOD]
warn_counts = msg.counts[MESSAGES.WARN]
@ -732,7 +785,13 @@ def _compile_gold(
"n_cats_multilabel": 0,
"n_cats_bad_values": 0,
"texts": set(),
"lemmatizer_trees": set(),
"no_lemma_annotations": 0,
"partial_lemma_annotations": 0,
"n_low_cardinality_lemmas": 0,
}
if "trainable_lemmatizer" in factory_names:
trees = EditTrees(nlp.vocab.strings)
for eg in examples:
gold = eg.reference
doc = eg.predicted
@ -862,6 +921,25 @@ def _compile_gold(
data["n_nonproj"] += 1
if nonproj.contains_cycle(aligned_heads):
data["n_cycles"] += 1
if "trainable_lemmatizer" in factory_names:
# from EditTreeLemmatizer._labels_from_data
if all(token.lemma == 0 for token in gold):
data["no_lemma_annotations"] += 1
continue
if any(token.lemma == 0 for token in gold):
data["partial_lemma_annotations"] += 1
lemma_set = set()
for token in gold:
if token.lemma != 0:
lemma_set.add(token.lemma)
tree_id = trees.add(token.text, token.lemma_)
tree_str = trees.tree_to_str(tree_id)
data["lemmatizer_trees"].add(tree_str)
# We want to identify cases where lemmas aren't assigned
# or are all assigned the same value, as this would indicate
# an issue since we're expecting a large set of lemmas
if len(lemma_set) < 2 and len(gold) > 1:
data["n_low_cardinality_lemmas"] += 1
return data
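For context, a hedged sketch of the edit-tree bookkeeping used above; the form/lemma pairs are invented, and regular inflections typically collapse onto a shared tree:

from spacy.strings import StringStore
from spacy.pipeline._edit_tree_internals.edit_trees import EditTrees

# Mirror the loop in _compile_gold: one tree string per distinct edit tree.
trees = EditTrees(StringStore())
tree_strs = set()
for form, lemma in [("walked", "walk"), ("talked", "talk"), ("was", "be")]:
    tree_strs.add(trees.tree_to_str(trees.add(form, lemma)))
# "walked"/"talked" likely share a tree, so expect two distinct trees here.
print(len(tree_strs))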

View File

@ -7,7 +7,8 @@ import typer
from ._util import app, Arg, Opt, WHEEL_SUFFIX, SDIST_SUFFIX
from .. import about
from ..util import is_package, get_minor_version, run_command
from ..util import is_prerelease_version
from ..util import is_prerelease_version, get_installed_models
from ..util import get_package_version
@app.command(
@ -63,6 +64,14 @@ def download(
compatibility = get_compatibility()
version = get_version(model_name, compatibility)
# If we already have this version installed, skip downloading
installed = get_installed_models()
if model_name in installed:
installed_version = get_package_version(model_name)
if installed_version == version:
msg.warn(f"{model_name} v{version} already installed, skipping")
return
filename = get_model_filename(model_name, version, sdist)
download_model(filename, pip_args)
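A hedged sketch of the skip behaviour added above, using the download function re-exported from cli/__init__.py; the model name is a placeholder:

from spacy.cli import download

# Hypothetical repeat download: the second call finds the same pinned version
# via get_installed_models()/get_package_version() and warns
# "... already installed, skipping" instead of fetching the package again.
download("ca_core_news_sm")
download("ca_core_news_sm")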

View File

@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
from ..training import Corpus
from ..tokens import Doc
from ._util import app, Arg, Opt, setup_gpu, import_code_paths
from ._util import app, Arg, Opt, setup_gpu, import_code_paths, benchmark_cli
from ..scorer import Scorer
from .. import util
from .. import displacy
@benchmark_cli.command(
"accuracy",
)
@app.command("evaluate")
def evaluate_cli(
# fmt: off
@ -36,7 +39,7 @@ def evaluate_cli(
dependency parses in a HTML file, set as output directory as the
displacy_path argument.
DOCS: https://spacy.io/api/cli#evaluate
DOCS: https://spacy.io/api/cli#benchmark-accuracy
"""
import_code_paths(code_paths)
evaluate(

View File

@ -8,11 +8,11 @@ import re
from jinja2 import Template
from .. import util
from ..language import DEFAULT_CONFIG_PRETRAIN_PATH
from ..language import DEFAULT_CONFIG_DISTILL_PATH, DEFAULT_CONFIG_PRETRAIN_PATH
from ..schemas import RecommendationSchema
from ..util import SimpleFrozenList
from ._util import init_cli, Arg, Opt, show_validation_error, COMMAND
from ._util import string_to_list, import_code
from ._util import string_to_list, import_code, _handle_renamed_language_codes
ROOT = Path(__file__).parent / "templates"
@ -43,7 +43,7 @@ class InitValues:
def init_config_cli(
# fmt: off
output_file: Path = Arg(..., help="File to save the config to or - for stdout (will only output config and no additional logging info)", allow_dash=True),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Two-letter code of the language to use"),
lang: str = Opt(InitValues.lang, "--lang", "-l", help="Code of the language to use"),
pipeline: str = Opt(",".join(InitValues.pipeline), "--pipeline", "-p", help="Comma-separated names of trainable pipeline components to include (without 'tok2vec' or 'transformer')"),
optimize: Optimizations = Opt(InitValues.optimize, "--optimize", "-o", help="Whether to optimize for efficiency (faster inference, smaller model, lower memory consumption) or higher accuracy (potentially larger and slower model). This will impact the choice of architecture, pretrained weights and related hyperparameters."),
gpu: bool = Opt(InitValues.gpu, "--gpu", "-G", help="Whether the model can run on GPU. This will impact the choice of architecture, pretrained weights and related hyperparameters."),
@ -83,6 +83,7 @@ def init_fill_config_cli(
# fmt: off
base_path: Path = Arg(..., help="Path to base config to fill", exists=True, dir_okay=False),
output_file: Path = Arg("-", help="Path to output .cfg file (or - for stdout)", allow_dash=True),
distillation: bool = Opt(False, "--distillation", "-dt", help="Include config for distillation (with 'spacy distill')"),
pretraining: bool = Opt(False, "--pretraining", "-pt", help="Include config for pretraining (with 'spacy pretrain')"),
diff: bool = Opt(False, "--diff", "-D", help="Print a visual diff highlighting the changes"),
code_path: Optional[Path] = Opt(None, "--code-path", "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
@ -98,13 +99,20 @@ def init_fill_config_cli(
DOCS: https://spacy.io/api/cli#init-fill-config
"""
import_code(code_path)
fill_config(output_file, base_path, pretraining=pretraining, diff=diff)
fill_config(
output_file,
base_path,
distillation=distillation,
pretraining=pretraining,
diff=diff,
)
def fill_config(
output_file: Path,
base_path: Path,
*,
distillation: bool = False,
pretraining: bool = False,
diff: bool = False,
silent: bool = False,
@ -123,6 +131,9 @@ def fill_config(
# replaced with their actual config after loading, so we have to re-add them
sourced = util.get_sourced_components(config)
filled["components"].update(sourced)
if distillation:
distillation_config = util.load_config(DEFAULT_CONFIG_DISTILL_PATH)
filled = distillation_config.merge(filled)
if pretraining:
validate_config_for_pretrain(filled, msg)
pretrain_config = util.load_config(DEFAULT_CONFIG_PRETRAIN_PATH)
@ -158,6 +169,10 @@ def init_config(
msg = Printer(no_print=silent)
with TEMPLATE_PATH.open("r") as f:
template = Template(f.read())
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
# Filter out duplicates since tok2vec and transformer are added by template
pipeline = [pipe for pipe in pipeline if pipe not in ("tok2vec", "transformer")]
defaults = RECOMMENDATIONS["__default__"]

View File

@ -9,7 +9,7 @@ from .. import util
from ..training.initialize import init_nlp, convert_vectors
from ..language import Language
from ._util import init_cli, Arg, Opt, parse_config_overrides, show_validation_error
from ._util import import_code, setup_gpu
from ._util import import_code, setup_gpu, _handle_renamed_language_codes
@init_cli.command("vectors")
@ -31,6 +31,10 @@ def init_vectors_cli(
a model with vectors.
"""
util.logger.setLevel(logging.DEBUG if verbose else logging.INFO)
# Throw error for renamed language codes in v4
_handle_renamed_language_codes(lang)
msg.info(f"Creating blank nlp object for language '{lang}'")
nlp = util.get_lang_class(lang)()
if jsonl_loc is not None:

View File

@ -87,12 +87,11 @@ grad_factor = 1.0
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = false
nO = null
[components.parser.model.tok2vec]
@ -108,12 +107,11 @@ grad_factor = 1.0
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@ -314,12 +312,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "parser"
[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 128
maxout_pieces = 3
use_upper = true
nO = null
[components.parser.model.tok2vec]
@ -332,12 +329,11 @@ width = ${components.tok2vec.model.encode.width}
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
nO = null
[components.ner.model.tok2vec]

View File

@ -22,19 +22,6 @@ try:
except ImportError:
cupy = None
if sys.version_info[:2] >= (3, 8): # Python 3.8+
from typing import Literal, Protocol, runtime_checkable
else:
from typing_extensions import Literal, Protocol, runtime_checkable # noqa: F401
# Important note: The importlib_metadata "backport" includes functionality
# that's not part of the built-in importlib.metadata. We should treat this
# import like the built-in and only use what's available there.
try: # Python 3.8+
import importlib.metadata as importlib_metadata
except ImportError:
from catalogue import _importlib_metadata as importlib_metadata # type: ignore[no-redef] # noqa: F401
from thinc.api import Optimizer # noqa: F401
pickle = pickle

View File

@ -0,0 +1,34 @@
[paths]
raw_text = null
[distillation]
corpus = "corpora.distillation"
dropout = 0.1
max_epochs = 1
max_steps = 0
student_to_teacher = {}
[distillation.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 3000
discard_oversize = false
tolerance = 0.2
[distillation.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 1e-4
[corpora]
[corpora.distillation]
@readers = "spacy.PlainTextCorpus.v1"
path = ${paths.raw_text}
min_length = 0
max_length = 0
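A hedged sketch of merging this [distillation] block into an existing config through the new flag on init fill-config; the paths are placeholders:

from pathlib import Path
from spacy.cli.init_config import fill_config

# Equivalent to: python -m spacy init fill-config base.cfg config.cfg --distillation
# base.cfg and config.cfg are hypothetical paths.
fill_config(Path("config.cfg"), Path("base.cfg"), distillation=True)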

View File

@ -106,9 +106,7 @@ def serve(
if is_in_jupyter():
warnings.warn(Warnings.W011)
render(
docs, style=style, page=page, minify=minify, options=options, manual=manual
)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n")

View File

@ -1,5 +1,5 @@
from typing import Literal
import warnings
from .compat import Literal
class ErrorsWithCodes(type):
@ -209,6 +209,8 @@ class Warnings(metaclass=ErrorsWithCodes):
"`enabled` ({enabled}). Be aware that this might affect other components in your pipeline.")
W124 = ("{host}:{port} is already in use, using the nearest available port {serve_port} as an alternative.")
W400 = ("`use_upper=False` is ignored, the upper layer is always enabled")
class Errors(metaclass=ErrorsWithCodes):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
@ -947,15 +949,20 @@ class Errors(metaclass=ErrorsWithCodes):
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
"with `displacy.serve(doc, port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port)` "
"with `displacy.serve(doc, port=port)`")
E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.")
# v4 error strings
E4000 = ("Expected a Doc as input, but got: '{type}'")
E4001 = ("Expected input to be one of the following types: ({expected_types}), "
"but got '{received_type}'")
E4002 = ("Pipe '{name}' requires a teacher pipe for distillation.")
E4003 = ("Training examples for distillation must have the exact same tokens in the "
"reference and predicted docs.")
E4004 = ("Backprop is not supported when is_train is not set.")
RENAMED_LANGUAGE_CODES = {"xx": "mul", "is": "isl"}
# fmt: on
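For illustration, a minimal check of the renamed-code table above; the same mapping drives _handle_renamed_language_codes in cli/_util.py:

from spacy.errors import RENAMED_LANGUAGE_CODES

# 'xx' (multi-language) and 'is' (Icelandic) were renamed in v4.
assert RENAMED_LANGUAGE_CODES["xx"] == "mul"
assert RENAMED_LANGUAGE_CODES["is"] == "isl"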

View File

@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb_in_memory
DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):

View File

@ -7,7 +7,7 @@ class IcelandicDefaults(BaseDefaults):
class Icelandic(Language):
lang = "is"
lang = "isl"
Defaults = IcelandicDefaults

View File

@ -3,10 +3,10 @@ from ...language import Language
class MultiLanguage(Language):
"""Language class to be used for models that support multiple languages.
This module allows models to specify their language ID as 'xx'.
This module allows models to specify their language ID as 'mul'.
"""
lang = "xx"
lang = "mul"
__all__ = ["MultiLanguage"]

View File

@ -1,4 +1,4 @@
from typing import Iterator, Optional, Any, Dict, Callable, Iterable
from typing import Iterator, Optional, Any, Dict, Callable, Iterable, Literal
from typing import Union, Tuple, List, Set, Pattern, Sequence
from typing import NoReturn, TYPE_CHECKING, TypeVar, cast, overload
@ -22,7 +22,7 @@ from . import ty
from .tokens.underscore import Underscore
from .vocab import Vocab, create_vocab
from .pipe_analysis import validate_attrs, analyze_pipes, print_pipe_analysis
from .training import Example, validate_examples
from .training import Example, validate_examples, validate_distillation_examples
from .training.initialize import init_vocab, init_tok2vec
from .scorer import Scorer
from .util import registry, SimpleFrozenList, _pipe, raise_error, _DEFAULT_EMPTY_PIPES
@ -40,7 +40,6 @@ from .git_info import GIT_VERSION
from . import util
from . import about
from .lookups import load_lookups
from .compat import Literal
PipeCallable = Callable[[Doc], Doc]
@ -49,6 +48,9 @@ PipeCallable = Callable[[Doc], Doc]
# This is the base config with all settings (training etc.)
DEFAULT_CONFIG_PATH = Path(__file__).parent / "default_config.cfg"
DEFAULT_CONFIG = util.load_config(DEFAULT_CONFIG_PATH)
# This is the base config for the [distillation] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_DISTILL_PATH = Path(__file__).parent / "default_config_distillation.cfg"
# This is the base config for the [pretraining] block and currently not included
# in the main config and only added via the 'init fill-config' command
DEFAULT_CONFIG_PRETRAIN_PATH = Path(__file__).parent / "default_config_pretraining.cfg"
@ -1018,6 +1020,102 @@ class Language:
raise ValueError(Errors.E005.format(name=name, returned_type=type(doc)))
return doc
def distill(
self,
teacher: "Language",
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
exclude: Iterable[str] = SimpleFrozenList(),
annotates: Iterable[str] = SimpleFrozenList(),
student_to_teacher: Optional[Dict[str, str]] = None,
):
"""Distill the models in a student pipeline from a teacher pipeline.
teacher (Language): Teacher to distill from.
examples (Iterable[Example]): Distillation examples. The reference
(teacher) and predicted (student) docs must have the same number of
tokens and the same orthography.
drop (float): The dropout rate.
sgd (Optional[Optimizer]): An optimizer.
losses (Optional[Dict[str, float]]): Dictionary to update with the loss,
keyed by component.
component_cfg (Optional[Dict[str, Dict[str, Any]]]): Config parameters
for specific pipeline components, keyed by component name.
exclude (Iterable[str]): Names of components that shouldn't be updated.
annotates (Iterable[str]): Names of components that should set
annotations on the predicted examples after updating.
student_to_teacher (Optional[Dict[str, str]]): Map student pipe name to
teacher pipe name, only needed for pipes where the student pipe
name does not match the teacher pipe name.
RETURNS (Dict[str, float]): The updated losses dictionary
DOCS: https://spacy.io/api/language#distill
"""
if student_to_teacher is None:
student_to_teacher = {}
if losses is None:
losses = {}
if isinstance(examples, list) and len(examples) == 0:
return losses
validate_distillation_examples(examples, "Language.distill")
examples = _copy_examples(examples, copy_x=True, copy_y=True)
if sgd is None:
if self._optimizer is None:
self._optimizer = self.create_optimizer()
sgd = self._optimizer
if component_cfg is None:
component_cfg = {}
pipe_kwargs = {}
for student_name, student_proc in self.pipeline:
component_cfg.setdefault(student_name, {})
pipe_kwargs[student_name] = deepcopy(component_cfg[student_name])
component_cfg[student_name].setdefault("drop", drop)
pipe_kwargs[student_name].setdefault("batch_size", self.batch_size)
teacher_pipes = dict(teacher.pipeline)
for student_name, student_proc in self.pipeline:
if student_name in annotates:
for doc, eg in zip(
_pipe(
(eg.predicted for eg in examples),
proc=student_proc,
name=student_name,
default_error_handler=self.default_error_handler,
kwargs=pipe_kwargs[student_name],
),
examples,
):
eg.predicted = doc
if (
student_name not in exclude
and isinstance(student_proc, ty.DistillableComponent)
and student_proc.is_distillable
):
# A missing teacher pipe is not an error; some student pipes
# do not need a teacher, such as tok2vec layer losses.
teacher_name = (
student_to_teacher[student_name]
if student_name in student_to_teacher
else student_name
)
teacher_pipe = teacher_pipes.get(teacher_name, None)
student_proc.distill(
teacher_pipe,
examples,
sgd=sgd,
losses=losses,
**component_cfg[student_name],
)
return losses
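A minimal usage sketch of the new method; the model name and text are placeholders, and with no distillable student pipes the call is effectively a no-op that returns the losses dict:

import spacy
from spacy.training import Example

# Hypothetical teacher/student pair that share tokenization rules.
teacher = spacy.load("en_core_web_sm")
student = spacy.blank("en")
text = "Distillation needs matching tokens."
examples = [Example(student.make_doc(text), teacher(text))]
losses = student.distill(teacher, examples, drop=0.1)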
def disable_pipes(self, *names) -> "DisabledPipes":
"""Disable one or more pipeline components. If used as a context
manager, the pipeline will be restored to the initial state at the end
@ -1243,12 +1341,16 @@ class Language:
self,
get_examples: Optional[Callable[[], Iterable[Example]]] = None,
*,
labels: Optional[Dict[str, Any]] = None,
sgd: Optional[Optimizer] = None,
) -> Optimizer:
"""Initialize the pipe for training, using data examples if available.
get_examples (Callable[[], Iterable[Example]]): Optional function that
returns gold-standard Example objects.
labels (Optional[Dict[str, Any]]): Labels to pass to pipe initialization,
using the names of the pipes as keys. Overrides labels that are in
the model configuration.
sgd (Optional[Optimizer]): An optimizer to use for updates. If not
provided, will be created using the .create_optimizer() method.
RETURNS (thinc.api.Optimizer): The optimizer.
@ -1293,6 +1395,8 @@ class Language:
for name, proc in self.pipeline:
if isinstance(proc, ty.InitializableComponent):
p_settings = I["components"].get(name, {})
if labels is not None and name in labels:
p_settings["labels"] = labels[name]
p_settings = validate_init_settings(
proc.initialize, p_settings, section="components", name=name
)
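A hedged sketch of the new labels override; the pipe and tag names are hypothetical, and each value is forwarded as that pipe's 'labels' init setting:

import spacy

# Hypothetical: pre-declare tagger labels instead of inferring them from data.
nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.initialize(labels={"tagger": ["NOUN", "VERB"]})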
@ -1726,6 +1830,7 @@ class Language:
# using the nlp.config with all defaults.
config = util.copy_config(config)
orig_pipeline = config.pop("components", {})
orig_distill = config.pop("distill", None)
orig_pretraining = config.pop("pretraining", None)
config["components"] = {}
if auto_fill:
@ -1734,6 +1839,9 @@ class Language:
filled = config
filled["components"] = orig_pipeline
config["components"] = orig_pipeline
if orig_distill is not None:
filled["distill"] = orig_distill
config["distill"] = orig_distill
if orig_pretraining is not None:
filled["pretraining"] = orig_pretraining
config["pretraining"] = orig_pretraining
@ -2223,13 +2331,18 @@ class DisabledPipes(list):
self[:] = []
def _copy_examples(examples: Iterable[Example]) -> List[Example]:
def _copy_examples(
examples: Iterable[Example], *, copy_x: bool = True, copy_y: bool = False
) -> List[Example]:
"""Make a copy of a batch of examples, copying the predicted Doc as well.
This is used in contexts where we need to take ownership of the examples
so that they can be mutated, for instance during Language.evaluate and
Language.update.
"""
return [Example(eg.x.copy(), eg.y) for eg in examples]
return [
Example(eg.x.copy() if copy_x else eg.x, eg.y.copy() if copy_y else eg.y)
for eg in examples
]
def _apply_pipes(

View File

@ -41,7 +41,7 @@ cdef class Lexeme:
"""
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
self.c = <LexemeC*><void*>vocab.get_by_orth(orth)
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

View File

@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
max_edits = fuzzy
else:
# allow at least two edits (to allow at least one transposition) and up
# to 20% of the pattern string length
# to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
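A worked example of the new ceiling, mirroring the arithmetic above:

# For a 10-character pattern the default edit budget rises from
# max(2, round(0.2 * 10)) == 2 to max(2, round(0.3 * 10)) == 3.
pattern_text = "mispelling"  # hypothetical 10-character pattern
max_edits = max(2, round(0.3 * len(pattern_text)))
assert max_edits == 3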

View File

@ -1,12 +1,15 @@
from typing import Any, List, Dict, Tuple, Optional, Callable, Union
from typing import Any, List, Dict, Tuple, Optional, Callable, Union, Literal
from typing import Iterator, Iterable, overload
from ..compat import Literal
from ..vocab import Vocab
from ..tokens import Doc, Span
class Matcher:
def __init__(self, vocab: Vocab, validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
def __init__(
self,
vocab: Vocab,
validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...,
) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...

View File

@ -1,5 +1,5 @@
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, overload
from ..compat import Literal
from typing import List, Tuple, Union, Optional, Callable, Any, Dict, Literal
from typing import overload
from .matcher import Matcher
from ..vocab import Vocab
from ..tokens import Doc, Span

View File

@ -1,164 +0,0 @@
from thinc.api import Model, normal_init
from ..util import registry
@registry.layers("spacy.PrecomputableAffine.v1")
def PrecomputableAffine(nO, nI, nF, nP, dropout=0.1):
model = Model(
"precomputable_affine",
forward,
init=init,
dims={"nO": nO, "nI": nI, "nF": nF, "nP": nP},
params={"W": None, "b": None, "pad": None},
attrs={"dropout_rate": dropout},
)
return model
def forward(model, X, is_train):
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.get_param("W")
# Preallocate array for layer output, including padding.
Yf = model.ops.alloc2f(X.shape[0] + 1, nF * nO * nP, zeros=False)
model.ops.gemm(X, W.reshape((nF * nO * nP, nI)), trans2=True, out=Yf[1:])
Yf = Yf.reshape((Yf.shape[0], nF, nO, nP))
# Set padding. Padding has shape (1, nF, nO, nP). Unfortunately, we cannot
# change its shape to (nF, nO, nP) without breaking existing models. So
# we'll squeeze the first dimension here.
Yf[0] = model.ops.xp.squeeze(model.get_param("pad"), 0)
def backward(dY_ids):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nO, nP), and get back:
# (nB, nO, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nO, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
assert dY.ndim == 3
assert dY.shape[1] == nO, dY.shape
assert dY.shape[2] == nP, dY.shape
# nB = dY.shape[0]
model.inc_grad("pad", _backprop_precomputable_affine_padding(model, dY, ids))
Xf = X[ids]
Xf = Xf.reshape((Xf.shape[0], nF * nI))
model.inc_grad("b", dY.sum(axis=0))
dY = dY.reshape((dY.shape[0], nO * nP))
Wopfi = W.transpose((1, 2, 0, 3))
Wopfi = Wopfi.reshape((nO * nP, nF * nI))
dXf = model.ops.gemm(dY.reshape((dY.shape[0], nO * nP)), Wopfi)
dWopfi = model.ops.gemm(dY, Xf, trans1=True)
dWopfi = dWopfi.reshape((nO, nP, nF, nI))
# (o, p, f, i) --> (f, o, p, i)
dWopfi = dWopfi.transpose((2, 0, 1, 3))
model.inc_grad("W", dWopfi)
return dXf.reshape((dXf.shape[0], nF, nI))
return Yf, backward
def _backprop_precomputable_affine_padding(model, dY, ids):
nB = dY.shape[0]
nF = model.get_dim("nF")
nP = model.get_dim("nP")
nO = model.get_dim("nO")
# Backprop the "padding", used as a filler for missing values.
# Values that are missing are set to -1, and each state vector could
# have multiple missing values. The padding has different values for
# different missing features. The gradient of the padding vector is:
#
# for b in range(nB):
# for f in range(nF):
# if ids[b, f] < 0:
# d_pad[f] += dY[b]
#
# Which can be rewritten as:
#
# (ids < 0).T @ dY
mask = model.ops.asarray(ids < 0, dtype="f")
d_pad = model.ops.gemm(mask, dY.reshape(nB, nO * nP), trans1=True)
return d_pad.reshape((1, nF, nO, nP))
def init(model, X=None, Y=None):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
if model.has_param("W") and model.get_param("W").any():
return
nF = model.get_dim("nF")
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nO, nP, nI)
b = model.ops.alloc2f(nO, nP)
pad = model.ops.alloc4f(1, nF, nO, nP)
ops = model.ops
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
pad = normal_init(ops, pad.shape, mean=1.0)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens = model.predict(tokvecs[:-1]) # (nW, f, o, p)
vectors = model.ops.alloc((ids.shape[0], nO * nP), dtype="f")
# need nS vectors
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nO * nP))
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors = vectors.reshape((vectors.shape[0], nO, nP))
vectors += b
vectors = model.ops.asarray(vectors)
if nP >= 2:
return model.ops.maxout(vectors)[0]
else:
return vectors * (vectors >= 0)
tol_var = 0.01
tol_mean = 0.01
t_max = 10
W = model.get_param("W").copy()
b = model.get_param("b").copy()
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
W /= model.ops.xp.sqrt(var)
model.set_param("W", W)
elif abs(mean) >= tol_mean:
b -= mean
model.set_param("b", b)
else:
break

View File

@ -23,6 +23,7 @@ DEFAULT_NVTX_ANNOTATABLE_PIPE_METHODS = [
"update",
"rehearse",
"get_loss",
"get_teacher_student_loss",
"initialize",
"begin_update",
"finish_update",

View File

@ -1,17 +1,19 @@
from typing import Optional, List, cast
from thinc.api import Model, chain, list2array, Linear, zero_init, use_ops
from typing import Optional, List, Tuple, Any, Literal
from thinc.types import Floats2d
from thinc.api import Model
import warnings
from ...errors import Errors
from ...compat import Literal
from ...errors import Errors, Warnings
from ...util import registry
from .._precomputable_affine import PrecomputableAffine
from ..tb_framework import TransitionModel
from ...tokens import Doc
from ...tokens.doc import Doc
TransitionSystem = Any # TODO
State = Any # TODO
@registry.architectures("spacy.TransitionBasedParser.v2")
def build_tb_parser_model(
@registry.architectures.register("spacy.TransitionBasedParser.v2")
def transition_parser_v2(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
@ -19,6 +21,46 @@ def build_tb_parser_model(
maxout_pieces: int,
use_upper: bool,
nO: Optional[int] = None,
) -> Model:
if not use_upper:
warnings.warn(Warnings.W400)
return build_tb_parser_model(
tok2vec,
state_type,
extra_state_tokens,
hidden_width,
maxout_pieces,
nO=nO,
)
@registry.architectures.register("spacy.TransitionBasedParser.v3")
def transition_parser_v3(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
nO: Optional[int] = None,
) -> Model:
return build_tb_parser_model(
tok2vec,
state_type,
extra_state_tokens,
hidden_width,
maxout_pieces,
nO=nO,
)
def build_tb_parser_model(
tok2vec: Model[List[Doc], List[Floats2d]],
state_type: Literal["parser", "ner"],
extra_state_tokens: bool,
hidden_width: int,
maxout_pieces: int,
nO: Optional[int] = None,
) -> Model:
"""
Build a transition-based parser model. Can apply to NER or dependency-parsing.
@ -51,14 +93,7 @@ def build_tb_parser_model(
feature sets (for the NER) or 13 (for the parser).
hidden_width (int): The width of the hidden layer.
maxout_pieces (int): How many pieces to use in the state prediction layer.
Recommended values are 1, 2 or 3. If 1, the maxout non-linearity
is replaced with a ReLu non-linearity if use_upper=True, and no
non-linearity if use_upper=False.
use_upper (bool): Whether to use an additional hidden layer after the state
vector in order to predict the action scores. It is recommended to set
this to False for large pretrained models such as transformers, and True
for smaller networks. The upper layer is computed on CPU, which becomes
a bottleneck on larger GPU-based models, where it's also less necessary.
Recommended values are 1, 2 or 3.
nO (int or None): The number of actions the model will predict between.
Usually inferred from data at the beginning of training, or loaded from
disk.
@ -69,106 +104,11 @@ def build_tb_parser_model(
nr_feature_tokens = 6 if extra_state_tokens else 3
else:
raise ValueError(Errors.E917.format(value=state_type))
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
list2array(),
Linear(hidden_width, t2v_width),
return TransitionModel(
tok2vec=tok2vec,
state_tokens=nr_feature_tokens,
hidden_width=hidden_width,
maxout_pieces=maxout_pieces,
nO=nO,
unseen_classes=set(),
)
tok2vec.set_dim("nO", hidden_width)
lower = _define_lower(
nO=hidden_width if use_upper else nO,
nF=nr_feature_tokens,
nI=tok2vec.get_dim("nO"),
nP=maxout_pieces,
)
upper = None
if use_upper:
with use_ops("cpu"):
# Initialize weights at zero, as it's a classification layer.
upper = _define_upper(nO=nO, nI=None)
return TransitionModel(tok2vec, lower, upper, resize_output)
def _define_upper(nO, nI):
return Linear(nO=nO, nI=nI, init_W=zero_init)
def _define_lower(nO, nF, nI, nP):
return PrecomputableAffine(nO=nO, nF=nF, nI=nI, nP=nP)
def resize_output(model, new_nO):
if model.attrs["has_upper"]:
return _resize_upper(model, new_nO)
return _resize_lower(model, new_nO)
def _resize_upper(model, new_nO):
upper = model.get_ref("upper")
if upper.has_dim("nO") is None:
upper.set_dim("nO", new_nO)
return model
elif new_nO == upper.get_dim("nO"):
return model
smaller = upper
nI = smaller.maybe_get_dim("nI")
with use_ops("cpu"):
larger = _define_upper(nO=new_nO, nI=nI)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
larger_W = larger.ops.alloc2f(new_nO, nI)
larger_b = larger.ops.alloc1f(new_nO)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
# Weights are stored in (nr_out, nr_in) format, so we're basically
# just adding rows here.
if smaller.has_dim("nO"):
old_nO = smaller.get_dim("nO")
larger_W[:old_nO] = smaller_W
larger_b[:old_nO] = smaller_b
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
model._layers[-1] = larger
model.set_ref("upper", larger)
return model
def _resize_lower(model, new_nO):
lower = model.get_ref("lower")
if lower.has_dim("nO") is None:
lower.set_dim("nO", new_nO)
return model
smaller = lower
nI = smaller.maybe_get_dim("nI")
nF = smaller.maybe_get_dim("nF")
nP = smaller.maybe_get_dim("nP")
larger = _define_lower(nO=new_nO, nI=nI, nF=nF, nP=nP)
# it could be that the model is not initialized yet, then skip this bit
if smaller.has_param("W"):
larger_W = larger.ops.alloc4f(nF, new_nO, nP, nI)
larger_b = larger.ops.alloc2f(new_nO, nP)
larger_pad = larger.ops.alloc4f(1, nF, new_nO, nP)
smaller_W = smaller.get_param("W")
smaller_b = smaller.get_param("b")
smaller_pad = smaller.get_param("pad")
# Copy the old weights and padding into the new layer
if smaller.has_dim("nO"):
old_nO = smaller.get_dim("nO")
larger_W[:, 0:old_nO, :, :] = smaller_W
larger_pad[:, :, 0:old_nO, :] = smaller_pad
larger_b[0:old_nO, :] = smaller_b
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
larger.set_param("W", larger_W)
larger.set_param("b", larger_b)
larger.set_param("pad", larger_pad)
model._layers[1] = larger
model.set_ref("lower", larger)
return model

View File

@ -1,49 +0,0 @@
from libc.string cimport memset, memcpy
from thinc.backends.cblas cimport CBlas
from ..typedefs cimport weight_t, hash_t
from ..pipeline._parser_internals._state cimport StateC
cdef struct SizesC:
int states
int classes
int hiddens
int pieces
int feats
int embed_width
cdef struct WeightsC:
const float* feat_weights
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const float* seen_classes
cdef struct ActivationsC:
int* token_ids
float* unmaxed
float* scores
float* hiddens
int* is_valid
int _curr_size
int _max_size
cdef WeightsC get_c_weights(model) except *
cdef SizesC get_c_sizes(model, int batch_size) except *
cdef ActivationsC alloc_activations(SizesC n) nogil
cdef void free_activations(const ActivationsC* A) nogil
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil
cdef void cpu_log_loss(float* d_scores,
const float* costs, const int* is_valid, const float* scores, int O) nogil

View File

@ -1,500 +0,0 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
cimport numpy as np
from libc.math cimport exp
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from thinc.backends.cblas cimport saxpy, sgemm
import numpy
import numpy.random
from thinc.api import Model, CupyOps, NumpyOps, get_ops
from .. import util
from ..errors import Errors
from ..typedefs cimport weight_t, class_t, hash_t
from ..pipeline._parser_internals.stateclass cimport StateClass
cdef WeightsC get_c_weights(model) except *:
cdef WeightsC output
cdef precompute_hiddens state2vec = model.state2vec
output.feat_weights = state2vec.get_feat_weights()
output.feat_bias = <const float*>state2vec.bias.data
cdef np.ndarray vec2scores_W
cdef np.ndarray vec2scores_b
if model.vec2scores is None:
output.hidden_weights = NULL
output.hidden_bias = NULL
else:
vec2scores_W = model.vec2scores.get_param("W")
vec2scores_b = model.vec2scores.get_param("b")
output.hidden_weights = <const float*>vec2scores_W.data
output.hidden_bias = <const float*>vec2scores_b.data
cdef np.ndarray class_mask = model._class_mask
output.seen_classes = <const float*>class_mask.data
return output
cdef SizesC get_c_sizes(model, int batch_size) except *:
cdef SizesC output
output.states = batch_size
if model.vec2scores is None:
output.classes = model.state2vec.get_dim("nO")
else:
output.classes = model.vec2scores.get_dim("nO")
output.hiddens = model.state2vec.get_dim("nO")
output.pieces = model.state2vec.get_dim("nP")
output.feats = model.state2vec.get_dim("nF")
output.embed_width = model.tokvecs.shape[1]
return output
cdef ActivationsC alloc_activations(SizesC n) nogil:
cdef ActivationsC A
memset(&A, 0, sizeof(A))
resize_activations(&A, n)
return A
cdef void free_activations(const ActivationsC* A) nogil:
free(A.token_ids)
free(A.scores)
free(A.unmaxed)
free(A.hiddens)
free(A.is_valid)
cdef void resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size:
A._curr_size = n.states
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.scores = <float*>calloc(n.states * n.classes, sizeof(A.scores[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.scores = <float*>realloc(A.scores,
n.states * n.classes * sizeof(A.scores[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.hiddens = <float*>realloc(A.hiddens,
n.states * n.hiddens * sizeof(A.hiddens[0]))
A.is_valid = <int*>realloc(A.is_valid,
n.states * n.classes * sizeof(A.is_valid[0]))
A._max_size = n.states
A._curr_size = n.states
cdef void predict_states(CBlas cblas, ActivationsC* A, StateC** states,
const WeightsC* W, SizesC n) nogil:
cdef double one = 1.0
resize_activations(A, n)
for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
memset(A.hiddens, 0, n.states * n.hiddens * sizeof(float))
sum_state_features(cblas, A.unmaxed,
W.feat_weights, A.token_ids, n.states, n.feats, n.hiddens * n.pieces)
for i in range(n.states):
saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces
which = _arg_max(&A.unmaxed[index], n.pieces)
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
memset(A.scores, 0, n.states * n.classes * sizeof(float))
if W.hidden_weights == NULL:
memcpy(A.scores, A.hiddens, n.states * n.classes * sizeof(float))
else:
# Compute hidden-to-output
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
1.0, <const float *>A.hiddens, n.hiddens,
<const float *>W.hidden_weights, n.hiddens,
0.0, A.scores, n.classes)
# Add bias
for i in range(n.states):
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &A.scores[i*n.classes], 1)
# Set unseen classes to minimum value
i = 0
min_ = A.scores[0]
for i in range(1, n.states * n.classes):
if A.scores[i] < min_:
min_ = A.scores[i]
for i in range(n.states):
for j in range(n.classes):
if not W.seen_classes[j]:
A.scores[i*n.classes+j] = min_
cdef void sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, int B, int F, int O) nogil:
cdef int idx, b, f, i
cdef const float* feature
padding = cached
cached += F * O
cdef int id_stride = F*O
cdef float one = 1.
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
feature = &padding[f*O]
else:
idx = token_ids[f] * id_stride + f*O
feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F
cdef void cpu_log_loss(float* d_scores,
const float* costs, const int* is_valid, const float* scores,
int O) nogil:
"""Do multi-label log loss"""
cdef double max_, gmax, Z, gZ
best = arg_max_if_gold(scores, costs, is_valid, O)
guess = _arg_max(scores, O)
if best == -1 or guess == -1:
# These shouldn't happen, but if they do, we want to make sure we don't
# cause an OOB access.
return
Z = 1e-10
gZ = 1e-10
max_ = scores[guess]
gmax = scores[best]
for i in range(O):
Z += exp(scores[i] - max_)
if costs[i] <= costs[best]:
gZ += exp(scores[i] - gmax)
for i in range(O):
if costs[i] <= costs[best]:
d_scores[i] = (exp(scores[i]-max_) / Z) - (exp(scores[i]-gmax)/gZ)
else:
d_scores[i] = exp(scores[i]-max_) / Z
cdef int arg_max_if_gold(const weight_t* scores, const weight_t* costs,
const int* is_valid, int n) nogil:
# Find minimum cost
cdef float cost = 1
for i in range(n):
if is_valid[i] and costs[i] < cost:
cost = costs[i]
# Now find best-scoring with that cost
cdef int best = -1
for i in range(n):
if costs[i] <= cost and is_valid[i]:
if best == -1 or scores[i] > scores[best]:
best = i
return best
cdef int arg_max_if_valid(const weight_t* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best
class ParserStepModel(Model):
def __init__(self, docs, layers, *, has_upper, unseen_classes=None, train=True,
dropout=0.1):
Model.__init__(self, name="parser_step_model", forward=step_forward)
self.attrs["has_upper"] = has_upper
self.attrs["dropout_rate"] = dropout
self.tokvecs, self.bp_tokvecs = layers[0](docs, is_train=train)
if layers[1].get_dim("nP") >= 2:
activation = "maxout"
elif has_upper:
activation = None
else:
activation = "relu"
self.state2vec = precompute_hiddens(len(docs), self.tokvecs, layers[1],
activation=activation, train=train)
if has_upper:
self.vec2scores = layers[-1]
else:
self.vec2scores = None
self.cuda_stream = util.get_cuda_stream(non_blocking=True)
self.backprops = []
self._class_mask = numpy.zeros((self.nO,), dtype='f')
self._class_mask.fill(1)
if unseen_classes is not None:
for class_ in unseen_classes:
self._class_mask[class_] = 0.
def clear_memory(self):
del self.tokvecs
del self.bp_tokvecs
del self.state2vec
del self.backprops
del self._class_mask
@property
def nO(self):
if self.attrs["has_upper"]:
return self.vec2scores.get_dim("nO")
else:
return self.state2vec.get_dim("nO")
def class_is_unseen(self, class_):
return self._class_mask[class_]
def mark_class_unseen(self, class_):
self._class_mask[class_] = 0
def mark_class_seen(self, class_):
self._class_mask[class_] = 1
def get_token_ids(self, states):
cdef StateClass state
states = [state for state in states if not state.is_final()]
cdef np.ndarray ids = numpy.zeros((len(states), self.state2vec.nF),
dtype='i', order='C')
ids.fill(-1)
c_ids = <int*>ids.data
for state in states:
state.c.set_context_tokens(c_ids, ids.shape[1])
c_ids += ids.shape[1]
return ids
def backprop_step(self, token_ids, d_vector, get_d_tokvecs):
if isinstance(self.state2vec.ops, CupyOps) \
and not isinstance(token_ids, self.state2vec.ops.xp.ndarray):
# Move token_ids and d_vector to GPU, asynchronously
self.backprops.append((
util.get_async(self.cuda_stream, token_ids),
util.get_async(self.cuda_stream, d_vector),
get_d_tokvecs
))
else:
self.backprops.append((token_ids, d_vector, get_d_tokvecs))
def finish_steps(self, golds):
# Add a padding vector to the d_tokvecs gradient, so that missing
# values don't affect the real gradient.
d_tokvecs = self.ops.alloc((self.tokvecs.shape[0]+1, self.tokvecs.shape[1]))
# Tells CUDA to block, so our async copies complete.
if self.cuda_stream is not None:
self.cuda_stream.synchronize()
for ids, d_vector, bp_vector in self.backprops:
d_state_features = bp_vector((d_vector, ids))
ids = ids.flatten()
d_state_features = d_state_features.reshape(
(ids.size, d_state_features.shape[2]))
self.ops.scatter_add(d_tokvecs, ids,
d_state_features)
# Padded -- see update()
self.bp_tokvecs(d_tokvecs[:-1])
return d_tokvecs
NUMPY_OPS = NumpyOps()
def step_forward(model: ParserStepModel, states, is_train):
token_ids = model.get_token_ids(states)
vector, get_d_tokvecs = model.state2vec(token_ids, is_train)
mask = None
if model.attrs["has_upper"]:
dropout_rate = model.attrs["dropout_rate"]
if is_train and dropout_rate > 0:
mask = NUMPY_OPS.get_dropout_mask(vector.shape, dropout_rate)
vector *= mask
scores, get_d_vector = model.vec2scores(vector, is_train)
else:
scores = NumpyOps().asarray(vector)
get_d_vector = lambda d_scores: d_scores
# If the class is unseen, make sure its score is minimum
scores[:, model._class_mask == 0] = numpy.nanmin(scores)
def backprop_parser_step(d_scores):
# Zero vectors for unseen classes
d_scores *= model._class_mask
d_vector = get_d_vector(d_scores)
if mask is not None:
d_vector *= mask
model.backprop_step(token_ids, d_vector, get_d_tokvecs)
return None
return scores, backprop_parser_step
cdef class precompute_hiddens:
"""Allow a model to be "primed" by pre-computing input features in bulk.
This is used for the parser, where we want to take a batch of documents,
and compute vectors for each (token, position) pair. These vectors can then
be reused, especially for beam-search.
Let's say we're using 12 features for each state, e.g. word at start of
buffer, three words on stack, their children, etc. In the normal arc-eager
system, a document of length N is processed in 2*N states. This means we'll
create 2*N*12 feature vectors --- but if we pre-compute, we only need
N*12 vector computations. The saving for beam-search is much better:
if we have a beam of width k, we'll normally make 2*N*12*k computations --
so we save a factor of k. This also gives a nice CPU/GPU division:
we can do all our hard maths up front, packed into large multiplications,
and do the hard-to-program parsing on the CPU.
"""
cdef readonly int nF, nO, nP
cdef bint _is_synchronized
cdef public object ops
cdef public object numpy_ops
cdef public object _cpu_ops
cdef np.ndarray _features
cdef np.ndarray _cached
cdef np.ndarray bias
cdef object _cuda_stream
cdef object _bp_hiddens
cdef object activation
def __init__(self, batch_size, tokvecs, lower_model, cuda_stream=None,
activation="maxout", train=False):
gpu_cached, bp_features = lower_model(tokvecs, train)
cdef np.ndarray cached
if not isinstance(gpu_cached, numpy.ndarray):
# Note the passing of cuda_stream here: it lets
# cupy make the copy asynchronously.
# We then have to block before first use.
cached = gpu_cached.get(stream=cuda_stream)
else:
cached = gpu_cached
if not isinstance(lower_model.get_param("b"), numpy.ndarray):
self.bias = lower_model.get_param("b").get(stream=cuda_stream)
else:
self.bias = lower_model.get_param("b")
self.nF = cached.shape[1]
if lower_model.has_dim("nP"):
self.nP = lower_model.get_dim("nP")
else:
self.nP = 1
self.nO = cached.shape[2]
self.ops = lower_model.ops
self.numpy_ops = NumpyOps()
self._cpu_ops = get_ops("cpu") if isinstance(self.ops, CupyOps) else self.ops
assert activation in (None, "relu", "maxout")
self.activation = activation
self._is_synchronized = False
self._cuda_stream = cuda_stream
self._cached = cached
self._bp_hiddens = bp_features
cdef const float* get_feat_weights(self) except NULL:
if not self._is_synchronized and self._cuda_stream is not None:
self._cuda_stream.synchronize()
self._is_synchronized = True
return <float*>self._cached.data
def has_dim(self, name):
if name == "nF":
return self.nF if self.nF is not None else True
elif name == "nP":
return self.nP if self.nP is not None else True
elif name == "nO":
return self.nO if self.nO is not None else True
else:
return False
def get_dim(self, name):
if name == "nF":
return self.nF
elif name == "nP":
return self.nP
elif name == "nO":
return self.nO
else:
raise ValueError(Errors.E1033.format(name=name))
def set_dim(self, name, value):
if name == "nF":
self.nF = value
elif name == "nP":
self.nP = value
elif name == "nO":
self.nO = value
else:
raise ValueError(Errors.E1033.format(name=name))
def __call__(self, X, bint is_train):
if is_train:
return self.begin_update(X)
else:
return self.predict(X), lambda X: X
def predict(self, X):
return self.begin_update(X)[0]
def begin_update(self, token_ids):
cdef np.ndarray state_vector = numpy.zeros(
(token_ids.shape[0], self.nO, self.nP), dtype='f')
# This is tricky, but (assuming a GPU is available):
# - Input to forward on CPU
# - Output from forward on CPU
# - Input to backward on GPU!
# - Output from backward on GPU
bp_hiddens = self._bp_hiddens
cdef CBlas cblas = self._cpu_ops.cblas()
feat_weights = self.get_feat_weights()
cdef int[:, ::1] ids = token_ids
sum_state_features(cblas, <float*>state_vector.data,
feat_weights, &ids[0,0],
token_ids.shape[0], self.nF, self.nO*self.nP)
state_vector += self.bias
state_vector, bp_nonlinearity = self._nonlinearity(state_vector)
def backward(d_state_vector_ids):
d_state_vector, token_ids = d_state_vector_ids
d_state_vector = bp_nonlinearity(d_state_vector)
d_tokens = bp_hiddens((d_state_vector, token_ids))
return d_tokens
return state_vector, backward
def _nonlinearity(self, state_vector):
if self.activation == "maxout":
return self._maxout_nonlinearity(state_vector)
else:
return self._relu_nonlinearity(state_vector)
def _maxout_nonlinearity(self, state_vector):
state_vector, mask = self.numpy_ops.maxout(state_vector)
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_maxout(d_best):
return self.ops.backprop_maxout(d_best, mask, self.nP)
return state_vector, backprop_maxout
def _relu_nonlinearity(self, state_vector):
state_vector = state_vector.reshape((state_vector.shape[0], -1))
mask = state_vector >= 0.
state_vector *= mask
# We're outputting to CPU, but we need this variable on GPU for the
# backward pass.
mask = self.ops.asarray(mask)
def backprop_relu(d_best):
d_best *= mask
return d_best.reshape((d_best.shape + (1,)))
return state_vector, backprop_relu
cdef inline int _arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best

spacy/ml/tb_framework.pxd Normal file
View File

@ -0,0 +1,28 @@
from libc.stdint cimport int8_t
cdef struct SizesC:
int states
int classes
int hiddens
int pieces
int feats
int embed_width
int tokens
cdef struct WeightsC:
const float* feat_weights
const float* feat_bias
const float* hidden_bias
const float* hidden_weights
const int8_t* seen_mask
cdef struct ActivationsC:
int* token_ids
float* unmaxed
float* hiddens
int* is_valid
int _curr_size
int _max_size

View File

@ -1,50 +0,0 @@
from thinc.api import Model, noop
from .parser_model import ParserStepModel
from ..util import registry
@registry.layers("spacy.TransitionModel.v1")
def TransitionModel(
tok2vec, lower, upper, resize_output, dropout=0.2, unseen_classes=set()
):
"""Set up a stepwise transition-based model"""
if upper is None:
has_upper = False
upper = noop()
else:
has_upper = True
# don't define nO for this object, because we can't dynamically change it
return Model(
name="parser_model",
forward=forward,
dims={"nI": tok2vec.maybe_get_dim("nI")},
layers=[tok2vec, lower, upper],
refs={"tok2vec": tok2vec, "lower": lower, "upper": upper},
init=init,
attrs={
"has_upper": has_upper,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output,
},
)
def forward(model, X, is_train):
step_model = ParserStepModel(
X,
model.layers,
unseen_classes=model.attrs["unseen_classes"],
train=is_train,
has_upper=model.attrs["has_upper"],
)
return step_model, step_model.finish_steps
def init(model, X=None, Y=None):
model.get_ref("tok2vec").initialize(X=X)
lower = model.get_ref("lower")
lower.initialize()
if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
model.get_ref("upper").initialize(X=statevecs)

spacy/ml/tb_framework.pyx Normal file
View File

@ -0,0 +1,621 @@
# cython: infer_types=True, cdivision=True, boundscheck=False
from typing import List, Tuple, Any, Optional, TypeVar, cast
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free, realloc
from libcpp.vector cimport vector
import numpy
cimport numpy as np
from thinc.api import Model, normal_init, chain, list2array, Linear
from thinc.api import uniform_init, glorot_uniform_init, zero_init
from thinc.api import NumpyOps
from thinc.backends.cblas cimport CBlas, saxpy, sgemm
from thinc.types import Floats1d, Floats2d, Floats3d, Floats4d
from thinc.types import Ints1d, Ints2d
from ..errors import Errors
from ..pipeline._parser_internals import _beam_utils
from ..pipeline._parser_internals.batch import GreedyBatch
from ..pipeline._parser_internals._parser_utils cimport arg_max
from ..pipeline._parser_internals.transition_system cimport c_transition_batch, c_apply_actions
from ..pipeline._parser_internals.transition_system cimport TransitionSystem
from ..pipeline._parser_internals.stateclass cimport StateC, StateClass
from ..tokens.doc import Doc
from ..util import registry
State = Any # TODO
@registry.layers("spacy.TransitionModel.v2")
def TransitionModel(
*,
tok2vec: Model[List[Doc], List[Floats2d]],
beam_width: int = 1,
beam_density: float = 0.0,
state_tokens: int,
hidden_width: int,
maxout_pieces: int,
nO: Optional[int] = None,
unseen_classes=set(),
) -> Model[Tuple[List[Doc], TransitionSystem], List[Tuple[State, List[Floats2d]]]]:
"""Set up a transition-based parsing model, using a maxout hidden
layer and a linear output layer.
"""
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec_projected = chain(tok2vec, list2array(), Linear(hidden_width, t2v_width)) # type: ignore
tok2vec_projected.set_dim("nO", hidden_width)
# FIXME: we use `output` as a container for the output layer's
# weights and biases. Thinc optimizers cannot handle resizing
# of parameters. So, when the parser model is resized, we
# construct a new `output` layer, which has a different key in
# the optimizer. Once the optimizer supports parameter resizing,
# we can replace the `output` layer by `output_W` and `output_b`
# parameters in this model.
output = Linear(nO=None, nI=hidden_width, init_W=zero_init)
return Model(
name="parser_model",
forward=forward,
init=init,
layers=[tok2vec_projected, output],
refs={
"tok2vec": tok2vec_projected,
"output": output,
},
params={
"hidden_W": None, # Floats2d W for the hidden layer
"hidden_b": None, # Floats1d bias for the hidden layer
"hidden_pad": None, # Floats1d padding for the hidden layer
},
dims={
"nO": None, # Output size
"nP": maxout_pieces,
"nH": hidden_width,
"nI": tok2vec_projected.maybe_get_dim("nO"),
"nF": state_tokens,
},
attrs={
"beam_width": beam_width,
"beam_density": beam_density,
"unseen_classes": set(unseen_classes),
"resize_output": resize_output,
},
)
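# A minimal construction sketch (hypothetical sizes; any tok2vec of type
# Model[List[Doc], List[Floats2d]] would do):
#
#     model = TransitionModel(
#         tok2vec=tok2vec, state_tokens=6, hidden_width=64, maxout_pieces=2
#     )
#
# nO can be left unset and filled in later via resize_output, once the number of
# transition actions is known.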
def resize_output(model: Model, new_nO: int) -> Model:
old_nO = model.maybe_get_dim("nO")
output = model.get_ref("output")
if old_nO is None:
model.set_dim("nO", new_nO)
output.set_dim("nO", new_nO)
output.initialize()
return model
elif new_nO <= old_nO:
return model
elif output.has_param("W"):
nH = model.get_dim("nH")
new_output = Linear(nO=new_nO, nI=nH, init_W=zero_init)
new_output.initialize()
new_W = new_output.get_param("W")
new_b = new_output.get_param("b")
old_W = output.get_param("W")
old_b = output.get_param("b")
new_W[:old_nO] = old_W # type: ignore
new_b[:old_nO] = old_b # type: ignore
for i in range(old_nO, new_nO):
model.attrs["unseen_classes"].add(i)
model.layers[-1] = new_output
model.set_ref("output", new_output)
# TODO: Avoid this private intrusion
model._dims["nO"] = new_nO
return model
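# For example, when the transition system later gains a new action/label, calling
# resize_output(model, old_nO + 1) grows the output layer, copies the existing rows
# of W and b, and records the new class as "unseen" so its scores are masked until
# it receives a negative (i.e. probability-increasing) gradient.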
def init(
model,
X: Optional[Tuple[List[Doc], TransitionSystem]] = None,
Y: Optional[Tuple[List[State], List[Floats2d]]] = None,
):
if X is not None:
docs, moves = X
model.get_ref("tok2vec").initialize(X=docs)
else:
model.get_ref("tok2vec").initialize()
inferred_nO = _infer_nO(Y)
if inferred_nO is not None:
current_nO = model.maybe_get_dim("nO")
if current_nO is None or current_nO != inferred_nO:
model.attrs["resize_output"](model, inferred_nO)
nO = model.get_dim("nO")
nP = model.get_dim("nP")
nH = model.get_dim("nH")
nI = model.get_dim("nI")
nF = model.get_dim("nF")
ops = model.ops
Wl = ops.alloc2f(nH * nP, nF * nI)
bl = ops.alloc1f(nH * nP)
padl = ops.alloc1f(nI)
# Wl = zero_init(ops, Wl.shape)
Wl = glorot_uniform_init(ops, Wl.shape)
padl = uniform_init(ops, padl.shape) # type: ignore
# TODO: Experiment with whether better to initialize output_W
model.set_param("hidden_W", Wl)
model.set_param("hidden_b", bl)
model.set_param("hidden_pad", padl)
# model = _lsuv_init(model)
return model
class TransitionModelInputs:
"""
Input to transition model.
"""
# dataclass annotation is not yet supported in Cython 0.29.x,
# so we'll do something close to it.
actions: Optional[List[Ints1d]]
docs: List[Doc]
max_moves: int
moves: TransitionSystem
states: Optional[List[State]]
__slots__ = [
"actions",
"docs",
"max_moves",
"moves",
"states",
]
def __init__(
self,
docs: List[Doc],
moves: TransitionSystem,
actions: Optional[List[Ints1d]]=None,
max_moves: int=0,
states: Optional[List[State]]=None):
"""
actions (Optional[List[Ints1d]]): actions to apply for each Doc.
docs (List[Doc]): Docs to predict transition sequences for.
max_moves (int): the maximum number of moves to apply; values less
than 1 will apply moves to states until they are final states.
moves (TransitionSystem): the transition system to use when predicting
the transition sequences.
states (Optional[List[State]]): the initial states to predict the
transition sequences for. When absent, the initial states are
initialized from the provided Docs.
"""
self.actions = actions
self.docs = docs
self.moves = moves
self.max_moves = max_moves
self.states = states
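# Usage sketch (hypothetical variable names): at prediction time the parser would
# construct the inputs and call the model roughly as
#
#     inputs = TransitionModelInputs(docs=docs, moves=moves)
#     (states, scores), backprop = model(inputs, is_train=False)
#
# where `scores` is a list with one array of per-state class scores per transition step.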
def forward(model, inputs: TransitionModelInputs, is_train: bool):
docs = inputs.docs
moves = inputs.moves
actions = inputs.actions
beam_width = model.attrs["beam_width"]
hidden_pad = model.get_param("hidden_pad")
tok2vec = model.get_ref("tok2vec")
states = moves.init_batch(docs) if inputs.states is None else inputs.states
tokvecs, backprop_tok2vec = tok2vec(docs, is_train)
tokvecs = model.ops.xp.vstack((tokvecs, hidden_pad))
feats, backprop_feats = _forward_precomputable_affine(model, tokvecs, is_train)
seen_mask = _get_seen_mask(model)
if not is_train and beam_width == 1 and isinstance(model.ops, NumpyOps):
# Note: max_moves is only used during training, so we don't need to
# pass it to the greedy inference path.
return _forward_greedy_cpu(model, moves, states, feats, seen_mask, actions=actions)
else:
return _forward_fallback(model, moves, states, tokvecs, backprop_tok2vec,
feats, backprop_feats, seen_mask, is_train, actions=actions,
max_moves=inputs.max_moves)
def _forward_greedy_cpu(model: Model, TransitionSystem moves, states: List[StateClass], np.ndarray feats,
np.ndarray[np.npy_bool, ndim=1] seen_mask, actions: Optional[List[Ints1d]]=None):
cdef vector[StateC*] c_states
cdef StateClass state
for state in states:
if not state.is_final():
c_states.push_back(state.c)
weights = _get_c_weights(model, <float*>feats.data, seen_mask)
# Precomputed features have rows for each token, plus one for padding.
cdef int n_tokens = feats.shape[0] - 1
sizes = _get_c_sizes(model, c_states.size(), n_tokens)
cdef CBlas cblas = model.ops.cblas()
scores = _parse_batch(cblas, moves, &c_states[0], weights, sizes, actions=actions)
def backprop(dY):
raise ValueError(Errors.E4004)
return (states, scores), backprop
cdef list _parse_batch(CBlas cblas, TransitionSystem moves, StateC** states,
WeightsC weights, SizesC sizes, actions: Optional[List[Ints1d]]=None):
cdef int i, j
cdef vector[StateC *] unfinished
cdef ActivationsC activations = _alloc_activations(sizes)
cdef np.ndarray step_scores
cdef np.ndarray step_actions
scores = []
while sizes.states >= 1:
step_scores = numpy.empty((sizes.states, sizes.classes), dtype="f")
step_actions = actions[0] if actions is not None else None
with nogil:
_predict_states(cblas, &activations, <float*>step_scores.data, states, &weights, sizes)
if actions is None:
# Validate actions, argmax, take action.
c_transition_batch(moves, states, <const float*>step_scores.data, sizes.classes,
sizes.states)
else:
c_apply_actions(moves, states, <const int*>step_actions.data, sizes.states)
for i in range(sizes.states):
if not states[i].is_final():
unfinished.push_back(states[i])
for i in range(unfinished.size()):
states[i] = unfinished[i]
sizes.states = unfinished.size()
scores.append(step_scores)
unfinished.clear()
actions = actions[1:] if actions is not None else None
_free_activations(&activations)
return scores
def _forward_fallback(
model: Model,
moves: TransitionSystem,
states: List[StateClass],
tokvecs, backprop_tok2vec,
feats,
backprop_feats,
seen_mask,
is_train: bool,
actions: Optional[List[Ints1d]]=None,
max_moves: int=0):
nF = model.get_dim("nF")
output = model.get_ref("output")
hidden_b = model.get_param("hidden_b")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
beam_width = model.attrs["beam_width"]
beam_density = model.attrs["beam_density"]
ops = model.ops
all_ids = []
all_which = []
all_statevecs = []
all_scores = []
if beam_width == 1:
batch = GreedyBatch(moves, states, None)
else:
batch = _beam_utils.BeamBatch(
moves, states, None, width=beam_width, density=beam_density
)
arange = ops.xp.arange(nF)
n_moves = 0
while not batch.is_done:
ids = numpy.zeros((len(batch.get_unfinished_states()), nF), dtype="i")
for i, state in enumerate(batch.get_unfinished_states()):
state.set_context_tokens(ids, i, nF)
# Sum the state features, add the bias and apply the activation (maxout)
# to create the state vectors.
preacts2f = feats[ids, arange].sum(axis=1) # type: ignore
preacts2f += hidden_b
preacts = ops.reshape3f(preacts2f, preacts2f.shape[0], nH, nP)
assert preacts.shape[0] == len(batch.get_unfinished_states()), preacts.shape
statevecs, which = ops.maxout(preacts)
# We don't use output's backprop, since we want to backprop for
# all states at once, rather than a single state.
scores = output.predict(statevecs)
scores[:, seen_mask] = ops.xp.nanmin(scores)
# Transition the states, filtering out any that are finished.
cpu_scores = ops.to_numpy(scores)
if actions is None:
batch.advance(cpu_scores)
else:
batch.advance_with_actions(actions[0])
actions = actions[1:]
all_scores.append(scores)
if is_train:
# Remember intermediate results for the backprop.
all_ids.append(ids)
all_statevecs.append(statevecs)
all_which.append(which)
if n_moves >= max_moves >= 1:
break
n_moves += 1
def backprop_parser(d_states_d_scores):
ids = ops.xp.vstack(all_ids)
which = ops.xp.vstack(all_which)
statevecs = ops.xp.vstack(all_statevecs)
_, d_scores = d_states_d_scores
if model.attrs.get("unseen_classes"):
# If we have a negative gradient (i.e. the probability should
# increase) on any classes we filtered out as unseen, mark
# them as seen.
for clas in set(model.attrs["unseen_classes"]):
if (d_scores[:, clas] < 0).any():
model.attrs["unseen_classes"].remove(clas)
d_scores *= seen_mask == False
# Calculate the gradients for the parameters of the output layer.
# The weight gemm is (nS, nO) @ (nS, nH).T
output.inc_grad("b", d_scores.sum(axis=0))
output.inc_grad("W", ops.gemm(d_scores, statevecs, trans1=True))
# Now calculate d_statevecs, by backproping through the output linear layer.
# This gemm is (nS, nO) @ (nO, nH)
output_W = output.get_param("W")
d_statevecs = ops.gemm(d_scores, output_W)
# Backprop through the maxout activation
d_preacts = ops.backprop_maxout(d_statevecs, which, nP)
d_preacts2f = ops.reshape2f(d_preacts, d_preacts.shape[0], nH * nP)
model.inc_grad("hidden_b", d_preacts2f.sum(axis=0))
# We don't need to backprop the summation, because we pass back the IDs instead
d_state_features = backprop_feats((d_preacts2f, ids))
d_tokvecs = ops.alloc2f(tokvecs.shape[0], tokvecs.shape[1])
ops.scatter_add(d_tokvecs, ids, d_state_features)
model.inc_grad("hidden_pad", d_tokvecs[-1])
return (backprop_tok2vec(d_tokvecs[:-1]), None)
return (list(batch), all_scores), backprop_parser
def _get_seen_mask(model: Model) -> numpy.array[bool, 1]:
mask = model.ops.xp.zeros(model.get_dim("nO"), dtype="bool")
for class_ in model.attrs.get("unseen_classes", set()):
mask[class_] = True
return mask
def _forward_precomputable_affine(model, X: Floats2d, is_train: bool):
W: Floats2d = model.get_param("hidden_W")
nF = model.get_dim("nF")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
# The weights start out (nH * nP, nF * nI). Transpose and reshape to (nF * nH * nP, nI)
W3f = model.ops.reshape3f(W, nH * nP, nF, nI)
W3f = W3f.transpose((1, 0, 2))
W2f = model.ops.reshape2f(W3f, nF * nH * nP, nI)
assert X.shape == (X.shape[0], nI), X.shape
Yf_ = model.ops.gemm(X, W2f, trans2=True)
Yf = model.ops.reshape3f(Yf_, Yf_.shape[0], nF, nH * nP)
def backward(dY_ids: Tuple[Floats3d, Ints2d]):
# This backprop is particularly tricky, because we get back a different
# thing from what we put out. We put out an array of shape:
# (nB, nF, nH, nP), and get back:
# (nB, nH, nP) and ids (nB, nF)
# The ids tell us the values of nF, so we would have:
#
# dYf = zeros((nB, nF, nH, nP))
# for b in range(nB):
# for f in range(nF):
# dYf[b, ids[b, f]] += dY[b]
#
# However, we avoid building that array for efficiency -- and just pass
# in the indices.
dY, ids = dY_ids
dXf = model.ops.gemm(dY, W)
Xf = X[ids].reshape((ids.shape[0], -1))
dW = model.ops.gemm(dY, Xf, trans1=True)
model.inc_grad("hidden_W", dW)
return model.ops.reshape3f(dXf, dXf.shape[0], nF, nI)
return Yf, backward
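# Shape sketch for the affine above (hypothetical sizes): with nF=6, nH=64, nP=2 and
# nI=64, the weights W of shape (128, 384) are reshaped/transposed to (768, 64), so for
# 100 input rows the gemm produces Yf of shape (100, 6, 128) -- one precomputed
# (hidden * pieces) block per (row, feature-slot) pair, summed later per state.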
def _infer_nO(Y: Optional[Tuple[List[State], List[Floats2d]]]) -> Optional[int]:
if Y is None:
return None
_, scores = Y
if len(scores) == 0:
return None
assert scores[0].shape[0] >= 1
assert len(scores[0].shape) == 2
return scores[0].shape[1]
def _lsuv_init(model: Model):
"""This is like the 'layer sequential unit variance', but instead
of taking the actual inputs, we randomly generate whitened data.
Why's this all so complicated? We have a huge number of inputs,
and the maxout unit makes guessing the dynamics tricky. Instead
we set the maxout weights to values that empirically result in
whitened outputs given whitened inputs.
"""
W = model.maybe_get_param("hidden_W")
if W is not None and W.any():
return
nF = model.get_dim("nF")
nH = model.get_dim("nH")
nP = model.get_dim("nP")
nI = model.get_dim("nI")
W = model.ops.alloc4f(nF, nH, nP, nI)
b = model.ops.alloc2f(nH, nP)
pad = model.ops.alloc4f(1, nF, nH, nP)
ops = model.ops
W = normal_init(ops, W.shape, mean=float(ops.xp.sqrt(1.0 / nF * nI)))
pad = normal_init(ops, pad.shape, mean=1.0)
model.set_param("W", W)
model.set_param("b", b)
model.set_param("pad", pad)
ids = ops.alloc_f((5000, nF), dtype="f")
ids += ops.xp.random.uniform(0, 1000, ids.shape)
ids = ops.asarray(ids, dtype="i")
tokvecs = ops.alloc_f((5000, nI), dtype="f")
tokvecs += ops.xp.random.normal(loc=0.0, scale=1.0, size=tokvecs.size).reshape(
tokvecs.shape
)
def predict(ids, tokvecs):
# nS ids. nW tokvecs. Exclude the padding array.
hiddens, _ = _forward_precomputable_affine(model, tokvecs[:-1], False)
vectors = model.ops.alloc2f(ids.shape[0], nH * nP)
# need nS vectors
hiddens = hiddens.reshape((hiddens.shape[0] * nF, nH * nP))
model.ops.scatter_add(vectors, ids.flatten(), hiddens)
vectors3f = model.ops.reshape3f(vectors, vectors.shape[0], nH, nP)
vectors3f += b
return model.ops.maxout(vectors3f)[0]
tol_var = 0.01
tol_mean = 0.01
t_max = 10
W = cast(Floats4d, model.get_param("hidden_W").copy())
b = cast(Floats2d, model.get_param("hidden_b").copy())
for t_i in range(t_max):
acts1 = predict(ids, tokvecs)
var = model.ops.xp.var(acts1)
mean = model.ops.xp.mean(acts1)
if abs(var - 1.0) >= tol_var:
W /= model.ops.xp.sqrt(var)
model.set_param("hidden_W", W)
elif abs(mean) >= tol_mean:
b -= mean
model.set_param("hidden_b", b)
else:
break
return model
cdef WeightsC _get_c_weights(model, const float* feats, np.ndarray[np.npy_bool, ndim=1] seen_mask) except *:
output = model.get_ref("output")
cdef np.ndarray hidden_b = model.get_param("hidden_b")
cdef np.ndarray output_W = output.get_param("W")
cdef np.ndarray output_b = output.get_param("b")
cdef WeightsC weights
weights.feat_weights = feats
weights.feat_bias = <const float*>hidden_b.data
weights.hidden_weights = <const float *> output_W.data
weights.hidden_bias = <const float *> output_b.data
weights.seen_mask = <const int8_t*> seen_mask.data
return weights
cdef SizesC _get_c_sizes(model, int batch_size, int tokens) except *:
cdef SizesC sizes
sizes.states = batch_size
sizes.classes = model.get_dim("nO")
sizes.hiddens = model.get_dim("nH")
sizes.pieces = model.get_dim("nP")
sizes.feats = model.get_dim("nF")
sizes.embed_width = model.get_dim("nI")
sizes.tokens = tokens
return sizes
cdef ActivationsC _alloc_activations(SizesC n) nogil:
cdef ActivationsC A
memset(&A, 0, sizeof(A))
_resize_activations(&A, n)
return A
cdef void _free_activations(const ActivationsC* A) nogil:
free(A.token_ids)
free(A.unmaxed)
free(A.hiddens)
free(A.is_valid)
cdef void _resize_activations(ActivationsC* A, SizesC n) nogil:
if n.states <= A._max_size:
A._curr_size = n.states
return
if A._max_size == 0:
A.token_ids = <int*>calloc(n.states * n.feats, sizeof(A.token_ids[0]))
A.unmaxed = <float*>calloc(n.states * n.hiddens * n.pieces, sizeof(A.unmaxed[0]))
A.hiddens = <float*>calloc(n.states * n.hiddens, sizeof(A.hiddens[0]))
A.is_valid = <int*>calloc(n.states * n.classes, sizeof(A.is_valid[0]))
A._max_size = n.states
else:
A.token_ids = <int*>realloc(A.token_ids,
n.states * n.feats * sizeof(A.token_ids[0]))
A.unmaxed = <float*>realloc(A.unmaxed,
n.states * n.hiddens * n.pieces * sizeof(A.unmaxed[0]))
A.hiddens = <float*>realloc(A.hiddens,
n.states * n.hiddens * sizeof(A.hiddens[0]))
A.is_valid = <int*>realloc(A.is_valid,
n.states * n.classes * sizeof(A.is_valid[0]))
A._max_size = n.states
A._curr_size = n.states
cdef void _predict_states(CBlas cblas, ActivationsC* A, float* scores, StateC** states, const WeightsC* W, SizesC n) nogil:
_resize_activations(A, n)
for i in range(n.states):
states[i].set_context_tokens(&A.token_ids[i*n.feats], n.feats)
memset(A.unmaxed, 0, n.states * n.hiddens * n.pieces * sizeof(float))
_sum_state_features(cblas, A.unmaxed, W.feat_weights, A.token_ids, n)
for i in range(n.states):
saxpy(cblas)(n.hiddens * n.pieces, 1., W.feat_bias, 1, &A.unmaxed[i*n.hiddens*n.pieces], 1)
for j in range(n.hiddens):
index = i * n.hiddens * n.pieces + j * n.pieces
which = arg_max(&A.unmaxed[index], n.pieces)
A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
if W.hidden_weights == NULL:
memcpy(scores, A.hiddens, n.states * n.classes * sizeof(float))
else:
# Compute hidden-to-output
sgemm(cblas)(False, True, n.states, n.classes, n.hiddens,
1.0, <const float *>A.hiddens, n.hiddens,
<const float *>W.hidden_weights, n.hiddens,
0.0, scores, n.classes)
# Add bias
for i in range(n.states):
saxpy(cblas)(n.classes, 1., W.hidden_bias, 1, &scores[i*n.classes], 1)
# Set unseen classes to minimum value
i = 0
min_ = scores[0]
for i in range(1, n.states * n.classes):
if scores[i] < min_:
min_ = scores[i]
for i in range(n.states):
for j in range(n.classes):
if W.seen_mask[j]:
scores[i*n.classes+j] = min_
cdef void _sum_state_features(CBlas cblas, float* output,
const float* cached, const int* token_ids, SizesC n) nogil:
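# For each of the n.states states, gather the n.feats pre-computed feature rows
# selected by token_ids and sum them into the output buffer; a negative token id
# selects the corresponding row of the trailing padding block instead.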
cdef int idx, b, f, i
cdef const float* feature
cdef int B = n.states
cdef int O = n.hiddens * n.pieces
cdef int F = n.feats
cdef int T = n.tokens
padding = cached + (T * F * O)
cdef int id_stride = F*O
cdef float one = 1.
for b in range(B):
for f in range(F):
if token_ids[f] < 0:
feature = &padding[f*O]
else:
idx = token_ids[f] * id_stride + f*O
feature = &cached[idx]
saxpy(cblas)(O, one, <const float*>feature, 1, &output[b*O], 1)
token_ids += F

View File

@ -7,6 +7,7 @@ from cpython.ref cimport PyObject, Py_XDECREF
from ...typedefs cimport hash_t, class_t
from .transition_system cimport TransitionSystem, Transition
from ...errors import Errors
from .batch cimport Batch
from .search cimport Beam, MaxViolation
from .search import MaxViolation
from .stateclass cimport StateC, StateClass
@ -26,7 +27,7 @@ cdef int check_final_state(void* _state, void* extra_args) except -1:
return state.is_final()
cdef class BeamBatch(object):
cdef class BeamBatch(Batch):
cdef public TransitionSystem moves
cdef public object states
cdef public object docs

View File

@ -0,0 +1,2 @@
cdef int arg_max(const float* scores, const int n_classes) nogil
cdef int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil

View File

@ -0,0 +1,22 @@
# cython: infer_types=True
cdef inline int arg_max(const float* scores, const int n_classes) nogil:
if n_classes == 2:
return 0 if scores[0] > scores[1] else 1
cdef int i
cdef int best = 0
cdef float mode = scores[0]
for i in range(1, n_classes):
if scores[i] > mode:
mode = scores[i]
best = i
return best
cdef inline int arg_max_if_valid(const float* scores, const int* is_valid, int n) nogil:
cdef int best = -1
for i in range(n):
if is_valid[i] >= 1:
if best == -1 or scores[i] > scores[best]:
best = i
return best

View File

@ -6,7 +6,6 @@ cimport libcpp
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.set cimport set
from cpython.exc cimport PyErr_CheckSignals, PyErr_SetFromErrno
from murmurhash.mrmr cimport hash64
from ...vocab cimport EMPTY_LEXEME
@ -26,7 +25,7 @@ cdef struct ArcC:
cdef cppclass StateC:
int* _heads
vector[int] _heads
const TokenC* _sent
vector[int] _stack
vector[int] _rebuffer
@ -34,31 +33,34 @@ cdef cppclass StateC:
unordered_map[int, vector[ArcC]] _left_arcs
unordered_map[int, vector[ArcC]] _right_arcs
vector[libcpp.bool] _unshiftable
vector[int] history
set[int] _sent_starts
TokenC _empty_token
int length
int offset
int _b_i
__init__(const TokenC* sent, int length) nogil:
__init__(const TokenC* sent, int length) nogil except +:
this._heads.resize(length, -1)
this._unshiftable.resize(length, False)
# Reserve memory ahead of time to minimize allocations during parsing.
# The initial capacity set here should reflect the expected average-case usage.
cdef int init_capacity = 32
this._stack.reserve(init_capacity)
this._rebuffer.reserve(init_capacity)
this._ents.reserve(init_capacity)
this._left_arcs.reserve(init_capacity)
this._right_arcs.reserve(init_capacity)
this.history.reserve(init_capacity)
this._sent = sent
this._heads = <int*>calloc(length, sizeof(int))
if not (this._sent and this._heads):
with gil:
PyErr_SetFromErrno(MemoryError)
PyErr_CheckSignals()
this.offset = 0
this.length = length
this._b_i = 0
for i in range(length):
this._heads[i] = -1
this._unshiftable.push_back(0)
memset(&this._empty_token, 0, sizeof(TokenC))
this._empty_token.lex = &EMPTY_LEXEME
__dealloc__():
free(this._heads)
void set_context_tokens(int* ids, int n) nogil:
cdef int i, j
if n == 1:
@ -131,19 +133,20 @@ cdef cppclass StateC:
ids[i] = -1
int S(int i) nogil const:
if i >= this._stack.size():
cdef int stack_size = this._stack.size()
if i >= stack_size or i < 0:
return -1
elif i < 0:
return -1
return this._stack.at(this._stack.size() - (i+1))
else:
return this._stack[stack_size - (i+1)]
int B(int i) nogil const:
cdef int buf_size = this._rebuffer.size()
if i < 0:
return -1
elif i < this._rebuffer.size():
return this._rebuffer.at(this._rebuffer.size() - (i+1))
elif i < buf_size:
return this._rebuffer[buf_size - (i+1)]
else:
b_i = this._b_i + (i - this._rebuffer.size())
b_i = this._b_i + (i - buf_size)
if b_i >= this.length:
return -1
else:
@ -242,7 +245,7 @@ cdef cppclass StateC:
return 0
elif this._sent[word].sent_start == 1:
return 1
elif this._sent_starts.count(word) >= 1:
elif this._sent_starts.const_find(word) != this._sent_starts.const_end():
return 1
else:
return 0
@ -327,7 +330,7 @@ cdef cppclass StateC:
if item >= this._unshiftable.size():
return 0
else:
return this._unshiftable.at(item)
return this._unshiftable[item]
void set_reshiftable(int item) nogil:
if item < this._unshiftable.size():
@ -347,6 +350,9 @@ cdef cppclass StateC:
this._heads[child] = head
void map_del_arc(unordered_map[int, vector[ArcC]]* heads_arcs, int h_i, int c_i) nogil:
cdef vector[ArcC]* arcs
cdef ArcC* arc
arcs_it = heads_arcs.find(h_i)
if arcs_it == heads_arcs.end():
return
@ -355,12 +361,12 @@ cdef cppclass StateC:
if arcs.size() == 0:
return
arc = arcs.back()
arc = &arcs.back()
if arc.head == h_i and arc.child == c_i:
arcs.pop_back()
else:
for i in range(arcs.size()-1):
arc = arcs.at(i)
arc = &deref(arcs)[i]
if arc.head == h_i and arc.child == c_i:
arc.head = -1
arc.child = -1
@ -400,10 +406,11 @@ cdef cppclass StateC:
this._rebuffer = src._rebuffer
this._sent_starts = src._sent_starts
this._unshiftable = src._unshiftable
memcpy(this._heads, src._heads, this.length * sizeof(this._heads[0]))
this._heads = src._heads
this._ents = src._ents
this._left_arcs = src._left_arcs
this._right_arcs = src._right_arcs
this._b_i = src._b_i
this.offset = src.offset
this._empty_token = src._empty_token
this.history = src.history

View File

@ -773,6 +773,8 @@ cdef class ArcEager(TransitionSystem):
return list(arcs)
def has_gold(self, Example eg, start=0, end=None):
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.dep != 0:
return True
@ -858,6 +860,7 @@ cdef class ArcEager(TransitionSystem):
state.print_state()
)))
action.do(state.c, action.label)
state.c.history.push_back(i)
break
else:
failed = False

View File

@ -0,0 +1,2 @@
cdef class Batch:
pass

View File

@ -0,0 +1,52 @@
from typing import Any
TransitionSystem = Any # TODO
cdef class Batch:
def advance(self, scores):
raise NotImplementedError
def get_states(self):
raise NotImplementedError
@property
def is_done(self):
raise NotImplementedError
def get_unfinished_states(self):
raise NotImplementedError
def __getitem__(self, i):
raise NotImplementedError
def __len__(self):
raise NotImplementedError
class GreedyBatch(Batch):
def __init__(self, moves: TransitionSystem, states, golds):
self._moves = moves
self._states = states
self._next_states = [s for s in states if not s.is_final()]
def advance(self, scores):
self._next_states = self._moves.transition_states(self._next_states, scores)
def advance_with_actions(self, actions):
self._next_states = self._moves.apply_actions(self._next_states, actions)
def get_states(self):
return self._states
@property
def is_done(self):
return all(s.is_final() for s in self._states)
def get_unfinished_states(self):
return [st for st in self._states if not st.is_final()]
def __getitem__(self, i):
return self._states[i]
def __len__(self):
return len(self._states)

View File

@ -156,7 +156,7 @@ cdef class BiluoPushDown(TransitionSystem):
if token.ent_type:
labels.add(token.ent_type_)
return labels
def move_name(self, int move, attr_t label):
if move == OUT:
return 'O'
@ -306,6 +306,8 @@ cdef class BiluoPushDown(TransitionSystem):
for span in eg.y.spans.get(neg_key, []):
if span.start >= start and span.end <= end:
return True
if end is not None and end < 0:
end = None
for word in eg.y[start:end]:
if word.ent_iob != 0:
return True
@ -646,7 +648,7 @@ cdef class Unit:
cost += 1
break
return cost
cdef class Out:

View File

@ -20,6 +20,10 @@ cdef class StateClass:
if self._borrowed != 1:
del self.c
@property
def history(self):
return list(self.c.history)
@property
def stack(self):
return [self.S(i) for i in range(self.c.stack_depth())]
@ -176,3 +180,6 @@ cdef class StateClass:
def clone(self, StateClass src):
self.c.clone(src.c)
def set_context_tokens(self, int[:, :] output, int row, int n_feats):
self.c.set_context_tokens(&output[row, 0], n_feats)

View File

@ -53,3 +53,10 @@ cdef class TransitionSystem:
cdef int set_costs(self, int* is_valid, weight_t* costs,
const StateC* state, gold) except -1
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

View File

@ -1,6 +1,8 @@
# cython: infer_types=True
from __future__ import print_function
from cymem.cymem cimport Pool
from libc.stdlib cimport calloc, free
from libcpp.vector cimport vector
from collections import Counter
import srsly
@ -10,6 +12,7 @@ from ...typedefs cimport weight_t, attr_t
from ...tokens.doc cimport Doc
from ...structs cimport TokenC
from .stateclass cimport StateClass
from ._parser_utils cimport arg_max_if_valid
from ...errors import Errors
from ... import util
@ -73,7 +76,18 @@ cdef class TransitionSystem:
offset += len(doc)
return states
def follow_history(self, doc, history):
cdef int clas
cdef StateClass state = StateClass(doc)
for clas in history:
action = self.c[clas]
action.do(state.c, action.label)
state.c.history.push_back(clas)
return state
def get_oracle_sequence(self, Example example, _debug=False):
if not self.has_gold(example):
return []
states, golds, _ = self.init_gold_batch([example])
if not states:
return []
@ -85,6 +99,8 @@ cdef class TransitionSystem:
return self.get_oracle_sequence_from_state(state, gold)
def get_oracle_sequence_from_state(self, StateClass state, gold, _debug=None):
if state.is_final():
return []
cdef Pool mem = Pool()
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.n_moves > 0
@ -110,6 +126,7 @@ cdef class TransitionSystem:
"S0 head?", str(state.has_head(state.S(0))),
)))
action.do(state.c, action.label)
state.c.history.push_back(i)
break
else:
if _debug:
@ -137,6 +154,28 @@ cdef class TransitionSystem:
raise ValueError(Errors.E170.format(name=name))
action = self.lookup_transition(name)
action.do(state.c, action.label)
state.c.history.push_back(action.clas)
def apply_actions(self, states, const int[::1] actions):
assert len(states) == actions.shape[0]
cdef StateClass state
cdef vector[StateC*] c_states
c_states.resize(len(states))
cdef int i
for (i, state) in enumerate(states):
c_states[i] = state.c
c_apply_actions(self, &c_states[0], &actions[0], actions.shape[0])
return [state for state in states if not state.c.is_final()]
def transition_states(self, states, float[:, ::1] scores):
assert len(states) == scores.shape[0]
cdef StateClass state
cdef float* c_scores = &scores[0, 0]
cdef vector[StateC*] c_states
for state in states:
c_states.push_back(state.c)
c_transition_batch(self, &c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()]
cdef Transition lookup_transition(self, object name) except *:
raise NotImplementedError
@ -250,3 +289,35 @@ cdef class TransitionSystem:
self.cfg.update(msg['cfg'])
self.initialize_actions(labels)
return self
cdef void c_apply_actions(TransitionSystem moves, StateC** states, const int* actions,
int batch_size) nogil:
cdef int i
cdef Transition action
cdef StateC* state
for i in range(batch_size):
state = states[i]
action = moves.c[actions[i]]
action.do(state, action.label)
state.history.push_back(action.clas)
cdef void c_transition_batch(TransitionSystem moves, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
is_valid = <int*>calloc(moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
for i in range(batch_size):
moves.set_valid(is_valid, states[i])
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
if guess == -1:
# This shouldn't happen, but it's hard to raise an error here,
# and we don't want to infinite loop. So, force to end state.
states[i].force_final()
else:
action = moves.c[guess]
action.do(states[i], action.label)
states[i].history.push_back(guess)
free(is_valid)

View File

@ -4,8 +4,8 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.arc_eager cimport ArcEager
from .transition_parser import Parser
from ._parser_internals.arc_eager import ArcEager
from .functions import merge_subtokens
from ..language import Language
@ -18,12 +18,11 @@ from ..util import registry
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
@ -123,6 +122,7 @@ def make_parser(
scorer=scorer,
)
@Language.factory(
"beam_parser",
assigns=["token.dep", "token.head", "token.is_sent_start", "doc.sents"],
@ -228,6 +228,7 @@ def parser_score(examples, **kwargs):
DOCS: https://spacy.io/api/dependencyparser#score
"""
def has_sents(doc):
return doc.has_annotation("SENT_START")
@ -235,8 +236,11 @@ def parser_score(examples, **kwargs):
dep = getattr(token, attr)
dep = token.vocab.strings.as_string(dep).lower()
return dep
results = {}
results.update(Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs))
results.update(
Scorer.score_spans(examples, "sents", has_annotation=has_sents, **kwargs)
)
kwargs.setdefault("getter", dep_getter)
kwargs.setdefault("ignore_labels", ("p", "punct"))
results.update(Scorer.score_deps(examples, "dep", **kwargs))
@ -249,11 +253,12 @@ def make_parser_scorer():
return parser_score
cdef class DependencyParser(Parser):
class DependencyParser(Parser):
"""Pipeline component for dependency parsing.
DOCS: https://spacy.io/api/dependencyparser
"""
TransitionSystem = ArcEager
def __init__(
@ -273,8 +278,7 @@ cdef class DependencyParser(Parser):
incorrect_spans_key=None,
scorer=parser_score,
):
"""Create a DependencyParser.
"""
"""Create a DependencyParser."""
super().__init__(
vocab,
model,

View File

@ -5,7 +5,7 @@ from itertools import islice
import numpy as np
import srsly
from thinc.api import Config, Model
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
from thinc.types import ArrayXd, Floats2d, Ints1d
from thinc.legacy import LegacySequenceCategoricalCrossentropy
@ -22,6 +22,8 @@ from .. import util
ActivationsT = Dict[str, Union[List[Floats2d], List[Ints1d]]]
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
TOP_K_GUARDRAIL = 20
default_model_config = """
@ -125,6 +127,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
self.save_activations = save_activations
self.numpy_ops = NumpyOps()
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@ -140,7 +143,7 @@ class EditTreeLemmatizer(TrainablePipe):
for (predicted, gold_lemma) in zip(
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
):
if gold_lemma is None:
if gold_lemma is None or gold_lemma == "":
label = -1
else:
tree_id = self.trees.add(predicted.text, gold_lemma)
@ -155,7 +158,38 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores (List[Floats2d]): Scores representing the teacher model's predictions.
student_scores (List[Floats2d]): Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/edittreelemmatizer#get_teacher_student_loss
"""
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> ActivationsT:
if self.top_k == 1:
scores2guesses = self._scores2guesses_top_k_equals_1
elif self.top_k <= TOP_K_GUARDRAIL:
scores2guesses = self._scores2guesses_top_k_greater_1
else:
scores2guesses = self._scores2guesses_top_k_guardrail
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
# for its principal purpose of lemmatizing tokens. However, the code could also
# be used for other purposes, and with very large values of *top_k* the method
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
# instead.
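# For example, with top_k=1 the plain argmax path is taken; top_k=3 uses the
# incremental retry path; a value like top_k=50 (above TOP_K_GUARDRAIL=20) falls
# back to the full-argsort guardrail path.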
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
@ -170,20 +204,52 @@ class EditTreeLemmatizer(TrainablePipe):
return {"probabilities": scores, "tree_ids": guesses}
scores = self.model.predict(docs)
assert len(scores) == n_docs
guesses = self._scores2guesses(docs, scores)
guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return {"probabilities": scores, "tree_ids": guesses}
def _scores2guesses(self, docs, scores):
def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
if self.top_k == 1:
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
else:
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = doc_scores.argmax(axis=1)
doc_guesses = self.numpy_ops.asarray(doc_guesses)
if not isinstance(doc_guesses, np.ndarray):
doc_guesses = doc_guesses.get()
doc_compat_guesses = []
for i, token in enumerate(doc):
tree_id = self.cfg["labels"][doc_guesses[i]]
if self.trees.apply(tree_id, token.text) is not None:
doc_compat_guesses.append(tree_id)
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))
return guesses
def _scores2guesses_top_k_greater_1(self, docs, scores):
guesses = []
top_k = min(self.top_k, len(self.labels))
for doc, doc_scores in zip(docs, scores):
doc_scores = self.numpy_ops.asarray(doc_scores)
doc_compat_guesses = []
for i, token in enumerate(doc):
for _ in range(top_k):
candidate = int(doc_scores[i].argmax())
candidate_tree_id = self.cfg["labels"][candidate]
if self.trees.apply(candidate_tree_id, token.text) is not None:
doc_compat_guesses.append(candidate_tree_id)
break
doc_scores[i, candidate] = np.finfo(np.float32).min
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))
return guesses
def _scores2guesses_top_k_guardrail(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = self.numpy_ops.asarray(doc_guesses)
doc_compat_guesses = []
for token, candidates in zip(doc, doc_guesses):

View File

@ -13,7 +13,6 @@ from ..kb import KnowledgeBase, Candidate
from ..ml import empty_kb
from ..tokens import Doc, Span
from .pipe import deserialize_config
from .legacy.entity_linker import EntityLinker_v1
from .trainable_pipe import TrainablePipe
from ..language import Language
from ..vocab import Vocab
@ -120,6 +119,12 @@ def make_entity_linker(
"""
if not model.attrs.get("include_span_maker", False):
try:
from spacy_legacy.components.entity_linker import EntityLinker_v1
except ImportError:
raise ImportError(
"In order to use v1 of the EntityLinker, you must use spacy-legacy>=3.0.12."
)
# The only difference in arguments here is that use_gold_ents and threshold aren't available.
return EntityLinker_v1(
nlp.vocab,
@ -453,7 +458,11 @@ class EntityLinker(TrainablePipe):
docs_ents: List[Ragged] = []
docs_scores: List[Ragged] = []
if not docs:
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
return {
KNOWLEDGE_BASE_IDS: final_kb_ids,
"ents": docs_ents,
"scores": docs_scores,
}
if isinstance(docs, Doc):
docs = [docs]
for doc in docs:
@ -585,7 +594,11 @@ class EntityLinker(TrainablePipe):
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
return {KNOWLEDGE_BASE_IDS: final_kb_ids, "ents": docs_ents, "scores": docs_scores}
return {
KNOWLEDGE_BASE_IDS: final_kb_ids,
"ents": docs_ents,
"scores": docs_scores,
}
def set_annotations(self, docs: Iterable[Doc], activations: ActivationsT) -> None:
"""Modify a batch of documents, using pre-computed scores.

View File

@ -1,3 +0,0 @@
from .entity_linker import EntityLinker_v1
__all__ = ["EntityLinker_v1"]

View File

@ -1,422 +0,0 @@
# This file is present to provide a prior version of the EntityLinker component
# for backwards compatibility. For details see #9669.
from typing import Optional, Iterable, Callable, Dict, Union, List, Any
from thinc.types import Floats2d
from pathlib import Path
from itertools import islice
import srsly
import random
from thinc.api import CosineDistance, Model, Optimizer
from thinc.api import set_dropout_rate
import warnings
from ...kb import KnowledgeBase, Candidate
from ...ml import empty_kb
from ...tokens import Doc, Span
from ..pipe import deserialize_config
from ..trainable_pipe import TrainablePipe
from ...language import Language
from ...vocab import Vocab
from ...training import Example, validate_examples, validate_get_examples
from ...errors import Errors, Warnings
from ...util import SimpleFrozenList
from ... import util
from ...scorer import Scorer
# See #9050
BACKWARD_OVERWRITE = True
def entity_linker_score(examples, **kwargs):
return Scorer.score_links(examples, negative_labels=[EntityLinker_v1.NIL], **kwargs)
class EntityLinker_v1(TrainablePipe):
"""Pipeline component for named entity linking.
DOCS: https://spacy.io/api/entitylinker
"""
NIL = "NIL" # string used to refer to a non-existing link
def __init__(
self,
vocab: Vocab,
model: Model,
name: str = "entity_linker",
*,
labels_discard: Iterable[str],
n_sents: int,
incl_prior: bool,
incl_context: bool,
entity_vector_length: int,
get_candidates: Callable[[KnowledgeBase, Span], Iterable[Candidate]],
overwrite: bool = BACKWARD_OVERWRITE,
scorer: Optional[Callable] = entity_linker_score,
) -> None:
"""Initialize an entity linker.
vocab (Vocab): The shared vocabulary.
model (thinc.api.Model): The Thinc Model powering the pipeline component.
name (str): The component instance name, used to add entries to the
losses during training.
labels_discard (Iterable[str]): NER labels that will automatically get a "NIL" prediction.
n_sents (int): The number of neighbouring sentences to take into account.
incl_prior (bool): Whether or not to include prior probabilities from the KB in the model.
incl_context (bool): Whether or not to include the local context in the model.
entity_vector_length (int): Size of encoding vectors in the KB.
get_candidates (Callable[[KnowledgeBase, Span], Iterable[Candidate]]): Function that
produces a list of candidates, given a certain knowledge base and a textual mention.
scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links.
DOCS: https://spacy.io/api/entitylinker#init
"""
self.vocab = vocab
self.model = model
self.name = name
self.labels_discard = list(labels_discard)
self.n_sents = n_sents
self.incl_prior = incl_prior
self.incl_context = incl_context
self.get_candidates = get_candidates
self.cfg: Dict[str, Any] = {"overwrite": overwrite}
self.distance = CosineDistance(normalize=False)
# how many neighbour sentences to take into account
# create an empty KB by default. If you want to load a predefined one, specify it in 'initialize'.
self.kb = empty_kb(entity_vector_length)(self.vocab)
self.scorer = scorer
def set_kb(self, kb_loader: Callable[[Vocab], KnowledgeBase]):
"""Define the KB of this pipe by providing a function that will
create it using this object's vocab."""
if not callable(kb_loader):
raise ValueError(Errors.E885.format(arg_type=type(kb_loader)))
self.kb = kb_loader(self.vocab)
def validate_kb(self) -> None:
# Raise an error if the knowledge base is not initialized.
if self.kb is None:
raise ValueError(Errors.E1018.format(name=self.name))
if len(self.kb) == 0:
raise ValueError(Errors.E139.format(name=self.name))
def initialize(
self,
get_examples: Callable[[], Iterable[Example]],
*,
nlp: Optional[Language] = None,
kb_loader: Optional[Callable[[Vocab], KnowledgeBase]] = None,
):
"""Initialize the pipe for training, using a representative set
of data examples.
get_examples (Callable[[], Iterable[Example]]): Function that
returns a representative sample of gold-standard Example objects.
nlp (Language): The current nlp object the component is part of.
kb_loader (Callable[[Vocab], KnowledgeBase]): A function that creates an InMemoryLookupKB from a Vocab instance.
Note that providing this argument will overwrite all data accumulated in the current KB.
Use this only when loading a KB as such from file.
DOCS: https://spacy.io/api/entitylinker#initialize
"""
validate_get_examples(get_examples, "EntityLinker_v1.initialize")
if kb_loader is not None:
self.set_kb(kb_loader)
self.validate_kb()
nO = self.kb.entity_vector_length
doc_sample = []
vector_sample = []
for example in islice(get_examples(), 10):
doc_sample.append(example.x)
vector_sample.append(self.model.ops.alloc1f(nO))
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
assert len(vector_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(
X=doc_sample, Y=self.model.ops.asarray(vector_sample, dtype="float32")
)
def update(
self,
examples: Iterable[Example],
*,
drop: float = 0.0,
sgd: Optional[Optimizer] = None,
losses: Optional[Dict[str, float]] = None,
) -> Dict[str, float]:
"""Learn from a batch of documents and gold-standard information,
updating the pipe's model. Delegates to predict and get_loss.
examples (Iterable[Example]): A batch of Example objects.
drop (float): The dropout rate.
sgd (thinc.api.Optimizer): The optimizer.
losses (Dict[str, float]): Optional record of the loss during training.
Updated using the component name as the key.
RETURNS (Dict[str, float]): The updated losses dictionary.
DOCS: https://spacy.io/api/entitylinker#update
"""
self.validate_kb()
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
if not examples:
return losses
validate_examples(examples, "EntityLinker_v1.update")
sentence_docs = []
for eg in examples:
sentences = [s for s in eg.reference.sents]
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.reference.ents:
# KB ID of the first token is the same as the whole span
kb_id = kb_ids[ent.start]
if kb_id:
try:
# find the sentence in the list of sentences.
sent_index = sentences.index(ent.sent)
except AttributeError:
# Catch the exception when ent.sent is None and provide a user-friendly warning
raise RuntimeError(Errors.E030) from None
# get n previous sentences, if there are any
start_sentence = max(0, sent_index - self.n_sents)
# get n posterior sentences, or as many < n as there are
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
# get token positions
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
# append that span as a doc to training
sent_doc = eg.predicted[start_token:end_token].as_doc()
sentence_docs.append(sent_doc)
set_dropout_rate(self.model, drop)
if not sentence_docs:
warnings.warn(Warnings.W093.format(name="Entity Linker"))
return losses
sentence_encodings, bp_context = self.model.begin_update(sentence_docs)
loss, d_scores = self.get_loss(
sentence_encodings=sentence_encodings, examples=examples
)
bp_context(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_loss(self, examples: Iterable[Example], sentence_encodings: Floats2d):
validate_examples(examples, "EntityLinker_v1.get_loss")
entity_encodings = []
for eg in examples:
kb_ids = eg.get_aligned("ENT_KB_ID", as_string=True)
for ent in eg.reference.ents:
kb_id = kb_ids[ent.start]
if kb_id:
entity_encoding = self.kb.get_vector(kb_id)
entity_encodings.append(entity_encoding)
entity_encodings = self.model.ops.asarray2f(entity_encodings)
if sentence_encodings.shape != entity_encodings.shape:
err = Errors.E147.format(
method="get_loss", msg="gold entities do not match up"
)
raise RuntimeError(err)
gradients = self.distance.get_grad(sentence_encodings, entity_encodings)
loss = self.distance.get_loss(sentence_encodings, entity_encodings)
loss = loss / len(entity_encodings)
return float(loss), gradients
def predict(self, docs: Iterable[Doc]) -> List[str]:
"""Apply the pipeline's model to a batch of docs, without modifying them.
Returns the KB IDs for each entity in each doc, including NIL if there is
no prediction.
docs (Iterable[Doc]): The documents to predict.
RETURNS (List[str]): The model's prediction for each document.
DOCS: https://spacy.io/api/entitylinker#predict
"""
self.validate_kb()
entity_count = 0
final_kb_ids: List[str] = []
if not docs:
return final_kb_ids
if isinstance(docs, Doc):
docs = [docs]
for i, doc in enumerate(docs):
sentences = [s for s in doc.sents]
if len(doc) > 0:
# Looping through each entity (TODO: rewrite)
for ent in doc.ents:
sent = ent.sent
sent_index = sentences.index(sent)
assert sent_index >= 0
# get n_neighbour sentences, clipped to the length of the document
start_sentence = max(0, sent_index - self.n_sents)
end_sentence = min(len(sentences) - 1, sent_index + self.n_sents)
start_token = sentences[start_sentence].start
end_token = sentences[end_sentence].end
sent_doc = doc[start_token:end_token].as_doc()
# currently, the context is the same for each entity in a sentence (should be refined)
xp = self.model.ops.xp
if self.incl_context:
sentence_encoding = self.model.predict([sent_doc])[0]
sentence_encoding_t = sentence_encoding.T
sentence_norm = xp.linalg.norm(sentence_encoding_t)
entity_count += 1
if ent.label_ in self.labels_discard:
# ignoring this entity - setting to NIL
final_kb_ids.append(self.NIL)
else:
candidates = list(self.get_candidates(self.kb, ent))
if not candidates:
# no prediction possible for this entity - setting to NIL
final_kb_ids.append(self.NIL)
elif len(candidates) == 1:
# shortcut for efficiency reasons: take the 1 candidate
final_kb_ids.append(candidates[0].entity_)
else:
random.shuffle(candidates)
# set all prior probabilities to 0 if incl_prior=False
prior_probs = xp.asarray([c.prior_prob for c in candidates])
if not self.incl_prior:
prior_probs = xp.asarray([0.0 for _ in candidates])
scores = prior_probs
# add in similarity from the context
if self.incl_context:
entity_encodings = xp.asarray(
[c.entity_vector for c in candidates]
)
entity_norm = xp.linalg.norm(entity_encodings, axis=1)
if len(entity_encodings) != len(prior_probs):
raise RuntimeError(
Errors.E147.format(
method="predict",
msg="vectors not of equal length",
)
)
# cosine similarity
sims = xp.dot(entity_encodings, sentence_encoding_t) / (
sentence_norm * entity_norm
)
if sims.shape != prior_probs.shape:
raise ValueError(Errors.E161)
scores = prior_probs + sims - (prior_probs * sims)
best_index = scores.argmax().item()
best_candidate = candidates[best_index]
final_kb_ids.append(best_candidate.entity_)
if not (len(final_kb_ids) == entity_count):
err = Errors.E147.format(
method="predict", msg="result variables not of equal length"
)
raise RuntimeError(err)
return final_kb_ids
def set_annotations(self, docs: Iterable[Doc], kb_ids: List[str]) -> None:
"""Modify a batch of documents, using pre-computed scores.
docs (Iterable[Doc]): The documents to modify.
kb_ids (List[str]): The IDs to set, produced by EntityLinker.predict.
DOCS: https://spacy.io/api/entitylinker#set_annotations
"""
count_ents = len([ent for doc in docs for ent in doc.ents])
if count_ents != len(kb_ids):
raise ValueError(Errors.E148.format(ents=count_ents, ids=len(kb_ids)))
i = 0
overwrite = self.cfg["overwrite"]
for doc in docs:
for ent in doc.ents:
kb_id = kb_ids[i]
i += 1
for token in ent:
if token.ent_kb_id == 0 or overwrite:
token.ent_kb_id_ = kb_id
def to_bytes(self, *, exclude=tuple()):
"""Serialize the pipe to a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (bytes): The serialized object.
DOCS: https://spacy.io/api/entitylinker#to_bytes
"""
self._validate_serialization_attrs()
serialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
serialize["cfg"] = lambda: srsly.json_dumps(self.cfg)
serialize["vocab"] = lambda: self.vocab.to_bytes(exclude=exclude)
serialize["kb"] = self.kb.to_bytes
serialize["model"] = self.model.to_bytes
return util.to_bytes(serialize, exclude)
def from_bytes(self, bytes_data, *, exclude=tuple()):
"""Load the pipe from a bytestring.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (TrainablePipe): The loaded object.
DOCS: https://spacy.io/api/entitylinker#from_bytes
"""
self._validate_serialization_attrs()
def load_model(b):
try:
self.model.from_bytes(b)
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize = {}
if hasattr(self, "cfg") and self.cfg is not None:
deserialize["cfg"] = lambda b: self.cfg.update(srsly.json_loads(b))
deserialize["vocab"] = lambda b: self.vocab.from_bytes(b, exclude=exclude)
deserialize["kb"] = lambda b: self.kb.from_bytes(b)
deserialize["model"] = load_model
util.from_bytes(bytes_data, deserialize, exclude)
return self
def to_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> None:
"""Serialize the pipe to disk.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
DOCS: https://spacy.io/api/entitylinker#to_disk
"""
serialize = {}
serialize["vocab"] = lambda p: self.vocab.to_disk(p, exclude=exclude)
serialize["cfg"] = lambda p: srsly.write_json(p, self.cfg)
serialize["kb"] = lambda p: self.kb.to_disk(p)
serialize["model"] = lambda p: self.model.to_disk(p)
util.to_disk(path, serialize, exclude)
def from_disk(
self, path: Union[str, Path], *, exclude: Iterable[str] = SimpleFrozenList()
) -> "EntityLinker_v1":
"""Load the pipe from disk. Modifies the object in place and returns it.
path (str / Path): Path to a directory.
exclude (Iterable[str]): String names of serialization fields to exclude.
RETURNS (EntityLinker_v1): The modified EntityLinker_v1 object.
DOCS: https://spacy.io/api/entitylinker#from_disk
"""
def load_model(p):
try:
with p.open("rb") as infile:
self.model.from_bytes(infile.read())
except AttributeError:
raise ValueError(Errors.E149) from None
deserialize: Dict[str, Callable[[Any], Any]] = {}
deserialize["cfg"] = lambda p: self.cfg.update(deserialize_config(p))
deserialize["vocab"] = lambda p: self.vocab.from_disk(p, exclude=exclude)
deserialize["kb"] = lambda p: self.kb.from_disk(p)
deserialize["model"] = load_model
util.from_disk(path, deserialize, exclude)
return self
def rehearse(self, examples, *, sgd=None, losses=None, **config):
raise NotImplementedError
def add_label(self, label):
raise NotImplementedError
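# Illustrative sketch (not part of the component above): how predict() combines
# the KB prior probability with the context similarity when both incl_prior and
# incl_context are enabled. numpy stands in for self.model.ops.xp; the numbers
# are made up.
import numpy

prior_probs = numpy.asarray([0.7, 0.2])   # P(entity | mention) from the KB
sims = numpy.asarray([0.1, 0.9])          # cosine(sentence encoding, entity vector)
scores = prior_probs + sims - (prior_probs * sims)   # probabilistic OR, as above
best_index = scores.argmax().item()       # -> 1: strong context overrides the weaker prior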

View File

@ -4,22 +4,22 @@ from typing import Optional, Iterable, Callable
from thinc.api import Model, Config
from ._parser_internals.transition_system import TransitionSystem
from .transition_parser cimport Parser
from ._parser_internals.ner cimport BiluoPushDown
from .transition_parser import Parser
from ._parser_internals.ner import BiluoPushDown
from ..language import Language
from ..scorer import get_ner_prf, PRFScore
from ..training import validate_examples
from ..util import registry
from ..training import remove_bilu_prefix
default_model_config = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v2"
@ -44,8 +44,12 @@ DEFAULT_NER_MODEL = Config().from_str(default_model_config)["model"]
"incorrect_spans_key": None,
"scorer": {"@scorers": "spacy.ner_scorer.v1"},
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_ner(
nlp: Language,
@ -98,6 +102,7 @@ def make_ner(
scorer=scorer,
)
@Language.factory(
"beam_ner",
assigns=["doc.ents", "token.ent_iob", "token.ent_type"],
@ -111,7 +116,12 @@ def make_ner(
"incorrect_spans_key": None,
"scorer": None,
},
default_score_weights={"ents_f": 1.0, "ents_p": 0.0, "ents_r": 0.0, "ents_per_type": None},
default_score_weights={
"ents_f": 1.0,
"ents_p": 0.0,
"ents_r": 0.0,
"ents_per_type": None,
},
)
def make_beam_ner(
nlp: Language,
@ -185,11 +195,12 @@ def make_ner_scorer():
return ner_score
cdef class EntityRecognizer(Parser):
class EntityRecognizer(Parser):
"""Pipeline component for named entity recognition.
DOCS: https://spacy.io/api/entityrecognizer
"""
TransitionSystem = BiluoPushDown
def __init__(
@ -207,15 +218,14 @@ cdef class EntityRecognizer(Parser):
incorrect_spans_key=None,
scorer=ner_score,
):
"""Create an EntityRecognizer.
"""
"""Create an EntityRecognizer."""
super().__init__(
vocab,
model,
name,
moves,
update_with_oracle_cut_size=update_with_oracle_cut_size,
min_action_freq=1, # not relevant for NER
min_action_freq=1, # not relevant for NER
learn_tokens=False, # not relevant for NER
beam_width=beam_width,
beam_density=beam_density,
@ -242,8 +252,11 @@ cdef class EntityRecognizer(Parser):
def labels(self):
# Get the labels from the model by looking at the available moves, e.g.
# B-PERSON, I-PERSON, L-PERSON, U-PERSON
labels = set(remove_bilu_prefix(move) for move in self.move_names
if move[0] in ("B", "I", "L", "U"))
labels = set(
remove_bilu_prefix(move)
for move in self.move_names
if move[0] in ("B", "I", "L", "U")
)
return tuple(sorted(labels))
def scored_ents(self, beams):

View File

@ -87,6 +87,10 @@ cdef class Pipe:
return self.scorer(examples, **scorer_kwargs)
return {}
@property
def is_distillable(self) -> bool:
return False
@property
def is_trainable(self) -> bool:
return False

View File

@ -1,12 +1,11 @@
from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast
from typing import Union
from typing import Union, Protocol, runtime_checkable
from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops
from thinc.api import Optimizer
from thinc.types import Ragged, Ints2d, Floats2d
import numpy
from ..compat import Protocol, runtime_checkable
from ..scorer import Scorer
from ..language import Language
from .trainable_pipe import TrainablePipe

View File

@ -1,5 +1,6 @@
# cython: infer_types=True, profile=True, binding=True
from typing import Callable, Dict, Iterable, List, Optional, Union
from typing import Tuple
import numpy
import srsly
from thinc.api import Model, set_dropout_rate, Config
@ -245,7 +246,6 @@ class Tagger(TrainablePipe):
DOCS: https://spacy.io/api/tagger#rehearse
"""
loss_func = LegacySequenceCategoricalCrossentropy()
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
@ -259,12 +259,32 @@ class Tagger(TrainablePipe):
set_dropout_rate(self.model, drop)
tag_scores, bp_tag_scores = self.model.begin_update(docs)
tutor_tag_scores, _ = self._rehearsal_model.begin_update(docs)
grads, loss = loss_func(tag_scores, tutor_tag_scores)
loss, grads = self.get_teacher_student_loss(tutor_tag_scores, tag_scores)
bp_tag_scores(grads)
self.finish_update(sgd)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d]
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/tagger#get_teacher_student_loss
"""
loss_func = LegacySequenceCategoricalCrossentropy(normalize=False)
d_scores, loss = loss_func(student_scores, teacher_scores)
if self.model.ops.xp.isnan(loss):
raise ValueError(Errors.E910.format(name=self.name))
return float(loss), d_scores
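# Sketch of what the teacher-student objective computes, using plain numpy in
# place of LegacySequenceCategoricalCrossentropy; the tag distributions below
# are made up and the snippet is illustrative only.
import numpy

teacher_probs = numpy.array([[0.8, 0.1, 0.1]])         # teacher's normalized tag scores
student_logits = numpy.array([[2.0, 0.5, 0.1]])
student_probs = numpy.exp(student_logits)
student_probs /= student_probs.sum(axis=-1, keepdims=True)
loss = -(teacher_probs * numpy.log(student_probs)).sum()
d_logits = student_probs - teacher_probs                # gradient w.r.t. student logits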
def get_loss(self, examples, scores):
"""Find the loss and gradient of loss for the batch of documents and
their predicted scores.

View File

@ -6,7 +6,7 @@ import warnings
from ..tokens.doc cimport Doc
from ..training import validate_examples
from ..training import validate_examples, validate_distillation_examples
from ..errors import Errors, Warnings
from .pipe import Pipe, deserialize_config
from .. import util
@ -56,6 +56,53 @@ cdef class TrainablePipe(Pipe):
except Exception as e:
error_handler(self.name, self, [doc], e)
def distill(self,
teacher_pipe: Optional["TrainablePipe"],
examples: Iterable["Example"],
*,
drop: float=0.0,
sgd: Optional[Optimizer]=None,
losses: Optional[Dict[str, float]]=None) -> Dict[str, float]:
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is typically trained on the probability
distribution of the teacher, but details may differ per pipe.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
from.
examples (Iterable[Example]): Distillation examples. The reference
(teacher) and predicted (student) docs must have the same number of
tokens and the same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/pipe#distill
"""
# By default we require a teacher pipe, but there are downstream
# implementations that don't require a pipe.
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_distillation_examples(examples, "TrainablePipe.distill")
set_dropout_rate(self.model, drop)
for node in teacher_pipe.model.walk():
if node.name == "softmax":
node.attrs["softmax_normalize"] = True
teacher_scores = teacher_pipe.model.predict([eg.reference for eg in examples])
student_scores, bp_student_scores = self.model.begin_update([eg.predicted for eg in examples])
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
bp_student_scores(d_scores)
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
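# Usage sketch for distill(), assuming a trained teacher pipeline and a freshly
# initialized student with a matching component; the variable names and the
# surrounding loop are hypothetical.
#
#     teacher_tagger = teacher_nlp.get_pipe("tagger")
#     student_tagger = student_nlp.get_pipe("tagger")
#     losses = {}
#     for batch in distillation_batches:       # each batch is an Iterable[Example]
#         student_tagger.distill(teacher_tagger, batch, sgd=optimizer, losses=losses)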
def pipe(self, stream: Iterable[Doc], *, batch_size: int=128) -> Iterator[Doc]:
"""Apply the pipe to a stream of documents. This usually happens under
the hood when the nlp object is called on a text and all components are
@ -169,6 +216,19 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_loss", name=self.name))
def get_teacher_student_loss(self, teacher_scores, student_scores):
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/pipe#get_teacher_student_loss
"""
raise NotImplementedError(Errors.E931.format(parent="TrainablePipe", method="get_teacher_student_loss", name=self.name))
def create_optimizer(self) -> Optimizer:
"""Create an optimizer for the pipeline component.
@ -205,6 +265,14 @@ cdef class TrainablePipe(Pipe):
"""
raise NotImplementedError(Errors.E931.format(parent="Pipe", method="add_label", name=self.name))
@property
def is_distillable(self) -> bool:
# Normally a pipe overrides `get_teacher_student_loss` to implement
# distillation. In more exceptional cases, a pipe can provide its
# own `distill` implementation. If neither of these methods is
# overridden, the pipe does not implement distillation.
return not (self.__class__.distill is TrainablePipe.distill and self.__class__.get_teacher_student_loss is TrainablePipe.get_teacher_student_loss)
@property
def is_trainable(self) -> bool:
return True

View File

@ -1,21 +0,0 @@
from cymem.cymem cimport Pool
from thinc.backends.cblas cimport CBlas
from ..vocab cimport Vocab
from .trainable_pipe cimport TrainablePipe
from ._parser_internals.transition_system cimport Transition, TransitionSystem
from ._parser_internals._state cimport StateC
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC
cdef class Parser(TrainablePipe):
cdef public object _rehearsal_model
cdef readonly TransitionSystem moves
cdef public object _multitasks
cdef object _cpu_ops
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil

View File

@ -1,5 +1,6 @@
# cython: infer_types=True, cdivision=True, boundscheck=False, binding=True
from __future__ import print_function
from typing import Dict, Iterable, List, Optional, Tuple
from cymem.cymem cimport Pool
cimport numpy as np
from itertools import islice
@ -7,25 +8,30 @@ from libcpp.vector cimport vector
from libc.string cimport memset, memcpy
from libc.stdlib cimport calloc, free
import random
import contextlib
import srsly
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps
from thinc.api import get_ops, set_dropout_rate, CupyOps, NumpyOps, Optimizer
from thinc.api import chain, softmax_activation, use_ops, get_array_module
from thinc.legacy import LegacySequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d
import numpy.random
import numpy
import warnings
from ._parser_internals.stateclass cimport StateClass
from ..ml.tb_framework import TransitionModelInputs
from ._parser_internals.stateclass cimport StateC, StateClass
from ._parser_internals.search cimport Beam
from ..ml.parser_model cimport alloc_activations, free_activations
from ..ml.parser_model cimport predict_states, arg_max_if_valid
from ..ml.parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
from ..ml.parser_model cimport get_c_weights, get_c_sizes
from ..tokens.doc cimport Doc
from .trainable_pipe import TrainablePipe
from .trainable_pipe cimport TrainablePipe
from ._parser_internals cimport _beam_utils
from ._parser_internals import _beam_utils
from ..vocab cimport Vocab
from ._parser_internals.transition_system cimport Transition, TransitionSystem
from ..typedefs cimport weight_t
from ..training import validate_examples, validate_get_examples
from ..training import validate_distillation_examples
from ..errors import Errors, Warnings
from .. import util
@ -33,7 +39,7 @@ from .. import util
NUMPY_OPS = NumpyOps()
cdef class Parser(TrainablePipe):
class Parser(TrainablePipe):
"""
Base class of the DependencyParser and EntityRecognizer.
"""
@ -133,8 +139,9 @@ cdef class Parser(TrainablePipe):
@property
def move_names(self):
names = []
cdef TransitionSystem moves = self.moves
for i in range(self.moves.n_moves):
name = self.moves.move_name(self.moves.c[i].move, self.moves.c[i].label)
name = self.moves.move_name(moves.c[i].move, moves.c[i].label)
# Explicitly removing the internal "U-" token used for blocking entities
if name != "U-":
names.append(name)
@ -203,6 +210,118 @@ cdef class Parser(TrainablePipe):
# Defined in subclasses, to avoid circular import
raise NotImplementedError
def distill(self,
teacher_pipe: Optional[TrainablePipe],
examples: Iterable["Example"],
*,
drop: float=0.0,
sgd: Optional[Optimizer]=None,
losses: Optional[Dict[str, float]]=None):
"""Train a pipe (the student) on the predictions of another pipe
(the teacher). The student is trained on the transition probabilities
of the teacher.
teacher_pipe (Optional[TrainablePipe]): The teacher pipe to learn
from.
examples (Iterable[Example]): Distillation examples. The reference
(teacher) and predicted (student) docs must have the same number of
tokens and the same orthography.
drop (float): dropout rate.
sgd (Optional[Optimizer]): An optimizer. Will be created via
create_optimizer if not set.
losses (Optional[Dict[str, float]]): Optional record of loss during
distillation.
RETURNS: The updated losses dictionary.
DOCS: https://spacy.io/api/dependencyparser#distill
"""
if teacher_pipe is None:
raise ValueError(Errors.E4002.format(name=self.name))
if losses is None:
losses = {}
losses.setdefault(self.name, 0.0)
validate_distillation_examples(examples, "TransitionParser.distill")
set_dropout_rate(self.model, drop)
student_docs = [eg.predicted for eg in examples]
max_moves = self.cfg["update_with_oracle_cut_size"]
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length. Since we do not have a gold standard
# sequence, we use the teacher's predictions as the gold
# standard.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
states = self._init_batch(teacher_pipe, student_docs, max_moves)
else:
states = self.moves.init_batch(student_docs)
# We distill as follows: 1. we first let the student predict transition
# sequences (and the corresponding transition probabilities); (2) we
# let the teacher follow the student's predicted transition sequences
# to obtain the teacher's transition probabilities; (3) we compute the
# gradients of the student's transition distributions relative to the
# teacher's distributions.
student_inputs = TransitionModelInputs(docs=student_docs, moves=self.moves,
max_moves=max_moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states)
teacher_inputs = TransitionModelInputs(docs=[eg.reference for eg in examples],
moves=self.moves, actions=actions)
(_, teacher_scores) = teacher_pipe.model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores)
backprop_scores((student_states, d_scores))
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss
return losses
def get_teacher_student_loss(
self, teacher_scores: List[Floats2d], student_scores: List[Floats2d],
normalize: bool=False,
) -> Tuple[float, List[Floats2d]]:
"""Calculate the loss and its gradient for a batch of student
scores, relative to teacher scores.
teacher_scores: Scores representing the teacher model's predictions.
student_scores: Scores representing the student model's predictions.
RETURNS (Tuple[float, List[Floats2d]]): The loss and the gradient.
DOCS: https://spacy.io/api/dependencyparser#get_teacher_student_loss
"""
# We can't easily hook up a softmax layer in the parsing model, since
# the get_loss does additional masking. So, we could apply softmax
# manually here and use Thinc's cross-entropy loss. But it's a bit
# suboptimal, since we can have a lot of states that would result in
# many kernel launches. Furthermore, the parsing model's backprop expects
# an XP array, so we'd have to concat the softmaxes anyway. So, like
# the get_loss implementation, we'll compute the loss and gradients
# ourselves.
teacher_scores = self.model.ops.softmax(self.model.ops.xp.vstack(teacher_scores),
axis=-1, inplace=True)
student_scores = self.model.ops.softmax(self.model.ops.xp.vstack(student_scores),
axis=-1, inplace=True)
assert teacher_scores.shape == student_scores.shape
d_scores = student_scores - teacher_scores
if normalize:
d_scores /= d_scores.shape[0]
loss = (d_scores**2).sum() / d_scores.size
return float(loss), d_scores
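# Numeric sketch of the loss above, with numpy standing in for self.model.ops;
# the two score matrices are made up (rows = states, columns = transitions).
import numpy

def _softmax(x):
    e = numpy.exp(x - x.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

teacher = _softmax(numpy.array([[3.0, 1.0, 0.0], [0.5, 2.5, 0.0]]))
student = _softmax(numpy.array([[2.0, 2.0, 0.0], [0.0, 3.0, 0.0]]))
d_scores = student - teacher                   # gradient per state/transition
loss = (d_scores ** 2).sum() / d_scores.size   # mean squared difference, as above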
def init_multitask_objectives(self, get_examples, pipeline, **cfg):
"""Setup models for secondary objectives, to benefit from multi-task
learning. This method is intended to be overridden by subclasses.
@ -223,9 +342,6 @@ cdef class Parser(TrainablePipe):
stream: The sequence of documents to process.
batch_size (int): Number of documents to accumulate into a working set.
error_handler (Callable[[str, List[Doc], Exception], Any]): Function that
deals with a failing batch of documents. The default function just reraises
the exception.
YIELDS (Doc): Documents, in order.
"""
@ -247,78 +363,29 @@ cdef class Parser(TrainablePipe):
def predict(self, docs):
if isinstance(docs, Doc):
docs = [docs]
self._ensure_labels_are_added(docs)
if not any(len(doc) for doc in docs):
result = self.moves.init_batch(docs)
return result
if self.cfg["beam_width"] == 1:
return self.greedy_parse(docs, drop=0.0)
else:
return self.beam_parse(
docs,
drop=0.0,
beam_width=self.cfg["beam_width"],
beam_density=self.cfg["beam_density"]
)
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
states_or_beams, _ = self.model.predict(inputs)
return states_or_beams
def greedy_parse(self, docs, drop=0.):
cdef vector[StateC*] states
cdef StateClass state
cdef CBlas cblas = self._cpu_ops.cblas()
self._resize()
self._ensure_labels_are_added(docs)
set_dropout_rate(self.model, drop)
batch = self.moves.init_batch(docs)
model = self.model.predict(docs)
weights = get_c_weights(model)
for state in batch:
if not state.is_final():
states.push_back(state.c)
sizes = get_c_sizes(model, states.size())
with nogil:
self._parseC(cblas, &states[0], weights, sizes)
model.clear_memory()
del model
return batch
with _change_attrs(self.model, beam_width=1):
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
states, _ = self.model.predict(inputs)
return states
def beam_parse(self, docs, int beam_width, float drop=0., beam_density=0.):
cdef Beam beam
cdef Doc doc
self._ensure_labels_are_added(docs)
batch = _beam_utils.BeamBatch(
self.moves,
self.moves.init_batch(docs),
None,
beam_width,
density=beam_density
)
model = self.model.predict(docs)
while not batch.is_done:
states = batch.get_unfinished_states()
if not states:
break
scores = model.predict(states)
batch.advance(scores)
model.clear_memory()
del model
return list(batch)
cdef void _parseC(self, CBlas cblas, StateC** states,
WeightsC weights, SizesC sizes) nogil:
cdef int i, j
cdef vector[StateC*] unfinished
cdef ActivationsC activations = alloc_activations(sizes)
while sizes.states >= 1:
predict_states(cblas, &activations, states, &weights, sizes)
# Validate actions, argmax, take action.
self.c_transition_batch(states,
activations.scores, sizes.classes, sizes.states)
for i in range(sizes.states):
if not states[i].is_final():
unfinished.push_back(states[i])
for i in range(unfinished.size()):
states[i] = unfinished[i]
sizes.states = unfinished.size()
unfinished.clear()
free_activations(&activations)
with _change_attrs(self.model, beam_width=self.cfg["beam_width"], beam_density=self.cfg["beam_density"]):
inputs = TransitionModelInputs(docs=docs, moves=self.moves)
beams, _ = self.model.predict(inputs)
return beams
def set_annotations(self, docs, states_or_beams):
cdef StateClass state
@ -330,35 +397,6 @@ cdef class Parser(TrainablePipe):
for hook in self.postprocesses:
hook(doc)
def transition_states(self, states, float[:, ::1] scores):
cdef StateClass state
cdef float* c_scores = &scores[0, 0]
cdef vector[StateC*] c_states
for state in states:
c_states.push_back(state.c)
self.c_transition_batch(&c_states[0], c_scores, scores.shape[1], scores.shape[0])
return [state for state in states if not state.c.is_final()]
cdef void c_transition_batch(self, StateC** states, const float* scores,
int nr_class, int batch_size) nogil:
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
with gil:
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>calloc(self.moves.n_moves, sizeof(int))
cdef int i, guess
cdef Transition action
for i in range(batch_size):
self.moves.set_valid(is_valid, states[i])
guess = arg_max_if_valid(&scores[i*nr_class], is_valid, nr_class)
if guess == -1:
# This shouldn't happen, but it's hard to raise an error here,
# and we don't want to infinite loop. So, force to end state.
states[i].force_final()
else:
action = self.moves.c[guess]
action.do(states[i], action.label)
free(is_valid)
def update(self, examples, *, drop=0., sgd=None, losses=None):
cdef StateClass state
if losses is None:
@ -370,67 +408,99 @@ cdef class Parser(TrainablePipe):
)
for multitask in self._multitasks:
multitask.update(examples, drop=drop, sgd=sgd)
# We need to take care to act on the whole batch, because we might be
# getting vectors via a listener.
n_examples = len([eg for eg in examples if self.moves.has_gold(eg)])
if n_examples == 0:
return losses
set_dropout_rate(self.model, drop)
# The probability we use beam update, instead of falling back to
# a greedy update
beam_update_prob = self.cfg["beam_update_prob"]
if self.cfg['beam_width'] >= 2 and numpy.random.random() < beam_update_prob:
return self.update_beam(
examples,
beam_width=self.cfg["beam_width"],
sgd=sgd,
losses=losses,
beam_density=self.cfg["beam_density"]
)
docs = [eg.x for eg in examples if len(eg.x)]
max_moves = self.cfg["update_with_oracle_cut_size"]
if max_moves >= 1:
# Chop sequences into lengths of this many words, to make the
# batch uniform length.
max_moves = int(random.uniform(max_moves // 2, max_moves * 2))
states, golds, _ = self._init_gold_batch(
max_moves = int(random.uniform(max(max_moves // 2, 1), max_moves * 2))
init_states, gold_states, _ = self._init_gold_batch(
examples,
max_length=max_moves
)
else:
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses
model, backprop_tok2vec = self.model.begin_update([eg.x for eg in examples])
all_states = list(states)
states_golds = list(zip(states, golds))
n_moves = 0
while states_golds:
states, golds = zip(*states_golds)
scores, backprop = model.begin_update(states)
d_scores = self.get_batch_loss(states, golds, scores, losses)
# Note that the gradient isn't normalized by the batch size
# here, because our "samples" are really the states...But we
# can't normalize by the number of states either, as then we'd
# be getting smaller gradients for states in long sequences.
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, scores)
states_golds = [(s, g) for (s, g) in zip(states, golds) if not s.is_final()]
if max_moves >= 1 and n_moves >= max_moves:
break
n_moves += 1
init_states, gold_states, _ = self.moves.init_gold_batch(examples)
backprop_tok2vec(golds)
inputs = TransitionModelInputs(docs=docs, moves=self.moves,
max_moves=max_moves, states=[state.copy() for state in init_states])
(pred_states, scores), backprop_scores = self.model.begin_update(inputs)
if sum(s.shape[0] for s in scores) == 0:
return losses
d_scores = self.get_loss((gold_states, init_states, pred_states, scores),
examples, max_moves)
backprop_scores((pred_states, d_scores))
if sgd not in (None, False):
self.finish_update(sgd)
losses[self.name] += float((d_scores**2).sum())
# Ugh, this is annoying. If we're working on GPU, we want to free the
# memory ASAP. It seems that Python doesn't necessarily get around to
# removing these in time if we don't explicitly delete? It's confusing.
del backprop
del backprop_tok2vec
model.clear_memory()
del model
del backprop_scores
return losses
def get_loss(self, states_scores, examples, max_moves):
gold_states, init_states, pred_states, scores = states_scores
scores = self.model.ops.xp.vstack(scores)
costs = self._get_costs_from_histories(
examples,
gold_states,
init_states,
[list(state.history) for state in pred_states],
max_moves
)
xp = get_array_module(scores)
best_costs = costs.min(axis=1, keepdims=True)
gscores = scores.copy()
min_score = scores.min() - 1000
assert costs.shape == scores.shape, (costs.shape, scores.shape)
gscores[costs > best_costs] = min_score
max_ = scores.max(axis=1, keepdims=True)
gmax = gscores.max(axis=1, keepdims=True)
exp_scores = xp.exp(scores - max_)
exp_gscores = xp.exp(gscores - gmax)
Z = exp_scores.sum(axis=1, keepdims=True)
gZ = exp_gscores.sum(axis=1, keepdims=True)
d_scores = exp_scores / Z
d_scores -= (costs <= best_costs) * (exp_gscores / gZ)
return d_scores
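# Numeric sketch of the gradient computed above for a single state with three
# possible transitions; the scores and costs are made up.
import numpy

scores = numpy.array([[2.0, 1.0, 0.5]])
costs = numpy.array([[1.0, 0.0, 3.0]])             # transition 1 is the only zero-cost one
best_costs = costs.min(axis=1, keepdims=True)
gscores = scores.copy()
gscores[costs > best_costs] = scores.min() - 1000   # mask out non-optimal transitions
exp_scores = numpy.exp(scores - scores.max(axis=1, keepdims=True))
exp_gscores = numpy.exp(gscores - gscores.max(axis=1, keepdims=True))
d_scores = exp_scores / exp_scores.sum(axis=1, keepdims=True)
d_scores -= (costs <= best_costs) * (exp_gscores / exp_gscores.sum(axis=1, keepdims=True))
# d_scores is softmax(scores) minus the distribution restricted to zero-cost
# transitions: positive for over-predicted bad moves, negative for the good one.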
def _get_costs_from_histories(self, examples, gold_states, init_states, histories, max_moves):
cdef TransitionSystem moves = self.moves
cdef StateClass state
cdef int clas
cdef int nF = self.model.get_dim("nF")
cdef int nO = moves.n_moves
cdef int nS = sum([len(history) for history in histories])
cdef Pool mem = Pool()
cdef np.ndarray costs_i
is_valid = <int*>mem.alloc(nO, sizeof(int))
batch = list(zip(init_states, histories, gold_states))
n_moves = 0
output = []
while batch:
costs = numpy.zeros((len(batch), nO), dtype="f")
for i, (state, history, gold) in enumerate(batch):
costs_i = costs[i]
clas = history.pop(0)
moves.set_costs(is_valid, <weight_t*>costs_i.data, state.c, gold)
action = moves.c[clas]
action.do(state.c, action.label)
state.c.history.push_back(clas)
output.append(costs)
batch = [(s, h, g) for s, h, g in batch if len(h) != 0]
if n_moves >= max_moves >= 1:
break
n_moves += 1
return self.model.ops.xp.vstack(output)
def rehearse(self, examples, sgd=None, losses=None, **cfg):
"""Perform a "rehearsal" update, to prevent catastrophic forgetting."""
if losses is None:
@ -440,10 +510,9 @@ cdef class Parser(TrainablePipe):
multitask.rehearse(examples, losses=losses, sgd=sgd)
if self._rehearsal_model is None:
return None
losses.setdefault(self.name, 0.)
losses.setdefault(self.name, 0.0)
validate_examples(examples, "Parser.rehearse")
docs = [eg.predicted for eg in examples]
states = self.moves.init_batch(docs)
# This is pretty dirty, but the NER can resize itself in init_batch,
# if labels are missing. We therefore have to check whether we need to
# expand our model output.
@ -451,85 +520,33 @@ cdef class Parser(TrainablePipe):
# Prepare the stepwise model, and get the callback for finishing the batch
set_dropout_rate(self._rehearsal_model, 0.0)
set_dropout_rate(self.model, 0.0)
tutor, _ = self._rehearsal_model.begin_update(docs)
model, backprop_tok2vec = self.model.begin_update(docs)
n_scores = 0.
loss = 0.
while states:
targets, _ = tutor.begin_update(states)
guesses, backprop = model.begin_update(states)
d_scores = (guesses - targets) / targets.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
loss += (d_scores**2).sum()
backprop(d_scores)
# Follow the predicted action
self.transition_states(states, guesses)
states = [state for state in states if not state.is_final()]
n_scores += d_scores.size
# Do the backprop
backprop_tok2vec(docs)
student_inputs = TransitionModelInputs(docs=docs, moves=self.moves)
(student_states, student_scores), backprop_scores = self.model.begin_update(student_inputs)
actions = states2actions(student_states)
teacher_inputs = TransitionModelInputs(docs=docs, moves=self.moves, actions=actions)
_, teacher_scores = self._rehearsal_model.predict(teacher_inputs)
loss, d_scores = self.get_teacher_student_loss(teacher_scores, student_scores, normalize=True)
teacher_scores = self.model.ops.xp.vstack(teacher_scores)
student_scores = self.model.ops.xp.vstack(student_scores)
assert teacher_scores.shape == student_scores.shape
d_scores = (student_scores - teacher_scores) / teacher_scores.shape[0]
# If all weights for an output are 0 in the original model, don't
# supervise that output. This allows us to add classes.
loss = (d_scores**2).sum() / d_scores.size
backprop_scores((student_states, d_scores))
if sgd is not None:
self.finish_update(sgd)
losses[self.name] += loss / n_scores
del backprop
del backprop_tok2vec
model.clear_memory()
tutor.clear_memory()
del model
del tutor
losses[self.name] += loss
return losses
def update_beam(self, examples, *, beam_width,
drop=0., sgd=None, losses=None, beam_density=0.0):
states, golds, _ = self.moves.init_gold_batch(examples)
if not states:
return losses
# Prepare the stepwise model, and get the callback for finishing the batch
model, backprop_tok2vec = self.model.begin_update(
[eg.predicted for eg in examples])
loss = _beam_utils.update_beam(
self.moves,
states,
golds,
model,
beam_width,
beam_density=beam_density,
)
losses[self.name] += loss
backprop_tok2vec(golds)
if sgd is not None:
self.finish_update(sgd)
def get_batch_loss(self, states, golds, float[:, ::1] scores, losses):
cdef StateClass state
cdef Pool mem = Pool()
cdef int i
# n_moves should not be zero at this point, but make sure to avoid zero-length mem alloc
assert self.moves.n_moves > 0, Errors.E924.format(name=self.name)
is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int))
costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float))
cdef np.ndarray d_scores = numpy.zeros((len(states), self.moves.n_moves),
dtype='f', order='C')
c_d_scores = <float*>d_scores.data
unseen_classes = self.model.attrs["unseen_classes"]
for i, (state, gold) in enumerate(zip(states, golds)):
memset(is_valid, 0, self.moves.n_moves * sizeof(int))
memset(costs, 0, self.moves.n_moves * sizeof(float))
self.moves.set_costs(is_valid, costs, state.c, gold)
for j in range(self.moves.n_moves):
if costs[j] <= 0.0 and j in unseen_classes:
unseen_classes.remove(j)
cpu_log_loss(c_d_scores,
costs, is_valid, &scores[i, 0], d_scores.shape[1])
c_d_scores += d_scores.shape[1]
# Note that we don't normalize this. See comment in update() for why.
if losses is not None:
losses.setdefault(self.name, 0.)
losses[self.name] += (d_scores**2).sum()
return d_scores
raise NotImplementedError
def set_output(self, nO):
self.model.attrs["resize_output"](self.model, nO)
@ -568,7 +585,7 @@ cdef class Parser(TrainablePipe):
for example in islice(get_examples(), 10):
doc_sample.append(example.predicted)
assert len(doc_sample) > 0, Errors.E923.format(name=self.name)
self.model.initialize(doc_sample)
self.model.initialize((doc_sample, self.moves))
if nlp is not None:
self.init_multitask_objectives(get_examples, nlp.pipeline)
@ -625,28 +642,63 @@ cdef class Parser(TrainablePipe):
raise ValueError(Errors.E149) from None
return self
def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition
def _init_batch(self, teacher_step_model, docs, max_length):
"""Make a square batch of length equal to the shortest transition
sequence or a cap. A long
doc will get multiple states. Let's say we have a doc of length 2*N,
where N is the shortest doc. We'll make two states, one representing
long_doc[:N], and another representing long_doc[N:]."""
long_doc[:N], and another representing long_doc[N:]. In contrast to
_init_gold_batch, this version uses a teacher model to generate the
cut sequences."""
cdef:
StateClass start_state
StateClass state
Transition action
all_states = self.moves.init_batch([eg.predicted for eg in examples])
all_states = self.moves.init_batch(docs)
states = []
to_cut = []
for state, doc in zip(all_states, docs):
if not state.is_final():
if len(doc) < max_length:
states.append(state)
else:
to_cut.append(state)
while to_cut:
states.extend(state.copy() for state in to_cut)
# Move states forward max_length actions.
length = 0
while to_cut and length < max_length:
teacher_scores = teacher_step_model.predict(to_cut)
self.transition_states(to_cut, teacher_scores)
# States that are completed do not need further cutting.
to_cut = [state for state in to_cut if not state.is_final()]
length += 1
return states
def _init_gold_batch(self, examples, max_length):
"""Make a square batch, of length equal to the shortest transition
sequence or a cap. A long doc will get multiple states. Let's say we
have a doc of length 2*N, where N is the shortest doc. We'll make
two states, one representing long_doc[:N], and another representing
long_doc[N:]."""
cdef:
StateClass start_state
StateClass state
Transition action
TransitionSystem moves = self.moves
all_states = moves.init_batch([eg.predicted for eg in examples])
states = []
golds = []
to_cut = []
for state, eg in zip(all_states, examples):
if self.moves.has_gold(eg) and not state.is_final():
gold = self.moves.init_gold(state, eg)
if moves.has_gold(eg) and not state.is_final():
gold = moves.init_gold(state, eg)
if len(eg.x) < max_length:
states.append(state)
golds.append(gold)
else:
oracle_actions = self.moves.get_oracle_sequence_from_state(
oracle_actions = moves.get_oracle_sequence_from_state(
state.copy(), gold)
to_cut.append((eg, state, gold, oracle_actions))
if not to_cut:
@ -656,13 +708,52 @@ cdef class Parser(TrainablePipe):
for i in range(0, len(oracle_actions), max_length):
start_state = state.copy()
for clas in oracle_actions[i:i+max_length]:
action = self.moves.c[clas]
action = moves.c[clas]
action.do(state.c, action.label)
if state.is_final():
break
if self.moves.has_gold(eg, start_state.B(0), state.B(0)):
if moves.has_gold(eg, start_state.B(0), state.B(0)):
states.append(start_state)
golds.append(gold)
if state.is_final():
break
return states, golds, max_length
@contextlib.contextmanager
def _change_attrs(model, **kwargs):
"""Temporarily modify a thinc model's attributes."""
unset = object()
old_attrs = {}
for key, value in kwargs.items():
old_attrs[key] = model.attrs.get(key, unset)
model.attrs[key] = value
yield model
for key, value in old_attrs.items():
if value is unset:
model.attrs.pop(key)
else:
model.attrs[key] = value
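# Minimal usage sketch for _change_attrs, using a no-op thinc Model; the model
# and the attribute values are made up for illustration.
from thinc.api import Model

noop = Model("noop", lambda model, X, is_train: (X, lambda dY: dY))
with _change_attrs(noop, beam_width=8, beam_density=0.01):
    assert noop.attrs["beam_width"] == 8       # visible inside the block
assert "beam_width" not in noop.attrs          # removed again, since it was unset before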
def states2actions(states: List[StateClass]) -> List[Ints1d]:
cdef int step
cdef StateClass state
cdef StateC* c_state
actions = []
while True:
step = len(actions)
step_actions = []
for state in states:
c_state = state.c
if step < c_state.history.size():
step_actions.append(c_state.history[step])
# We are done if we have exhausted all histories.
if len(step_actions) == 0:
break
actions.append(numpy.array(step_actions, dtype="i"))
return actions
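# Worked example for states2actions (hypothetical histories): given two states
# whose action histories are [0, 2, 1] and [0, 3], the result is
#   [array([0, 0]), array([2, 3]), array([1])]
# i.e. one array of actions per time step, which is the format accepted by
# TransitionModelInputs(actions=...) in distill() and rehearse() above.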

View File

@ -1,6 +1,5 @@
from typing import Dict, List, Union, Optional, Any, Callable, Type, Tuple
from typing import Iterable, TypeVar, TYPE_CHECKING
from .compat import Literal
from typing import Iterable, TypeVar, Literal, TYPE_CHECKING
from enum import Enum
from pydantic import BaseModel, Field, ValidationError, validator, create_model
from pydantic import StrictStr, StrictInt, StrictFloat, StrictBool, ConstrainedStr
@ -163,15 +162,33 @@ class TokenPatternString(BaseModel):
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy1"
)
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy2"
)
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy3"
)
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy4"
)
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy5"
)
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy6"
)
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy7"
)
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy8"
)
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy9"
)
class Config:
extra = "forbid"
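# Illustrative use of the FUZZY / FUZZYn string predicates declared above in a
# Matcher token pattern; the example text and the edit-distance choice are made up.
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
# FUZZY1 allows at most one edit between the token's LOWER form and "definitely".
matcher.add("TYPO", [[{"LOWER": {"FUZZY1": "definitely"}}]])
doc = nlp("I definately agree.")
print(matcher(doc))  # expected: one match covering the misspelled token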
@ -405,6 +422,27 @@ class ConfigSchemaInit(BaseModel):
arbitrary_types_allowed = True
class ConfigSchemaDistillEmpty(BaseModel):
class Config:
extra = "forbid"
class ConfigSchemaDistill(BaseModel):
# fmt: off
batcher: Batcher = Field(..., title="Batcher for the training data")
corpus: StrictStr = Field(..., title="Path in the config to the distillation data")
dropout: StrictFloat = Field(..., title="Dropout rate")
max_epochs: StrictInt = Field(..., title="Maximum number of epochs to distill for")
max_steps: StrictInt = Field(..., title="Maximum number of steps to distill for")
optimizer: Optimizer = Field(..., title="The optimizer to use")
student_to_teacher: Dict[str, str] = Field(..., title="Mapping from student to teacher pipe")
# fmt: on
class Config:
extra = "forbid"
arbitrary_types_allowed = True
class ConfigSchema(BaseModel):
training: ConfigSchemaTraining
nlp: ConfigSchemaNlp
@ -412,6 +450,7 @@ class ConfigSchema(BaseModel):
components: Dict[str, Dict[str, Any]]
corpora: Dict[str, Reader]
initialize: ConfigSchemaInit
distillation: Union[ConfigSchemaDistill, ConfigSchemaDistillEmpty] = {} # type: ignore[assignment]
class Config:
extra = "allow"
@ -423,6 +462,7 @@ CONFIG_SCHEMAS = {
"training": ConfigSchemaTraining,
"pretraining": ConfigSchemaPretrain,
"initialize": ConfigSchemaInit,
"distill": ConfigSchemaDistill,
}

View File

@ -104,7 +104,7 @@ class Scorer:
def __init__(
self,
nlp: Optional["Language"] = None,
default_lang: str = "xx",
default_lang: str = "mul",
default_pipeline: Iterable[str] = DEFAULT_PIPELINE,
**cfg,
) -> None:

View File

@ -86,7 +86,7 @@ These are the main fixtures that are currently available:
| Fixture | Description |
| ----------------------------------- | ---------------------------------------------------------------------------- |
| `tokenizer` | Basic, language-independent tokenizer. Identical to the `xx` language class. |
| `tokenizer` | Basic, language-independent tokenizer. Identical to the `mul` language class. |
| `en_tokenizer`, `de_tokenizer`, ... | Creates an English, German etc. tokenizer. |
| `en_vocab` | Creates an instance of the English `Vocab`. |

View File

@ -83,7 +83,7 @@ def register_cython_tests(cython_mod_name: str, test_mod_name: str):
@pytest.fixture(scope="module")
def tokenizer():
return get_lang_class("xx")().tokenizer
return get_lang_class("mul")().tokenizer
@pytest.fixture(scope="session")
@ -243,8 +243,8 @@ def id_tokenizer():
@pytest.fixture(scope="session")
def is_tokenizer():
return get_lang_class("is")().tokenizer
def isl_tokenizer():
return get_lang_class("isl")().tokenizer
@pytest.fixture(scope="session")
@ -496,8 +496,8 @@ def vi_tokenizer():
@pytest.fixture(scope="session")
def xx_tokenizer():
return get_lang_class("xx")().tokenizer
def mul_tokenizer():
return get_lang_class("mul")().tokenizer
@pytest.fixture(scope="session")

View File

@ -9,7 +9,7 @@ from thinc.api import NumpyOps, get_current_ops
from spacy.attrs import DEP, ENT_IOB, ENT_TYPE, HEAD, IS_ALPHA, MORPH, POS
from spacy.attrs import SENT_START, TAG
from spacy.lang.en import English
from spacy.lang.xx import MultiLanguage
from spacy.lang.mul import MultiLanguage
from spacy.language import Language
from spacy.lexeme import Lexeme
from spacy.tokens import Doc, Span, SpanGroup, Token

View File

@ -1,7 +1,7 @@
import pytest
def test_long_text(is_tokenizer):
def test_long_text(isl_tokenizer):
# Excerpt: European Convention on Human Rights
text = """
hafa í huga, yfirlýsing þessi hefur það markmið tryggja
@ -15,12 +15,12 @@ réttlætis og friðar í heiminum og best er tryggt, annars vegar með
virku, lýðræðislegu stjórnarfari og, hins vegar, almennum skilningi
og varðveislu þeirra mannréttinda, sem eru grundvöllur frelsisins;
"""
tokens = is_tokenizer(text)
tokens = isl_tokenizer(text)
assert len(tokens) == 120
@pytest.mark.xfail
def test_ordinal_number(is_tokenizer):
def test_ordinal_number(isl_tokenizer):
text = "10. desember 1948"
tokens = is_tokenizer(text)
tokens = isl_tokenizer(text)
assert len(tokens) == 3

View File

@ -1,6 +1,6 @@
import pytest
IS_BASIC_TOKENIZATION_TESTS = [
ISL_BASIC_TOKENIZATION_TESTS = [
(
"Enginn maður skal sæta pyndingum eða ómannlegri eða "
"vanvirðandi meðferð eða refsingu. ",
@ -23,8 +23,8 @@ IS_BASIC_TOKENIZATION_TESTS = [
]
@pytest.mark.parametrize("text,expected_tokens", IS_BASIC_TOKENIZATION_TESTS)
def test_is_tokenizer_basic(is_tokenizer, text, expected_tokens):
tokens = is_tokenizer(text)
@pytest.mark.parametrize("text,expected_tokens", ISL_BASIC_TOKENIZATION_TESTS)
def test_isl_tokenizer_basic(isl_tokenizer, text, expected_tokens):
tokens = isl_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -1,7 +1,7 @@
import pytest
def test_long_text(xx_tokenizer):
def test_long_text(mul_tokenizer):
# Excerpt: Text in Skolt Sami taken from https://www.samediggi.fi
text = """
ʹmmla lie Euroopp unioon oʹdinakai alggmeer. ʹmmlai alggmeerstatus lij raʹvvjum Lääʹddjânnam vuâđđlääʹjjest.
@ -20,5 +20,5 @@ vuâđđlääʹjj meâldlaž jiõččvaaldâšm. Säʹmmlai jiõččvaldšma kuu
Sääʹmteʹǧǧ.
"""
tokens = xx_tokenizer(text)
tokens = mul_tokenizer(text)
assert len(tokens) == 179

View File

@ -1,6 +1,6 @@
import pytest
XX_BASIC_TOKENIZATION_TESTS = [
MUL_BASIC_TOKENIZATION_TESTS = [
(
"Lääʹddjânnmest lie nuʹtt 10 000 säʹmmliʹžžed. Seeʹst pâʹjjel",
[
@ -18,8 +18,8 @@ XX_BASIC_TOKENIZATION_TESTS = [
]
@pytest.mark.parametrize("text,expected_tokens", XX_BASIC_TOKENIZATION_TESTS)
def test_xx_tokenizer_basic(xx_tokenizer, text, expected_tokens):
tokens = xx_tokenizer(text)
@pytest.mark.parametrize("text,expected_tokens", MUL_BASIC_TOKENIZATION_TESTS)
def test_mul_tokenizer_basic(mul_tokenizer, text, expected_tokens):
tokens = mul_tokenizer(text)
token_list = [token.text for token in tokens if not token.is_space]
assert expected_tokens == token_list

View File

@ -7,10 +7,10 @@ from spacy.util import get_lang_class
# excluded: ja, ko, th, vi, zh
LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
"en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
"hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"hr", "hu", "hy", "id", "isl", "it", "kn", "ky", "lb", "lt", "lv",
"mk", "ml", "mr", "mul", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
"si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
"tr", "tt", "uk", "ur", "xx", "yo"]
"tr", "tt", "uk", "ur", "yo"]
# fmt: on

View File

@ -13,6 +13,7 @@ from spacy.pipeline._parser_internals.ner import BiluoPushDown
from spacy.training import Example, iob_to_biluo, split_bilu_label
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from thinc.api import fix_random_seed
import logging
from ..util import make_tempdir
@ -412,7 +413,7 @@ def test_train_empty():
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
ner = nlp.add_pipe("ner", last=True)
ner.add_label("PERSON")
nlp.initialize()
nlp.initialize(get_examples=lambda: train_examples)
for itn in range(2):
losses = {}
batches = util.minibatch(train_examples, size=8)
@ -539,11 +540,11 @@ def test_block_ner():
assert [token.ent_type_ for token in doc] == expected_types
@pytest.mark.parametrize("use_upper", [True, False])
def test_overfitting_IO(use_upper):
def test_overfitting_IO():
fix_random_seed(1)
# Simple test to try and quickly overfit the NER component
nlp = English()
ner = nlp.add_pipe("ner", config={"model": {"use_upper": use_upper}})
ner = nlp.add_pipe("ner", config={"model": {}})
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@ -575,7 +576,6 @@ def test_overfitting_IO(use_upper):
assert ents2[0].label_ == "LOC"
# Ensure that the predictions are still the same, even after adding a new label
ner2 = nlp2.get_pipe("ner")
assert ner2.model.attrs["has_upper"] == use_upper
ner2.add_label("RANDOM_NEW_LABEL")
doc3 = nlp2(test_text)
ents3 = doc3.ents
@ -617,6 +617,52 @@ def test_overfitting_IO(use_upper):
assert ents[1].kb_id == 0
def test_is_distillable():
nlp = English()
ner = nlp.add_pipe("ner")
assert ner.is_distillable
def test_distill():
teacher = English()
teacher_ner = teacher.add_pipe("ner")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
for ent in annotations.get("entities"):
teacher_ner.add_label(ent[2])
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.00001
student = English()
student_ner = student.add_pipe("ner")
student_ner.initialize(
get_examples=lambda: train_examples, labels=teacher_ner.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(100):
losses = {}
student_ner.distill(teacher_ner, distill_examples, sgd=optimizer, losses=losses)
assert losses["ner"] < 0.0001
# test the trained model
test_text = "I like London."
doc = student(test_text)
ents = doc.ents
assert len(ents) == 1
assert ents[0].text == "London"
assert ents[0].label_ == "LOC"
def test_beam_ner_scores():
# Test that we can get confidence values out of the beam_ner pipe
beam_width = 16

View File

@ -1,13 +1,17 @@
import itertools
import pytest
import numpy
from numpy.testing import assert_equal
from thinc.api import Adam
from spacy import registry, util
from spacy.attrs import DEP, NORM
from spacy.lang.en import English
from spacy.tokens import Doc
from spacy.training import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab
from spacy import util, registry
from thinc.api import fix_random_seed
from ...pipeline import DependencyParser
from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL
@ -59,6 +63,8 @@ PARTIAL_DATA = [
),
]
PARSERS = ["parser"] # TODO: Test beam_parser when ready
eps = 0.1
@ -171,6 +177,57 @@ def test_parser_parse_one_word_sentence(en_vocab, en_parser, words):
assert doc[0].dep != 0
def test_parser_apply_actions(en_vocab, en_parser):
words = ["I", "ate", "pizza"]
words2 = ["Eat", "more", "pizza", "!"]
doc1 = Doc(en_vocab, words=words)
doc2 = Doc(en_vocab, words=words2)
docs = [doc1, doc2]
moves = en_parser.moves
moves.add_action(0, "")
moves.add_action(1, "")
moves.add_action(2, "nsubj")
moves.add_action(3, "obj")
moves.add_action(2, "amod")
actions = [
numpy.array([0, 0], dtype="i"),
numpy.array([2, 0], dtype="i"),
numpy.array([0, 4], dtype="i"),
numpy.array([3, 3], dtype="i"),
numpy.array([1, 1], dtype="i"),
numpy.array([1, 1], dtype="i"),
numpy.array([0], dtype="i"),
numpy.array([1], dtype="i"),
]
states = moves.init_batch(docs)
active_states = states
for step_actions in actions:
active_states = moves.apply_actions(active_states, step_actions)
assert len(active_states) == 0
for (state, doc) in zip(states, docs):
moves.set_annotations(state, doc)
assert docs[0][0].head.i == 1
assert docs[0][0].dep_ == "nsubj"
assert docs[0][1].head.i == 1
assert docs[0][1].dep_ == "ROOT"
assert docs[0][2].head.i == 1
assert docs[0][2].dep_ == "obj"
assert docs[1][0].head.i == 0
assert docs[1][0].dep_ == "ROOT"
assert docs[1][1].head.i == 2
assert docs[1][1].dep_ == "amod"
assert docs[1][2].head.i == 0
assert docs[1][2].dep_ == "obj"
@pytest.mark.skip(
reason="The step_through API was removed (but should be brought back)"
)
@ -319,7 +376,7 @@ def test_parser_constructor(en_vocab):
DependencyParser(en_vocab, model)
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize("pipe_name", PARSERS)
def test_incomplete_data(pipe_name):
# Test that the parser works with incomplete information
nlp = English()
@ -345,11 +402,15 @@ def test_incomplete_data(pipe_name):
assert doc[2].head.i == 1
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
def test_overfitting_IO(pipe_name):
@pytest.mark.parametrize(
"pipe_name,max_moves", itertools.product(PARSERS, [0, 1, 5, 100])
)
def test_overfitting_IO(pipe_name, max_moves):
fix_random_seed(0)
# Simple test to try and quickly overfit the dependency parser (normal or beam)
nlp = English()
parser = nlp.add_pipe(pipe_name)
parser.cfg["update_with_oracle_cut_size"] = max_moves
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
@ -396,16 +457,67 @@ def test_overfitting_IO(pipe_name):
assert_equal(batch_deps_1, no_batch_deps)
def test_is_distillable():
nlp = English()
parser = nlp.add_pipe("parser")
assert parser.is_distillable
def test_distill():
teacher = English()
teacher_parser = teacher.add_pipe("parser")
train_examples = []
for text, annotations in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(text), annotations))
for dep in annotations.get("deps", []):
teacher_parser.add_label(dep)
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(200):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["parser"] < 0.0001
student = English()
student_parser = student.add_pipe("parser")
student_parser.initialize(
get_examples=lambda: train_examples, labels=teacher_parser.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(200):
losses = {}
student_parser.distill(
teacher_parser, distill_examples, sgd=optimizer, losses=losses
)
assert losses["parser"] < 0.0001
test_text = "I like securities."
doc = student(test_text)
assert doc[0].dep_ == "nsubj"
assert doc[2].dep_ == "dobj"
assert doc[3].dep_ == "punct"
assert doc[0].head.i == 1
assert doc[2].head.i == 1
assert doc[3].head.i == 1
# fmt: off
@pytest.mark.slow
@pytest.mark.parametrize("pipe_name", ["parser", "beam_parser"])
@pytest.mark.parametrize(
"parser_config",
[
# TransitionBasedParser V1
({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
# TransitionBasedParser V2
# TODO: re-enable after we have a spacy-legacy release for v4. See
# https://github.com/explosion/spacy-legacy/pull/36
#({"@architectures": "spacy.TransitionBasedParser.v1", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": True}),
({"@architectures": "spacy.TransitionBasedParser.v2", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2, "use_upper": False}),
({"@architectures": "spacy.TransitionBasedParser.v3", "tok2vec": DEFAULT_TOK2VEC_MODEL, "state_type": "parser", "extra_state_tokens": False, "hidden_width": 64, "maxout_pieces": 2}),
],
)
# fmt: on

View File

@ -103,14 +103,15 @@ def test_initialize_from_labels():
}
def test_no_data():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_no_data(top_k):
# Test that the lemmatizer provides a nice error when there's no lemma data / labels
TEXTCAT_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
nlp = English()
nlp.add_pipe("trainable_lemmatizer")
nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp.add_pipe("textcat")
train_examples = []
@ -121,10 +122,11 @@ def test_no_data():
nlp.initialize(get_examples=lambda: train_examples)
def test_incomplete_data():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_incomplete_data(top_k):
# Test that the lemmatizer works with incomplete information
nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in PARTIAL_DATA:
@ -141,10 +143,25 @@ def test_incomplete_data():
assert doc[1].lemma_ == "like"
assert doc[2].lemma_ == "blue"
# Check that incomplete annotations are ignored.
scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
_, dX = lemmatizer.get_loss(train_examples, scores)
xp = lemmatizer.model.ops.xp
def test_overfitting_IO():
# Missing annotations.
assert xp.count_nonzero(dX[0][0]) == 0
assert xp.count_nonzero(dX[0][3]) == 0
assert xp.count_nonzero(dX[1][0]) == 0
assert xp.count_nonzero(dX[1][3]) == 0
# Misaligned annotations.
assert xp.count_nonzero(dX[1][1]) == 0
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_overfitting_IO(top_k):
nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in TRAIN_DATA:
@ -177,7 +194,7 @@ def test_overfitting_IO():
# Check model after a {to,from}_bytes roundtrip
nlp_bytes = nlp.to_bytes()
nlp3 = English()
nlp3.add_pipe("trainable_lemmatizer")
nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp3.from_bytes(nlp_bytes)
doc3 = nlp3(test_text)
assert doc3[0].lemma_ == "she"
@ -195,6 +212,53 @@ def test_overfitting_IO():
assert doc4[3].lemma_ == "egg"
def test_is_distillable():
nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
assert lemmatizer.is_distillable
def test_distill():
teacher = English()
teacher_lemmatizer = teacher.add_pipe("trainable_lemmatizer")
teacher_lemmatizer.min_tree_freq = 1
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["trainable_lemmatizer"] < 0.00001
student = English()
student_lemmatizer = student.add_pipe("trainable_lemmatizer")
student_lemmatizer.min_tree_freq = 1
student_lemmatizer.initialize(
get_examples=lambda: train_examples, labels=teacher_lemmatizer.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(50):
losses = {}
student_lemmatizer.distill(
teacher_lemmatizer, distill_examples, sgd=optimizer, losses=losses
)
assert losses["trainable_lemmatizer"] < 0.00001
test_text = "She likes blue eggs"
doc = student(test_text)
assert doc[0].lemma_ == "she"
assert doc[1].lemma_ == "like"
assert doc[2].lemma_ == "blue"
assert doc[3].lemma_ == "egg"
def test_lemmatizer_requires_labels():
nlp = English()
nlp.add_pipe("trainable_lemmatizer")

View File

@ -12,7 +12,6 @@ from spacy.lang.en import English
from spacy.ml import load_kb
from spacy.ml.models.entity_linker import build_span_maker
from spacy.pipeline import EntityLinker, TrainablePipe
from spacy.pipeline.legacy import EntityLinker_v1
from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL
from spacy.scorer import Scorer
from spacy.tests.util import make_tempdir
@ -997,6 +996,8 @@ def test_scorer_links():
)
# fmt: on
def test_legacy_architectures(name, config):
from spacy_legacy.components.entity_linker import EntityLinker_v1
# Ensure that the legacy architectures still work
vector_length = 3
nlp = English()

View File

@ -50,6 +50,12 @@ def test_implicit_label():
nlp.initialize(get_examples=lambda: train_examples)
def test_is_distillable():
nlp = English()
morphologizer = nlp.add_pipe("morphologizer")
assert morphologizer.is_distillable
def test_no_resize():
nlp = Language()
morphologizer = nlp.add_pipe("morphologizer")

View File

@ -11,6 +11,12 @@ from spacy.pipeline import TrainablePipe
from spacy.tests.util import make_tempdir
def test_is_distillable():
nlp = English()
senter = nlp.add_pipe("senter")
assert senter.is_distillable
def test_label_types():
nlp = Language()
senter = nlp.add_pipe("senter")

View File

@ -47,7 +47,7 @@ def person_org_date_patterns(person_org_patterns):
def test_span_ruler_add_empty(patterns):
"""Test that patterns don't get added excessively."""
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"validate": True})
ruler.add_patterns(patterns)
pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
@ -58,7 +58,7 @@ def test_span_ruler_add_empty(patterns):
def test_span_ruler_init(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler) == len(patterns)
@ -74,7 +74,7 @@ def test_span_ruler_init(patterns):
def test_span_ruler_no_patterns_warns():
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
assert len(ruler) == 0
assert len(ruler.labels) == 0
@ -86,7 +86,7 @@ def test_span_ruler_no_patterns_warns():
def test_span_ruler_init_patterns(patterns):
# initialize with patterns
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
assert len(ruler.labels) == 0
ruler.initialize(lambda: [], patterns=patterns)
@ -110,7 +110,7 @@ def test_span_ruler_init_patterns(patterns):
def test_span_ruler_init_clear(patterns):
"""Test that initialization clears patterns."""
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler.labels) == 4
@ -119,7 +119,7 @@ def test_span_ruler_init_clear(patterns):
def test_span_ruler_clear(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler.labels) == 4
@ -133,7 +133,7 @@ def test_span_ruler_clear(patterns):
def test_span_ruler_existing(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"overwrite": False})
ruler.add_patterns(patterns)
doc = nlp.make_doc("OH HELLO WORLD bye bye")
@ -148,7 +148,7 @@ def test_span_ruler_existing(patterns):
def test_span_ruler_existing_overwrite(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
ruler.add_patterns(patterns)
doc = nlp.make_doc("OH HELLO WORLD bye bye")
@ -161,13 +161,13 @@ def test_span_ruler_existing_overwrite(patterns):
def test_span_ruler_serialize_bytes(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
assert len(ruler) == len(patterns)
assert len(ruler.labels) == 4
ruler_bytes = ruler.to_bytes()
new_nlp = spacy.blank("xx")
new_nlp = spacy.blank("mul")
new_ruler = new_nlp.add_pipe("span_ruler")
assert len(new_ruler) == 0
assert len(new_ruler.labels) == 0
@ -181,7 +181,7 @@ def test_span_ruler_serialize_bytes(patterns):
def test_span_ruler_validate():
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
validated_ruler = nlp.add_pipe(
"span_ruler", name="validated_span_ruler", config={"validate": True}
@ -203,14 +203,14 @@ def test_span_ruler_validate():
def test_span_ruler_properties(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"overwrite": True})
ruler.add_patterns(patterns)
assert sorted(ruler.labels) == sorted(set([p["label"] for p in patterns]))
def test_span_ruler_overlapping_spans(overlapping_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(overlapping_patterns)
doc = ruler(nlp.make_doc("foo bar baz"))
@ -220,7 +220,7 @@ def test_span_ruler_overlapping_spans(overlapping_patterns):
def test_span_ruler_scorer(overlapping_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(overlapping_patterns)
text = "foo bar baz"
@ -243,7 +243,7 @@ def test_span_ruler_multiprocessing(n_process):
patterns = [{"label": "FASTFOOD", "pattern": "Pizza Hut"}]
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
@ -253,7 +253,7 @@ def test_span_ruler_multiprocessing(n_process):
def test_span_ruler_serialize_dir(patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(patterns)
with make_tempdir() as d:
@ -264,7 +264,7 @@ def test_span_ruler_serialize_dir(patterns):
def test_span_ruler_remove_basic(person_org_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_patterns)
doc = ruler(nlp.make_doc("Dina went to school"))
@ -279,7 +279,7 @@ def test_span_ruler_remove_basic(person_org_patterns):
def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_patterns)
assert len(ruler.patterns) == 3
@ -290,7 +290,7 @@ def test_span_ruler_remove_nonexisting_pattern(person_org_patterns):
def test_span_ruler_remove_several_patterns(person_org_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_patterns)
doc = ruler(nlp.make_doc("Dina founded the company ACME."))
@ -314,7 +314,7 @@ def test_span_ruler_remove_several_patterns(person_org_patterns):
def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_date_patterns)
doc = ruler(nlp.make_doc("Dina founded the company ACME on June 14th"))
@ -332,7 +332,7 @@ def test_span_ruler_remove_patterns_in_a_row(person_org_date_patterns):
def test_span_ruler_remove_all_patterns(person_org_date_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
ruler.add_patterns(person_org_date_patterns)
assert len(ruler.patterns) == 4
@ -348,7 +348,7 @@ def test_span_ruler_remove_all_patterns(person_org_date_patterns):
def test_span_ruler_remove_and_add():
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler")
patterns1 = [{"label": "DATE1", "pattern": "last time"}]
ruler.add_patterns(patterns1)
@ -404,7 +404,7 @@ def test_span_ruler_remove_and_add():
def test_span_ruler_spans_filter(overlapping_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe(
"span_ruler",
config={"spans_filter": {"@misc": "spacy.first_longest_spans_filter.v1"}},
@ -416,7 +416,7 @@ def test_span_ruler_spans_filter(overlapping_patterns):
def test_span_ruler_ents_default_filter(overlapping_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe("span_ruler", config={"annotate_ents": True})
ruler.add_patterns(overlapping_patterns)
doc = ruler(nlp.make_doc("foo bar baz"))
@ -425,7 +425,7 @@ def test_span_ruler_ents_default_filter(overlapping_patterns):
def test_span_ruler_ents_overwrite_filter(overlapping_patterns):
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe(
"span_ruler",
config={
@ -452,7 +452,7 @@ def test_span_ruler_ents_bad_filter(overlapping_patterns):
return pass_through_filter
nlp = spacy.blank("xx")
nlp = spacy.blank("mul")
ruler = nlp.add_pipe(
"span_ruler",
config={

View File

@ -24,7 +24,9 @@ def test_issue4348():
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
batches = util.minibatch(
TRAIN_DATA, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)
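This batching change recurs in several test files below: the `compounding(...)` schedule is no longer passed directly to `minibatch` but is first converted with `.to_generator()`. A condensed sketch of the updated idiom, assuming `train_examples`, `nlp`, `optimizer`, and `losses` already exist:

from spacy import util
from thinc.api import compounding

# Convert the schedule into a plain generator of batch sizes before batching.
batches = util.minibatch(
    train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
    nlp.update(batch, sgd=optimizer, losses=losses)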
@ -213,6 +215,52 @@ def test_overfitting_IO():
assert doc3[0].tag_ != "N"
def test_is_distillable():
nlp = English()
tagger = nlp.add_pipe("tagger")
assert tagger.is_distillable
def test_distill():
teacher = English()
teacher_tagger = teacher.add_pipe("tagger")
train_examples = []
for t in TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses["tagger"] < 0.00001
student = English()
student_tagger = student.add_pipe("tagger")
student_tagger.min_tree_freq = 1
student_tagger.initialize(
get_examples=lambda: train_examples, labels=teacher_tagger.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TRAIN_DATA
]
for i in range(50):
losses = {}
student_tagger.distill(
teacher_tagger, distill_examples, sgd=optimizer, losses=losses
)
assert losses["tagger"] < 0.00001
test_text = "I like blue eggs"
doc = student(test_text)
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
def test_save_activations():
# Test if activations are correctly added to Doc when requested.
nlp = English()

View File

@ -91,7 +91,9 @@ def test_issue3611():
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
batches = util.minibatch(
train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@ -128,7 +130,9 @@ def test_issue4030():
optimizer = nlp.initialize()
for i in range(3):
losses = {}
batches = util.minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
batches = util.minibatch(
train_data, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)
@ -565,6 +569,12 @@ def test_initialize_examples(name, get_examples, train_data):
nlp.initialize(get_examples=get_examples())
def test_is_distillable():
nlp = English()
textcat = nlp.add_pipe("textcat")
assert not textcat.is_distillable
def test_overfitting_IO():
# Simple test to try and quickly overfit the single-label textcat component - ensuring the ML models work correctly
fix_random_seed(0)

View File

@ -382,7 +382,7 @@ cfg_string_multi = """
factory = "ner"
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
[components.ner.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"

View File

@ -6,10 +6,11 @@ import spacy
from spacy.lang.de import German
from spacy.lang.en import English
from spacy.language import DEFAULT_CONFIG, DEFAULT_CONFIG_PRETRAIN_PATH
from spacy.language import DEFAULT_CONFIG_DISTILL_PATH
from spacy.language import Language
from spacy.ml.models import MaxoutWindowEncoder, MultiHashEmbed
from spacy.ml.models import build_tb_parser_model, build_Tok2Vec_model
from spacy.schemas import ConfigSchema, ConfigSchemaPretrain
from spacy.schemas import ConfigSchema, ConfigSchemaDistill, ConfigSchemaPretrain
from spacy.util import load_config, load_config_from_str
from spacy.util import load_model_from_config, registry
@ -66,6 +67,60 @@ factory = "tagger"
width = ${components.tok2vec.model.width}
"""
distill_config_string = """
[paths]
train = null
dev = null
[corpora]
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
[training]
[training.batcher]
@batchers = "spacy.batch_by_words.v1"
size = 666
[nlp]
lang = "en"
pipeline = ["tok2vec", "tagger"]
[components]
[components.tok2vec]
factory = "tok2vec"
[components.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 342
depth = 4
window_size = 1
embed_size = 2000
maxout_pieces = 3
subword_features = true
[components.tagger]
factory = "tagger"
[components.tagger.model]
@architectures = "spacy.Tagger.v2"
[components.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecListener.v1"
width = ${components.tok2vec.model.width}
[distill]
"""
pretrain_config_string = """
[paths]
train = null
@ -122,33 +177,11 @@ width = ${components.tok2vec.model.width}
parser_config_string_upper = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
@architectures = "spacy.TransitionBasedParser.v3"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = true
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
pretrained_vectors = null
width = 333
depth = 4
embed_size = 5555
window_size = 1
maxout_pieces = 7
subword_features = false
"""
parser_config_string_no_upper = """
[model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 66
maxout_pieces = 2
use_upper = false
[model.tok2vec]
@architectures = "spacy.HashEmbedCNN.v1"
@ -179,7 +212,6 @@ def my_parser():
extra_state_tokens=True,
hidden_width=65,
maxout_pieces=5,
use_upper=True,
)
return parser
@ -224,6 +256,14 @@ def test_create_nlp_from_config():
load_model_from_config(Config(bad_cfg), auto_fill=True)
def test_nlp_from_distillation_config():
"""Test that the default distillation config validates properly"""
config = Config().from_str(distill_config_string)
distill_config = load_config(DEFAULT_CONFIG_DISTILL_PATH)
filled = config.merge(distill_config)
registry.resolve(filled["distillation"], schema=ConfigSchemaDistill)
def test_create_nlp_from_pretraining_config():
"""Test that the default pretraining config validates properly"""
config = Config().from_str(pretrain_config_string)
@ -285,15 +325,16 @@ def test_serialize_custom_nlp():
nlp.to_disk(d)
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
model.get_ref("tok2vec")
# check that we have the correct settings, not the default ones
assert model.get_ref("upper").get_dim("nI") == 65
assert model.get_ref("lower").get_dim("nI") == 65
assert model.get_ref("tok2vec") is not None
assert model.has_param("hidden_W")
assert model.has_param("hidden_b")
output = model.get_ref("output")
assert output is not None
assert output.has_param("W")
assert output.has_param("b")
@pytest.mark.parametrize(
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
)
@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
def test_serialize_parser(parser_config_string):
"""Create a non-default parser config to check nlp serializes it correctly"""
nlp = English()
@ -306,11 +347,13 @@ def test_serialize_parser(parser_config_string):
nlp.to_disk(d)
nlp2 = spacy.load(d)
model = nlp2.get_pipe("parser").model
model.get_ref("tok2vec")
# check that we have the correct settings, not the default ones
if model.attrs["has_upper"]:
assert model.get_ref("upper").get_dim("nI") == 66
assert model.get_ref("lower").get_dim("nI") == 66
assert model.get_ref("tok2vec") is not None
assert model.has_param("hidden_W")
assert model.has_param("hidden_b")
output = model.get_ref("output")
assert output is not None
assert output.has_param("b")
assert output.has_param("W")
def test_config_nlp_roundtrip():
@ -457,9 +500,7 @@ def test_config_auto_fill_extra_fields():
load_model_from_config(nlp.config)
@pytest.mark.parametrize(
"parser_config_string", [parser_config_string_upper, parser_config_string_no_upper]
)
@pytest.mark.parametrize("parser_config_string", [parser_config_string_upper])
def test_config_validate_literal(parser_config_string):
nlp = English()
config = Config().from_str(parser_config_string)

View File

@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -629,7 +628,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -1076,7 +1074,7 @@ def test_cli_find_threshold(capsys):
)
with make_tempdir() as nlp_dir:
nlp.to_disk(nlp_dir)
res = find_threshold(
best_threshold, best_score, res = find_threshold(
model=nlp_dir,
data_path=docs_dir / "docs.spacy",
pipe_name="tc_multi",
@ -1084,10 +1082,10 @@ def test_cli_find_threshold(capsys):
scores_key="cats_macro_f",
silent=True,
)
assert res[0] != thresholds[0]
assert thresholds[0] < res[0] < thresholds[9]
assert res[1] == 1.0
assert res[2][1.0] == 0.0
assert best_threshold != thresholds[0]
assert thresholds[0] < best_threshold < thresholds[9]
assert best_score == max(res.values())
assert res[1.0] == 0.0
# Test with spancat.
nlp, _ = init_nlp((("spancat", {}),))
@ -1209,3 +1207,69 @@ def test_walk_directory():
assert (len(walk_directory(d, suffix="iob"))) == 2
assert (len(walk_directory(d, suffix="conll"))) == 3
assert (len(walk_directory(d, suffix="pdf"))) == 0
def test_debug_data_trainable_lemmatizer_basic():
examples = [
("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
]
nlp = Language()
train_examples = []
for t in examples:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
# ref test_edit_tree_lemmatizer::test_initialize_from_labels
# this results in 4 trees
assert len(data["lemmatizer_trees"]) == 4
def test_debug_data_trainable_lemmatizer_partial():
partial_examples = [
# partial annotation
("She likes green eggs", {"lemmas": ["", "like", "green", ""]}),
# misaligned partial annotation
(
"He hates green eggs",
{
"words": ["He", "hat", "es", "green", "eggs"],
"lemmas": ["", "hat", "e", "green", ""],
},
),
]
nlp = Language()
train_examples = []
for t in partial_examples:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
assert data["partial_lemma_annotations"] == 2
def test_debug_data_trainable_lemmatizer_low_cardinality():
low_cardinality_examples = [
("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
]
nlp = Language()
train_examples = []
for t in low_cardinality_examples:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
assert data["n_low_cardinality_lemmas"] == 2
def test_debug_data_trainable_lemmatizer_not_annotated():
unannotated_examples = [
("She likes green eggs", {}),
("Eat blue ham", {}),
]
nlp = Language()
train_examples = []
for t in unannotated_examples:
train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
assert data["no_lemma_annotations"] == 2

View File

@ -3,6 +3,7 @@ from pathlib import Path
import pytest
import subprocess
from typer.testing import CliRunner
from spacy.tokens import DocBin, Doc
import spacy
from spacy.cli._util import app
@ -240,3 +241,60 @@ def test_multi_code_evaluate(code_paths, data_paths, noop_config):
# check that it succeeds with the code arg
result = subprocess.run([*cmd, *code_paths])
assert result.returncode == 0
def test_benchmark_accuracy_alias():
# Verify that the `evaluate` alias works correctly.
result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
assert result_benchmark.stdout == result_evaluate.stdout.replace(
"spacy evaluate", "spacy benchmark accuracy"
)
def test_debug_data_trainable_lemmatizer_cli(en_vocab):
train_docs = [
Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
Doc(
en_vocab,
words=["Dogs", "are", "great", "too"],
lemmas=["dog", "be", "great", "too"],
),
]
dev_docs = [
Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
]
with make_tempdir() as d_in:
train_bin = DocBin(docs=train_docs)
train_bin.to_disk(d_in / "train.spacy")
dev_bin = DocBin(docs=dev_docs)
dev_bin.to_disk(d_in / "dev.spacy")
# `debug data` requires an input pipeline config
CliRunner().invoke(
app,
[
"init",
"config",
f"{d_in}/config.cfg",
"--lang",
"en",
"--pipeline",
"trainable_lemmatizer",
],
)
result_debug_data = CliRunner().invoke(
app,
[
"debug",
"data",
f"{d_in}/config.cfg",
"--paths.train",
f"{d_in}/train.spacy",
"--paths.dev",
f"{d_in}/dev.spacy",
],
)
# Instead of checking specific wording of the output, which may change,
# we'll check that this section of the debug output is present.
assert "= Trainable Lemmatizer =" in result_debug_data.stdout

View File

@ -26,6 +26,12 @@ except ImportError:
pass
TAGGER_TRAIN_DATA = [
("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
("Eat blue ham", {"tags": ["V", "J", "N"]}),
]
def evil_component(doc):
if "2" in doc.text:
raise ValueError("no dice")
@ -658,11 +664,12 @@ def test_spacy_blank():
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("is", "isl"),
("mo", "ro"),
("mul", "xx"),
("mul", "mul"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("xx", "mul"),
("zh-Hans", "zh"),
("zh-Hant", None),
("zxx", None),
@ -683,11 +690,11 @@ def test_language_matching(lang, target):
("fra", "fr"),
("fre", "fr"),
("iw", "he"),
("is", "isl"),
("mo", "ro"),
("mul", "xx"),
("xx", "mul"),
("no", "nb"),
("pt-BR", "pt"),
("xx", "xx"),
("zh-Hans", "zh"),
],
)
@ -799,3 +806,66 @@ def test_component_return():
nlp.add_pipe("test_component_bad_pipe")
with pytest.raises(ValueError, match="instead of a Doc"):
nlp("text")
@pytest.mark.slow
@pytest.mark.parametrize("teacher_tagger_name", ["tagger", "teacher_tagger"])
def test_distill(teacher_tagger_name):
teacher = English()
teacher_tagger = teacher.add_pipe("tagger", name=teacher_tagger_name)
train_examples = []
for t in TAGGER_TRAIN_DATA:
train_examples.append(Example.from_dict(teacher.make_doc(t[0]), t[1]))
optimizer = teacher.initialize(get_examples=lambda: train_examples)
for i in range(50):
losses = {}
teacher.update(train_examples, sgd=optimizer, losses=losses)
assert losses[teacher_tagger_name] < 0.00001
student = English()
student_tagger = student.add_pipe("tagger")
student_tagger.min_tree_freq = 1
student_tagger.initialize(
get_examples=lambda: train_examples, labels=teacher_tagger.label_data
)
distill_examples = [
Example.from_dict(teacher.make_doc(t[0]), {}) for t in TAGGER_TRAIN_DATA
]
student_to_teacher = (
None
if teacher_tagger.name == student_tagger.name
else {student_tagger.name: teacher_tagger.name}
)
for i in range(50):
losses = {}
student.distill(
teacher,
distill_examples,
sgd=optimizer,
losses=losses,
student_to_teacher=student_to_teacher,
)
assert losses["tagger"] < 0.00001
test_text = "I like blue eggs"
doc = student(test_text)
assert doc[0].tag_ == "N"
assert doc[1].tag_ == "V"
assert doc[2].tag_ == "J"
assert doc[3].tag_ == "N"
# Do an extra update to check that `annotates` works, though we can't really
# validate the results, since the annotations are ephemeral.
student.distill(
teacher,
distill_examples,
sgd=optimizer,
losses=losses,
student_to_teacher=student_to_teacher,
annotates=["tagger"],
)
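Condensed from the test above, a single pipeline-level distillation step looks roughly like this; it assumes `teacher` is a trained pipeline, `student` has been initialized with the teacher's labels, and `distill_examples` holds Example objects with empty references. The `student_to_teacher` mapping is only needed when component names differ between the two pipelines:

losses = {}
student.distill(
    teacher,                  # trained teacher Language object
    distill_examples,         # Example objects whose references carry no annotations
    sgd=optimizer,
    losses=losses,
    student_to_teacher={"tagger": "teacher_tagger"},  # student name -> teacher name
)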

View File

@ -5,10 +5,8 @@ from pathlib import Path
from spacy.about import __version__ as spacy_version
from spacy import util
from spacy import prefer_gpu, require_gpu, require_cpu
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
from spacy.util import dot_to_object, SimpleFrozenList, import_file
from spacy.util import to_ternary_int, find_available_port
from spacy.util import dot_to_object, SimpleFrozenList, import_file, to_ternary_int
from spacy.util import find_available_port
from thinc.api import Config, Optimizer, ConfigValidationError
from thinc.api import get_current_ops, set_current_ops, NumpyOps, CupyOps, MPSOps
from thinc.compat import has_cupy_gpu, has_torch_mps_gpu
@ -81,34 +79,6 @@ def test_util_get_package_path(package):
assert isinstance(path, Path)
def test_PrecomputableAffine(nO=4, nI=5, nF=3, nP=2):
model = PrecomputableAffine(nO=nO, nI=nI, nF=nF, nP=nP).initialize()
assert model.get_param("W").shape == (nF, nO, nP, nI)
tensor = model.ops.alloc((10, nI))
Y, get_dX = model.begin_update(tensor)
assert Y.shape == (tensor.shape[0] + 1, nF, nO, nP)
dY = model.ops.alloc((15, nO, nP))
ids = model.ops.alloc((15, nF))
ids[1, 2] = -1
dY[1] = 1
assert not model.has_grad("pad")
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 2, 0, 0] == 1.0
ids.fill(0.0)
dY.fill(0.0)
dY[0] = 0
ids[1, 2] = 0
ids[1, 1] = -1
ids[1, 0] = -1
dY[1] = 1
ids[2, 0] = -1
dY[2] = 5
d_pad = _backprop_precomputable_affine_padding(model, dY, ids)
assert d_pad[0, 0, 0, 0] == 6
assert d_pad[0, 1, 0, 0] == 1
assert d_pad[0, 2, 0, 0] == 0
def test_prefer_gpu():
current_ops = get_current_ops()
if has_cupy_gpu:

View File

@ -36,6 +36,7 @@ LANGUAGES = [
"hu",
pytest.param("id", marks=pytest.mark.slow()),
pytest.param("it", marks=pytest.mark.slow()),
pytest.param("isl", marks=pytest.mark.slow()),
pytest.param("kn", marks=pytest.mark.slow()),
pytest.param("lb", marks=pytest.mark.slow()),
pytest.param("lt", marks=pytest.mark.slow()),

View File

@ -0,0 +1,78 @@
from typing import IO, Generator, Iterable, List, TextIO, Tuple
from contextlib import contextmanager
from pathlib import Path
import pytest
import tempfile
from spacy.lang.en import English
from spacy.training import Example, PlainTextCorpus
from spacy.util import make_tempdir
# Intentional newlines to check that they are skipped.
PLAIN_TEXT_DOC = """
This is a doc. It contains two sentences.
This is another doc.
A third doc.
"""
PLAIN_TEXT_DOC_TOKENIZED = [
[
"This",
"is",
"a",
"doc",
".",
"It",
"contains",
"two",
"sentences",
".",
],
["This", "is", "another", "doc", "."],
["A", "third", "doc", "."],
]
@pytest.mark.parametrize("min_length", [0, 5])
@pytest.mark.parametrize("max_length", [0, 5])
def test_plain_text_reader(min_length, max_length):
nlp = English()
with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path:
corpus = PlainTextCorpus(
file_path, min_length=min_length, max_length=max_length
)
check = [
doc
for doc in PLAIN_TEXT_DOC_TOKENIZED
if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
]
reference, predicted = _examples_to_tokens(corpus(nlp))
assert reference == check
assert predicted == check
@contextmanager
def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
with make_tempdir() as d:
file_path = Path(d) / "string.txt"
with open(file_path, "w", encoding="utf-8") as f:
f.write(s)
yield file_path
def _examples_to_tokens(
examples: Iterable[Example],
) -> Tuple[List[List[str]], List[List[str]]]:
reference = []
predicted = []
for eg in examples:
reference.append([t.text for t in eg.reference])
predicted.append([t.text for t in eg.predicted])
return reference, predicted
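For reference, the reader exercised above can also be used directly outside the test; a minimal sketch, with a hypothetical file name:

from spacy.lang.en import English
from spacy.training import PlainTextCorpus

nlp = English()
corpus = PlainTextCorpus("texts.txt", min_length=0, max_length=0)
for example in corpus(nlp):
    # Each non-empty line of the file becomes one Example whose reference and
    # predicted docs are tokenized with the pipeline's tokenizer.
    print([token.text for token in example.reference])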

View File

@ -8,7 +8,7 @@ from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.training import Alignment, Corpus, Example, biluo_tags_to_offsets
from spacy.training import biluo_tags_to_spans, docs_to_json, iob_to_biluo
from spacy.training import offsets_to_biluo_tags
from spacy.training import offsets_to_biluo_tags, validate_distillation_examples
from spacy.training.alignment_array import AlignmentArray
from spacy.training.align import get_alignments
from spacy.training.converters import json_to_docs
@ -365,6 +365,19 @@ def test_example_from_dict_some_ner(en_vocab):
assert ner_tags == ["U-LOC", None, None, None]
def test_validate_distillation_examples(en_vocab):
words = ["a", "b", "c", "d"]
spaces = [True, True, False, True]
predicted = Doc(en_vocab, words=words, spaces=spaces)
example = Example.from_dict(predicted, {})
validate_distillation_examples([example], "test_validate_distillation_examples")
example = Example.from_dict(predicted, {"words": words + ["e"]})
with pytest.raises(ValueError, match=r"distillation"):
validate_distillation_examples([example], "test_validate_distillation_examples")
@pytest.mark.filterwarnings("ignore::UserWarning")
def test_json_to_docs_no_ner(en_vocab):
data = [
@ -905,7 +918,9 @@ def _train_tuples(train_data):
optimizer = nlp.initialize()
for i in range(5):
losses = {}
batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
batches = minibatch(
train_examples, size=compounding(4.0, 32.0, 1.001).to_generator()
)
for batch in batches:
nlp.update(batch, sgd=optimizer, losses=losses)

View File

@ -37,7 +37,7 @@ cdef class Tokenizer:
bint with_special_cases) except -1
cdef int _tokenize(self, Doc tokens, str span, hash_t key,
int* has_special, bint with_special_cases) except -1
cdef str _split_affixes(self, Pool mem, str string,
cdef str _split_affixes(self, str string,
vector[LexemeC*] *prefixes,
vector[LexemeC*] *suffixes, int* has_special,
bint with_special_cases)

View File

@ -389,14 +389,14 @@ cdef class Tokenizer:
cdef vector[LexemeC*] suffixes
cdef int orig_size
orig_size = tokens.length
span = self._split_affixes(tokens.mem, span, &prefixes, &suffixes,
span = self._split_affixes(span, &prefixes, &suffixes,
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
cdef str _split_affixes(self, Pool mem, str string,
cdef str _split_affixes(self, str string,
vector[const LexemeC*] *prefixes,
vector[const LexemeC*] *suffixes,
int* has_special,
@ -419,7 +419,7 @@ cdef class Tokenizer:
minus_pre = string[pre_len:]
if minus_pre and with_special_cases and self._specials.get(hash_string(minus_pre)) != NULL:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
prefixes.push_back(self.vocab.get(prefix))
break
suf_len = self.find_suffix(string[pre_len:])
if suf_len != 0:
@ -427,18 +427,18 @@ cdef class Tokenizer:
minus_suf = string[:-suf_len]
if minus_suf and with_special_cases and self._specials.get(hash_string(minus_suf)) != NULL:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
suffixes.push_back(self.vocab.get(suffix))
break
if pre_len and suf_len and (pre_len + suf_len) <= len(string):
string = string[pre_len:-suf_len]
prefixes.push_back(self.vocab.get(mem, prefix))
suffixes.push_back(self.vocab.get(mem, suffix))
prefixes.push_back(self.vocab.get(prefix))
suffixes.push_back(self.vocab.get(suffix))
elif pre_len:
string = minus_pre
prefixes.push_back(self.vocab.get(mem, prefix))
prefixes.push_back(self.vocab.get(prefix))
elif suf_len:
string = minus_suf
suffixes.push_back(self.vocab.get(mem, suffix))
suffixes.push_back(self.vocab.get(suffix))
return string
cdef int _attach_tokens(self, Doc tokens, str string,
@ -465,11 +465,11 @@ cdef class Tokenizer:
# We're always saying 'no' to spaces here -- the caller will
# fix up the outermost one, with reference to the original.
# See Issue #859
tokens.push_back(self.vocab.get(tokens.mem, string), False)
tokens.push_back(self.vocab.get(string), False)
else:
matches = self.find_infix(string)
if not matches:
tokens.push_back(self.vocab.get(tokens.mem, string), False)
tokens.push_back(self.vocab.get(string), False)
else:
# Let's say we have dyn-o-mite-dave - the regex finds the
# start and end positions of the hyphens
@ -484,7 +484,7 @@ cdef class Tokenizer:
if infix_start != start:
span = string[start:infix_start]
tokens.push_back(self.vocab.get(tokens.mem, span), False)
tokens.push_back(self.vocab.get(span), False)
if infix_start != infix_end:
# If infix_start != infix_end, it means the infix
@ -492,11 +492,11 @@ cdef class Tokenizer:
# for tokenization in some languages (see
# https://github.com/explosion/spaCy/issues/768)
infix_span = string[infix_start:infix_end]
tokens.push_back(self.vocab.get(tokens.mem, infix_span), False)
tokens.push_back(self.vocab.get(infix_span), False)
start = infix_end
span = string[start:]
if span:
tokens.push_back(self.vocab.get(tokens.mem, span), False)
tokens.push_back(self.vocab.get(span), False)
cdef vector[const LexemeC*].reverse_iterator it = suffixes.rbegin()
while it != suffixes.rend():
lexeme = deref(it)

View File

@ -266,12 +266,12 @@ cdef class Doc:
cdef const LexemeC* lexeme
for word, has_space in zip(words, spaces):
if isinstance(word, str):
lexeme = self.vocab.get(self.mem, word)
lexeme = self.vocab.get(word)
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
try:
lexeme = self.vocab.get_by_orth(self.mem, word)
lexeme = self.vocab.get_by_orth(word)
except TypeError:
raise TypeError(Errors.E1022.format(wtype=type(word)))
self.push_back(lexeme, has_space)
@ -1430,7 +1430,7 @@ cdef class Doc:
end = start + attrs[i, 0]
has_space = attrs[i, 1]
orth_ = text[start:end]
lex = self.vocab.get(self.mem, orth_)
lex = self.vocab.get(orth_)
self.push_back(lex, has_space)
start = end + has_space
self.from_array(msg["array_head"][2:], attrs[:, 2:])
@ -1536,7 +1536,7 @@ cdef class Doc:
assert words == reconstructed_words
for word, has_space in zip(words, spaces):
lex = self.vocab.get(self.mem, word)
lex = self.vocab.get(word)
self.push_back(lex, has_space)
# Set remaining token-level attributes via Doc.from_array().

Some files were not shown because too many files have changed in this diff.