diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index d0db75f9a..ed69f611b 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -52,17 +52,17 @@ steps:
python -W error -c "import spacy"
displayName: "Test import"
-# - script: |
-# python -m spacy download ca_core_news_sm
-# python -m spacy download ca_core_news_md
-# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-# displayName: 'Test download CLI'
-# condition: eq(variables['python_version'], '3.8')
-#
-# - script: |
-# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-# displayName: 'Test no warnings on load (#11713)'
-# condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -m spacy download ca_core_news_sm
+ python -m spacy download ca_core_news_md
+ python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+ displayName: 'Test download CLI'
+ condition: eq(variables['python_version'], '3.8')
+
+ - script: |
+ python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+ displayName: 'Test no warnings on load (#11713)'
+ condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -86,17 +86,17 @@ steps:
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
-# - script: |
-# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-# displayName: 'Test assemble CLI'
-# condition: eq(variables['python_version'], '3.8')
-#
-# - script: |
-# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-# displayName: 'Test assemble CLI vectors warning'
-# condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+ PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+ displayName: 'Test assemble CLI'
+ condition: eq(variables['python_version'], '3.8')
+
+ - script: |
+ python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+ python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+ displayName: 'Test assemble CLI vectors warning'
+ condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
diff --git a/.github/workflows/autoblack.yml b/.github/workflows/autoblack.yml
index 70882c3cc..555322782 100644
--- a/.github/workflows/autoblack.yml
+++ b/.github/workflows/autoblack.yml
@@ -16,7 +16,7 @@ jobs:
with:
ref: ${{ github.head_ref }}
- uses: actions/setup-python@v4
- - run: pip install black
+ - run: pip install black -c requirements.txt
- name: Auto-format code if needed
run: black spacy
# We can't run black --check here because that returns a non-zero excit
diff --git a/.gitignore b/.gitignore
index ac333f958..af75a4d47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
# Cython / C extensions
cythonize.json
spacy/*.html
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1f396bd71..f6f6dab59 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both
tools installed.
+As a general rule of thumb, we use f-strings for any formatting of strings.
+The one exception is calls to Python's `logging` functionality: to avoid
+unnecessary string conversions in these cases, we use string formatting
+templates with `%s`, `%d`, etc.
+
**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
diff --git a/README.md b/README.md
index 195424551..49aa6796e 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
-💫 **Version 3.4 out now!**
+💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 0f7ea91f9..dba11bd1a 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -11,18 +11,28 @@ trigger:
exclude:
- "website/*"
- "*.md"
+ - "*.mdx"
- ".github/workflows/*"
pr:
paths:
exclude:
- "*.md"
+ - "*.mdx"
- "website/docs/*"
- "website/src/*"
+ - "website/meta/*.tsx"
+ - "website/meta/*.mjs"
+ - "website/meta/languages.json"
+ - "website/meta/site.json"
+ - "website/meta/sidebars.json"
+ - "website/meta/type-annotations.json"
+ - "website/pages/*"
- ".github/workflows/*"
jobs:
- # Perform basic checks for most important errors (syntax etc.) Uses the config
- # defined in .flake8 and overwrites the selected codes.
+ # Check formatting and linting. Perform basic checks for most important errors
+ # (syntax etc.) Uses the config defined in setup.cfg and overwrites the
+ # selected codes.
- job: "Validate"
pool:
vmImage: "ubuntu-latest"
@@ -30,6 +40,10 @@ jobs:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.7"
+ - script: |
+ pip install black -c requirements.txt
+ python -m black spacy --check
+ displayName: "black"
- script: |
pip install flake8==5.0.4
python -m flake8 spacy --count --select=E901,E999,F821,F822,F823,W605 --show-source --statistics
diff --git a/requirements.txt b/requirements.txt
index 5bc1c8684..bc9fc183c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
@@ -31,10 +31,10 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
-mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
+mypy>=0.990,<1.1.0; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
types-requests
types-setuptools>=57.0.0
-black>=22.0,<23.0
+black==22.3.0
diff --git a/setup.cfg b/setup.cfg
index 79dff9e30..cddc5148c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -63,7 +63,7 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
- typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+ typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index 47d05b5b6..c855d1b70 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -4,6 +4,7 @@ from ._util import app, setup_cli # noqa: F401
# These are the actual functions, NOT the wrapped CLI commands. The CLI commands
# are registered automatically and won't have to be imported here.
+from .benchmark_speed import benchmark_speed_cli # noqa: F401
from .download import download # noqa: F401
from .info import info # noqa: F401
from .package import package # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index cc01708a2..12c49a75d 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -46,6 +46,7 @@ DEBUG_HELP = """Suite of helpful commands for debugging and profiling. Includes
commands to check and validate your config files, training and evaluation data,
and custom model implementations.
"""
+BENCHMARK_HELP = """Commands for benchmarking pipelines."""
INIT_HELP = """Commands for initializing configs and pipeline packages."""
CONFIGURE_HELP = """Commands for automatically modifying configs."""
@@ -55,6 +56,7 @@ Arg = typer.Argument
Opt = typer.Option
app = typer.Typer(name=NAME, help=HELP)
+benchmark_cli = typer.Typer(name="benchmark", help=BENCHMARK_HELP, no_args_is_help=True)
project_cli = typer.Typer(name="project", help=PROJECT_HELP, no_args_is_help=True)
debug_cli = typer.Typer(name="debug", help=DEBUG_HELP, no_args_is_help=True)
init_cli = typer.Typer(name="init", help=INIT_HELP, no_args_is_help=True)
@@ -62,6 +64,7 @@ configure_cli = typer.Typer(name="configure", help=CONFIGURE_HELP, no_args_is_he
app.add_typer(project_cli)
app.add_typer(debug_cli)
+app.add_typer(benchmark_cli)
app.add_typer(init_cli)
app.add_typer(configure_cli)
@@ -90,9 +93,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides]
- logger.debug(f"Config overrides from CLI: {keys}")
+ logger.debug("Config overrides from CLI: %s", keys)
if env_overrides:
- logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+ logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides}
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
new file mode 100644
index 000000000..4eb20a5fa
--- /dev/null
+++ b/spacy/cli/benchmark_speed.py
@@ -0,0 +1,174 @@
+from typing import Iterable, List, Optional
+import random
+from itertools import islice
+import numpy
+from pathlib import Path
+import time
+from tqdm import tqdm
+import typer
+from wasabi import msg
+
+from .. import util
+from ..language import Language
+from ..tokens import Doc
+from ..training import Corpus
+from ._util import Arg, Opt, benchmark_cli, setup_gpu
+
+
+@benchmark_cli.command(
+    "speed",
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def benchmark_speed_cli(
+    # fmt: off
+    ctx: typer.Context,
+    model: str = Arg(..., help="Model name or path"),
+    data_path: Path = Arg(..., help="Location of binary evaluation data in .spacy format", exists=True),
+    batch_size: Optional[int] = Opt(None, "--batch-size", "-b", min=1, help="Override the pipeline batch size"),
+    no_shuffle: bool = Opt(False, "--no-shuffle", help="Do not shuffle benchmark data"),
+    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
+    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
+    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    # fmt: on
+):
+    """
+    Benchmark the speed of a pipeline in words per second. Expects a loadable
+    spaCy pipeline and benchmark data in the binary .spacy format.
+    """
+    setup_gpu(use_gpu=use_gpu, silent=False)
+    # Load the pipeline; its own batch size is used unless --batch-size is given.
+    nlp = util.load_model(model)
+    batch_size = batch_size if batch_size is not None else nlp.batch_size
+    corpus = Corpus(data_path)
+    docs = [eg.predicted for eg in corpus(nlp)]
+
+    if len(docs) == 0:
+        msg.fail("Cannot benchmark speed using an empty corpus.", exits=1)
+    # The warmup throughput is discarded; only the run below is reported.
+    print(f"Warming up for {warmup_epochs} epochs...")
+    warmup(nlp, docs, warmup_epochs, batch_size)
+
+    print()
+    print(f"Benchmarking {n_batches} batches...")
+    wps = benchmark(nlp, docs, n_batches, batch_size, not no_shuffle)
+
+    print()
+    print_outliers(wps)
+    print_mean_with_ci(wps)
+
+
+# Lowercased on purpose: it is used like a context manager function.
+class time_context:
+    """Record the running time of a context in seconds as `self.elapsed`."""
+
+    def __enter__(self):
+        self.start = time.perf_counter()
+        return self  # lets `with time_context() as t:` read `t.elapsed` later
+
+    def __exit__(self, type, value, traceback):
+        self.elapsed = time.perf_counter() - self.start  # set only on exit
+
+
+class Quartiles:
+    """Quartiles (q1, q2, q3) of a sample together with its inter-quartile
+    range (iqr = q3 - q1)."""
+
+    q1: float
+    q2: float
+    q3: float
+    iqr: float
+
+    def __init__(self, sample: numpy.ndarray) -> None:
+        # One vectorized call yields the 25%, 50% and 75% quantiles in order.
+        lower, median, upper = numpy.quantile(sample, [0.25, 0.5, 0.75])
+        self.q1, self.q2, self.q3 = lower, median, upper
+        self.iqr = upper - lower
+
+
+def annotate(
+    nlp: Language, docs: List[Doc], batch_size: Optional[int]
+) -> numpy.ndarray:
+    """Pipe `docs` through `nlp` and return the words/s of every batch."""
+    chunk = batch_size if batch_size else nlp.batch_size
+    stream = nlp.pipe(tqdm(docs, unit="doc"), batch_size=batch_size)
+    rates = []
+    while True:
+        # Consuming one chunk of the pipe output is the work being timed.
+        with time_context() as timer:
+            processed = list(islice(stream, chunk))
+        if not processed:
+            break
+        rates.append(count_tokens(processed) / timer.elapsed)
+
+    return numpy.array(rates)
+
+
+def benchmark(
+    nlp: Language,
+    docs: List[Doc],
+    n_batches: int,
+    batch_size: int,
+    shuffle: bool,
+) -> numpy.ndarray:
+    """Time `n_batches * batch_size` fresh docs built from the texts of `docs`.
+
+    With `shuffle` the source docs are drawn at random (with replacement);
+    otherwise `docs` is cycled through in order. Returns `annotate`'s result.
+    """
+    total = n_batches * batch_size
+    if shuffle:
+        picks = (random.choice(docs) for _ in range(total))
+    else:
+        picks = (docs[idx % len(docs)] for idx in range(total))
+
+    return annotate(nlp, [nlp.make_doc(doc.text) for doc in picks], batch_size)
+
+
+def bootstrap(x, statistic=numpy.mean, iterations=10000) -> numpy.ndarray:
+    """Compute `statistic` over `iterations` resamples of `x`, each drawn with
+    replacement and as long as `x` itself; the results come back as float64."""
+
+    def resampled_statistics():
+        for _ in range(iterations):
+            yield statistic(numpy.random.choice(x, len(x), replace=True))
+
+    return numpy.fromiter(resampled_statistics(), numpy.float64)
+
+
+def count_tokens(docs: Iterable[Doc]) -> int:
+    return sum(map(len, docs))  # summed length (token count) of all docs
+
+
+def print_mean_with_ci(sample: numpy.ndarray):
+    """Print the sample mean and the offsets of a bootstrapped 95% CI from it."""
+    mean = numpy.mean(sample)
+    ordered = numpy.sort(bootstrap(sample))
+    n_estimates = len(ordered)
+
+    # 95% confidence interval: the 2.5% and 97.5% points of the sorted means
+    low = ordered[int(n_estimates * 0.025)]
+    high = ordered[int(n_estimates * 0.975)]
+    print(f"Mean: {mean:.1f} words/s (95% CI: {low-mean:.1f} +{high-mean:.1f})")
+
+
+def print_outliers(sample: numpy.ndarray):
+    """Print the share of outliers (beyond 1.5 IQR of q1/q3) and of extreme
+    outliers (beyond 3 IQR) in `sample`, as percentages with one decimal."""
+    quartiles = Quartiles(sample)
+
+    def pct_beyond(factor: float) -> float:
+        fence = factor * quartiles.iqr
+        outside = (sample < quartiles.q1 - fence) | (sample > quartiles.q3 + fence)
+        return (100 * numpy.sum(outside)) / len(sample)
+
+    mild, extreme = pct_beyond(1.5), pct_beyond(3.0)
+
+    # Same one-decimal precision for both shares keeps the report consistent.
+    print(f"Outliers: {mild:.1f}%, extreme outliers: {extreme:.1f}%")
+
+
+def warmup(
+    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
+) -> numpy.ndarray:
+    # Annotate `warmup_epochs` back-to-back copies of `docs` in a single run.
+    return annotate(nlp, docs * warmup_epochs, batch_size)
diff --git a/spacy/cli/debug_data.py b/spacy/cli/debug_data.py
index a85324e87..f20673f25 100644
--- a/spacy/cli/debug_data.py
+++ b/spacy/cli/debug_data.py
@@ -17,6 +17,7 @@ from ..pipeline import TrainablePipe
from ..pipeline._parser_internals import nonproj
from ..pipeline._parser_internals.nonproj import DELIMITER
from ..pipeline import Morphologizer, SpanCategorizer
+from ..pipeline._edit_tree_internals.edit_trees import EditTrees
from ..morphology import Morphology
from ..language import Language
from ..util import registry, resolve_dot_names
@@ -671,6 +672,59 @@ def debug_data(
f"Found {gold_train_data['n_cycles']} projectivized train sentence(s) with cycles"
)
+ if "trainable_lemmatizer" in factory_names:
+ msg.divider("Trainable Lemmatizer")
+ trees_train: Set[str] = gold_train_data["lemmatizer_trees"]
+ trees_dev: Set[str] = gold_dev_data["lemmatizer_trees"]
+ # This is necessary context when someone is attempting to interpret whether the
+ # number of trees exclusively in the dev set is meaningful.
+ msg.info(f"{len(trees_train)} lemmatizer trees generated from training data")
+ msg.info(f"{len(trees_dev)} lemmatizer trees generated from dev data")
+ dev_not_train = trees_dev - trees_train
+
+ if len(dev_not_train) != 0:
+ pct = len(dev_not_train) / len(trees_dev)
+ msg.info(
+ f"{len(dev_not_train)} lemmatizer trees ({pct*100:.1f}% of dev trees)"
+ " were found exclusively in the dev data."
+ )
+ else:
+ # Would we ever expect this case? It seems like it would be pretty rare,
+ # and we might actually want a warning?
+ msg.info("All trees in dev data present in training data.")
+
+ if gold_train_data["n_low_cardinality_lemmas"] > 0:
+ n = gold_train_data["n_low_cardinality_lemmas"]
+ msg.warn(f"{n} training docs with 0 or 1 unique lemmas.")
+
+ if gold_dev_data["n_low_cardinality_lemmas"] > 0:
+ n = gold_dev_data["n_low_cardinality_lemmas"]
+ msg.warn(f"{n} dev docs with 0 or 1 unique lemmas.")
+
+ if gold_train_data["no_lemma_annotations"] > 0:
+ n = gold_train_data["no_lemma_annotations"]
+ msg.warn(f"{n} training docs with no lemma annotations.")
+ else:
+ msg.good("All training docs have lemma annotations.")
+
+ if gold_dev_data["no_lemma_annotations"] > 0:
+ n = gold_dev_data["no_lemma_annotations"]
+ msg.warn(f"{n} dev docs with no lemma annotations.")
+ else:
+ msg.good("All dev docs have lemma annotations.")
+
+ if gold_train_data["partial_lemma_annotations"] > 0:
+ n = gold_train_data["partial_lemma_annotations"]
+ msg.info(f"{n} training docs with partial lemma annotations.")
+ else:
+ msg.good("All training docs have complete lemma annotations.")
+
+ if gold_dev_data["partial_lemma_annotations"] > 0:
+ n = gold_dev_data["partial_lemma_annotations"]
+ msg.info(f"{n} dev docs with partial lemma annotations.")
+ else:
+ msg.good("All dev docs have complete lemma annotations.")
+
msg.divider("Summary")
good_counts = msg.counts[MESSAGES.GOOD]
warn_counts = msg.counts[MESSAGES.WARN]
@@ -732,7 +786,13 @@ def _compile_gold(
"n_cats_multilabel": 0,
"n_cats_bad_values": 0,
"texts": set(),
+ "lemmatizer_trees": set(),
+ "no_lemma_annotations": 0,
+ "partial_lemma_annotations": 0,
+ "n_low_cardinality_lemmas": 0,
}
+ if "trainable_lemmatizer" in factory_names:
+ trees = EditTrees(nlp.vocab.strings)
for eg in examples:
gold = eg.reference
doc = eg.predicted
@@ -862,6 +922,25 @@ def _compile_gold(
data["n_nonproj"] += 1
if nonproj.contains_cycle(aligned_heads):
data["n_cycles"] += 1
+ if "trainable_lemmatizer" in factory_names:
+ # from EditTreeLemmatizer._labels_from_data
+ if all(token.lemma == 0 for token in gold):
+ data["no_lemma_annotations"] += 1
+ continue
+ if any(token.lemma == 0 for token in gold):
+ data["partial_lemma_annotations"] += 1
+ lemma_set = set()
+ for token in gold:
+ if token.lemma != 0:
+ lemma_set.add(token.lemma)
+ tree_id = trees.add(token.text, token.lemma_)
+ tree_str = trees.tree_to_str(tree_id)
+ data["lemmatizer_trees"].add(tree_str)
+ # We want to identify cases where lemmas aren't assigned
+ # or are all assigned the same value, as this would indicate
+ # an issue since we're expecting a large set of lemmas
+ if len(lemma_set) < 2 and len(gold) > 1:
+ data["n_low_cardinality_lemmas"] += 1
return data
diff --git a/spacy/cli/evaluate.py b/spacy/cli/evaluate.py
index 0d08d2c5e..8f3d6b859 100644
--- a/spacy/cli/evaluate.py
+++ b/spacy/cli/evaluate.py
@@ -7,12 +7,15 @@ from thinc.api import fix_random_seed
from ..training import Corpus
from ..tokens import Doc
-from ._util import app, Arg, Opt, setup_gpu, import_code
+from ._util import app, Arg, Opt, setup_gpu, import_code, benchmark_cli
from ..scorer import Scorer
from .. import util
from .. import displacy
+@benchmark_cli.command(
+ "accuracy",
+)
@app.command("evaluate")
def evaluate_cli(
# fmt: off
@@ -36,7 +39,7 @@ def evaluate_cli(
dependency parses in a HTML file, set as output directory as the
displacy_path argument.
- DOCS: https://spacy.io/api/cli#evaluate
+ DOCS: https://spacy.io/api/cli#benchmark-accuracy
"""
import_code(code_path)
evaluate(
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 6e3cde88c..8894baa50 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
- logger.debug(f"CMD: {cmd['name']}.")
+ logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug(
- f"URL: {url} for {output_path} with command hash {cmd_hash}"
+ "URL: %s for %s with command hash %s",
+ url,
+ output_path,
+ cmd_hash,
)
yield url, output_path
@@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i)
break
else:
- logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
+ logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else:
# If we didn't break the for loop, break the while loop.
break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index bc779e9cd..a8178de21 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
- logger.debug(f"CMD: cmd['name']")
+ logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
- logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
+ logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
- logger.debug(f"CMD_HASH: {cmd_hash}")
+ logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc),
)
logger.debug(
- f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+ "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
)
yield output_path, url
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index a3cfd96dd..ea6bba2c9 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -106,9 +106,7 @@ def serve(
if is_in_jupyter():
warnings.warn(Warnings.W011)
- render(
- docs, style=style, page=page, minify=minify, options=options, manual=manual
- )
+ render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n")
diff --git a/spacy/errors.py b/spacy/errors.py
index 498df0320..d143e341c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -965,8 +965,8 @@ class Errors(metaclass=ErrorsWithCodes):
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
E1049 = ("No available port found for displaCy on host {host}. Please specify an available port "
- "with `displacy.serve(doc, port)`")
- E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port)` "
+ "with `displacy.serve(doc, port=port)`")
+ E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` "
"or use `auto_switch_port=True` to pick an available port automatically.")
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 485e52c2f..edba523cf 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
- DOCS: https://spacy.io/api/kb_in_memory
+ DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):
diff --git a/spacy/language.py b/spacy/language.py
index e0abfd5e7..9fdcf6328 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
- util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+ util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)
return lookups
@@ -1969,7 +1969,7 @@ class Language:
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
- util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+ util.logger.debug("Replacing listeners of component '%s'", pipe_name)
if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update
diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx
index 0e8cd26da..e823ce99d 100644
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
max_edits = fuzzy
else:
# allow at least two edits (to allow at least one transposition) and up
- # to 20% of the pattern string length
+ # to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 77ea7b7a6..48922865b 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -5,8 +5,12 @@ from ..vocab import Vocab
from ..tokens import Doc, Span
class Matcher:
- def __init__(self, vocab: Vocab, validate: bool = ...,
- fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
+ def __init__(
+ self,
+ vocab: Vocab,
+ validate: bool = ...,
+ fuzzy_compare: Callable[[str, str, int], bool] = ...,
+ ) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index a56c9975e..332badd8c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -5,8 +5,8 @@ from itertools import islice
import numpy as np
import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
+from thinc.types import Floats2d, Ints2d
from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
@@ -20,6 +20,10 @@ from ..vocab import Vocab
from .. import util
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
+ self.numpy_ops = NumpyOps()
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@@ -128,7 +133,7 @@ class EditTreeLemmatizer(TrainablePipe):
for (predicted, gold_lemma) in zip(
eg.predicted, eg.get_aligned("LEMMA", as_string=True)
):
- if gold_lemma is None:
+ if gold_lemma is None or gold_lemma == "":
label = -1
else:
tree_id = self.trees.add(predicted.text, gold_lemma)
@@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+ if self.top_k == 1:
+ scores2guesses = self._scores2guesses_top_k_equals_1
+ elif self.top_k <= TOP_K_GUARDRAIL:
+ scores2guesses = self._scores2guesses_top_k_greater_1
+ else:
+ scores2guesses = self._scores2guesses_top_k_guardrail
+ # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
+ # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
+ # for its principal purpose of lemmatizing tokens. However, the code could also
+ # be used for other purposes, and with very large values of *top_k* the method
+ # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
+ # instead.
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
@@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
return guesses
scores = self.model.predict(docs)
assert len(scores) == n_docs
- guesses = self._scores2guesses(docs, scores)
+ guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return guesses
- def _scores2guesses(self, docs, scores):
+ def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
- if self.top_k == 1:
- doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
- else:
- doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+ doc_guesses = doc_scores.argmax(axis=1)
+ doc_guesses = self.numpy_ops.asarray(doc_guesses)
- if not isinstance(doc_guesses, np.ndarray):
- doc_guesses = doc_guesses.get()
+ doc_compat_guesses = []
+ for i, token in enumerate(doc):
+ tree_id = self.cfg["labels"][doc_guesses[i]]
+ if self.trees.apply(tree_id, token.text) is not None:
+ doc_compat_guesses.append(tree_id)
+ else:
+ doc_compat_guesses.append(-1)
+ guesses.append(np.array(doc_compat_guesses))
+
+ return guesses
+
+    def _scores2guesses_top_k_greater_1(self, docs, scores):
+        guesses = []
+        top_k = min(self.top_k, len(self.labels))  # never more tries than labels
+        for doc, doc_scores in zip(docs, scores):
+            doc_scores = self.numpy_ops.asarray(doc_scores)
+            doc_compat_guesses = []
+            for i, token in enumerate(doc):
+                for _ in range(top_k):  # try up to top_k candidates per token
+                    candidate = int(doc_scores[i].argmax())  # best unmasked score
+                    candidate_tree_id = self.cfg["labels"][candidate]
+                    if self.trees.apply(candidate_tree_id, token.text) is not None:
+                        doc_compat_guesses.append(candidate_tree_id)
+                        break  # first applicable tree wins
+                    doc_scores[i, candidate] = np.finfo(np.float32).min
+                else:  # loop ended without break: no candidate tree applied
+                    doc_compat_guesses.append(-1)
+            guesses.append(np.array(doc_compat_guesses))
+        # One array per doc holding a tree id (or -1) for every token.
+        return guesses
+
+ def _scores2guesses_top_k_guardrail(self, docs, scores):
+ guesses = []
+ for doc, doc_scores in zip(docs, scores):
+ doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+ doc_guesses = self.numpy_ops.asarray(doc_guesses)
doc_compat_guesses = []
for token, candidates in zip(doc, doc_guesses):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 3675c12dd..140592dcd 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
- FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
- FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
- FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
- FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
- FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
- FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
- FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
- FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
- FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
+ FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy1"
+ )
+ FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy2"
+ )
+ FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy3"
+ )
+ FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy4"
+ )
+ FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy5"
+ )
+ FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy6"
+ )
+ FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy7"
+ )
+ FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy8"
+ )
+ FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy9"
+ )
class Config:
extra = "forbid"
diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
index 3676b35af..b4631037a 100644
--- a/spacy/tests/doc/test_span.py
+++ b/spacy/tests/doc/test_span.py
@@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
assert span.text == text
+def test_char_span_attributes(doc):
+ label = "LABEL"
+ kb_id = "KB_ID"
+ span_id = "SPAN_ID"
+ span1 = doc.char_span(20, 45, label=label, kb_id=kb_id, span_id=span_id)
+ span2 = doc[1:].char_span(15, 40, label=label, kb_id=kb_id, span_id=span_id)
+ assert span1.text == span2.text
+ assert span1.label_ == span2.label_ == label
+ assert span1.kb_id_ == span2.kb_id_ == kb_id
+ assert span1.id_ == span2.id_ == span_id
+
+
def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
@@ -367,6 +379,14 @@ def test_spans_by_character(doc):
span1.start_char + 1, span1.end_char, label="GPE", alignment_mode="unk"
)
+ # Span.char_span + alignment mode "contract"
+ span2 = doc[0:2].char_span(
+ span1.start_char - 3, span1.end_char, label="GPE", alignment_mode="contract"
+ )
+ assert span1.start_char == span2.start_char
+ assert span1.end_char == span2.end_char
+ assert span2.label_ == "GPE"
+
def test_span_to_array(doc):
span = doc[1:-2]
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index b12ca5dd4..128d75680 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -101,14 +101,15 @@ def test_initialize_from_labels():
}
-def test_no_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_no_data(top_k):
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
TEXTCAT_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
nlp = English()
- nlp.add_pipe("trainable_lemmatizer")
+ nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp.add_pipe("textcat")
train_examples = []
@@ -119,10 +120,11 @@ def test_no_data():
nlp.initialize(get_examples=lambda: train_examples)
-def test_incomplete_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_incomplete_data(top_k):
# Test that the lemmatizer works with incomplete information
nlp = English()
- lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+ lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in PARTIAL_DATA:
@@ -139,10 +141,25 @@ def test_incomplete_data():
assert doc[1].lemma_ == "like"
assert doc[2].lemma_ == "blue"
+ # Check that incomplete annotations are ignored.
+ scores, _ = lemmatizer.model([eg.predicted for eg in train_examples], is_train=True)
+ _, dX = lemmatizer.get_loss(train_examples, scores)
+ xp = lemmatizer.model.ops.xp
-def test_overfitting_IO():
+ # Missing annotations.
+ assert xp.count_nonzero(dX[0][0]) == 0
+ assert xp.count_nonzero(dX[0][3]) == 0
+ assert xp.count_nonzero(dX[1][0]) == 0
+ assert xp.count_nonzero(dX[1][3]) == 0
+
+ # Misaligned annotations.
+ assert xp.count_nonzero(dX[1][1]) == 0
+
+
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_overfitting_IO(top_k):
nlp = English()
- lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+ lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in TRAIN_DATA:
@@ -175,7 +192,7 @@ def test_overfitting_IO():
# Check model after a {to,from}_bytes roundtrip
nlp_bytes = nlp.to_bytes()
nlp3 = English()
- nlp3.add_pipe("trainable_lemmatizer")
+ nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp3.from_bytes(nlp_bytes)
doc3 = nlp3(test_text)
assert doc3[0].lemma_ == "she"
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 10701263f..1188e4a1b 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -620,7 +620,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
-@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@@ -631,7 +630,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
-@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@@ -1021,8 +1019,6 @@ def test_local_remote_storage_pull_missing():
def test_cli_find_threshold(capsys):
- thresholds = numpy.linspace(0, 1, 10)
-
def make_examples(nlp: Language) -> List[Example]:
docs: List[Example] = []
@@ -1078,7 +1074,7 @@ def test_cli_find_threshold(capsys):
)
with make_tempdir() as nlp_dir:
nlp.to_disk(nlp_dir)
- res = find_threshold(
+ best_threshold, best_score, res = find_threshold(
model=nlp_dir,
data_path=docs_dir / "docs.spacy",
pipe_name="tc_multi",
@@ -1086,16 +1082,14 @@ def test_cli_find_threshold(capsys):
scores_key="cats_macro_f",
silent=True,
)
- assert res[0] != thresholds[0]
- assert thresholds[0] < res[0] < thresholds[9]
- assert res[1] == 1.0
- assert res[2][1.0] == 0.0
+ assert best_score == max(res.values())
+ assert res[1.0] == 0.0
# Test with spancat.
nlp, _ = init_nlp((("spancat", {}),))
with make_tempdir() as nlp_dir:
nlp.to_disk(nlp_dir)
- res = find_threshold(
+ best_threshold, best_score, res = find_threshold(
model=nlp_dir,
data_path=docs_dir / "docs.spacy",
pipe_name="spancat",
@@ -1103,10 +1097,8 @@ def test_cli_find_threshold(capsys):
scores_key="spans_sc_f",
silent=True,
)
- assert res[0] != thresholds[0]
- assert thresholds[0] < res[0] < thresholds[8]
- assert res[1] >= 0.6
- assert res[2][1.0] == 0.0
+ assert best_score == max(res.values())
+ assert res[1.0] == 0.0
# Having multiple textcat_multilabel components should work, since the name has to be specified.
nlp, _ = init_nlp((("textcat_multilabel", {}),))
@@ -1276,3 +1268,69 @@ def test_walk_directory():
assert (len(walk_directory(d, suffix="iob"))) == 2
assert (len(walk_directory(d, suffix="conll"))) == 3
assert (len(walk_directory(d, suffix="pdf"))) == 0
+
+
+def test_debug_data_trainable_lemmatizer_basic():
+ examples = [
+ ("She likes green eggs", {"lemmas": ["she", "like", "green", "egg"]}),
+ ("Eat blue ham", {"lemmas": ["eat", "blue", "ham"]}),
+ ]
+ nlp = Language()
+ train_examples = []
+ for t in examples:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+ data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+ # ref test_edit_tree_lemmatizer::test_initialize_from_labels
+ # this results in 4 trees
+ assert len(data["lemmatizer_trees"]) == 4
+
+
+def test_debug_data_trainable_lemmatizer_partial():
+ partial_examples = [
+ # partial annotation
+ ("She likes green eggs", {"lemmas": ["", "like", "green", ""]}),
+ # misaligned partial annotation
+ (
+ "He hates green eggs",
+ {
+ "words": ["He", "hat", "es", "green", "eggs"],
+ "lemmas": ["", "hat", "e", "green", ""],
+ },
+ ),
+ ]
+ nlp = Language()
+ train_examples = []
+ for t in partial_examples:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+ data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+ assert data["partial_lemma_annotations"] == 2
+
+
+def test_debug_data_trainable_lemmatizer_low_cardinality():
+ low_cardinality_examples = [
+ ("She likes green eggs", {"lemmas": ["no", "no", "no", "no"]}),
+ ("Eat blue ham", {"lemmas": ["no", "no", "no"]}),
+ ]
+ nlp = Language()
+ train_examples = []
+ for t in low_cardinality_examples:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+ data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+ assert data["n_low_cardinality_lemmas"] == 2
+
+
+def test_debug_data_trainable_lemmatizer_not_annotated():
+ unannotated_examples = [
+ ("She likes green eggs", {}),
+ ("Eat blue ham", {}),
+ ]
+ nlp = Language()
+ train_examples = []
+ for t in unannotated_examples:
+ train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
+
+ data = _compile_gold(train_examples, ["trainable_lemmatizer"], nlp, True)
+ assert data["no_lemma_annotations"] == 2
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
index 873a3ff66..40100412a 100644
--- a/spacy/tests/test_cli_app.py
+++ b/spacy/tests/test_cli_app.py
@@ -1,9 +1,10 @@
import os
from pathlib import Path
from typer.testing import CliRunner
+from spacy.tokens import DocBin, Doc
from spacy.cli._util import app
-from .util import make_tempdir
+from .util import make_tempdir, normalize_whitespace
def test_convert_auto():
@@ -31,3 +32,60 @@ def test_convert_auto_conflict():
assert "All input files must be same type" in result.stdout
out_files = os.listdir(d_out)
assert len(out_files) == 0
+
+
+def test_benchmark_accuracy_alias():
+ # Verify that the `evaluate` alias works correctly.
+ result_benchmark = CliRunner().invoke(app, ["benchmark", "accuracy", "--help"])
+ result_evaluate = CliRunner().invoke(app, ["evaluate", "--help"])
+ assert normalize_whitespace(result_benchmark.stdout) == normalize_whitespace(
+ result_evaluate.stdout.replace("spacy evaluate", "spacy benchmark accuracy")
+ )
+
+
+def test_debug_data_trainable_lemmatizer_cli(en_vocab):
+ train_docs = [
+ Doc(en_vocab, words=["I", "like", "cats"], lemmas=["I", "like", "cat"]),
+ Doc(
+ en_vocab,
+ words=["Dogs", "are", "great", "too"],
+ lemmas=["dog", "be", "great", "too"],
+ ),
+ ]
+ dev_docs = [
+ Doc(en_vocab, words=["Cats", "are", "cute"], lemmas=["cat", "be", "cute"]),
+ Doc(en_vocab, words=["Pets", "are", "great"], lemmas=["pet", "be", "great"]),
+ ]
+ with make_tempdir() as d_in:
+ train_bin = DocBin(docs=train_docs)
+ train_bin.to_disk(d_in / "train.spacy")
+ dev_bin = DocBin(docs=dev_docs)
+ dev_bin.to_disk(d_in / "dev.spacy")
+ # `debug data` requires an input pipeline config
+ CliRunner().invoke(
+ app,
+ [
+ "init",
+ "config",
+ f"{d_in}/config.cfg",
+ "--lang",
+ "en",
+ "--pipeline",
+ "trainable_lemmatizer",
+ ],
+ )
+ result_debug_data = CliRunner().invoke(
+ app,
+ [
+ "debug",
+ "data",
+ f"{d_in}/config.cfg",
+ "--paths.train",
+ f"{d_in}/train.spacy",
+ "--paths.dev",
+ f"{d_in}/dev.spacy",
+ ],
+ )
+ # Instead of checking specific wording of the output, which may change,
+ # we'll check that this section of the debug output is present.
+ assert "= Trainable Lemmatizer =" in result_debug_data.stdout
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 03790eb86..236856dad 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -46,7 +46,7 @@ def assert_sents_error(doc):
def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy")
- logger.warning(f"Trouble with component {proc_name}.")
+ logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture
diff --git a/spacy/tests/training/test_corpus.py b/spacy/tests/training/test_corpus.py
new file mode 100644
index 000000000..b4f9cc13a
--- /dev/null
+++ b/spacy/tests/training/test_corpus.py
@@ -0,0 +1,78 @@
+from typing import IO, Generator, Iterable, List, TextIO, Tuple
+from contextlib import contextmanager
+from pathlib import Path
+import pytest
+import tempfile
+
+from spacy.lang.en import English
+from spacy.training import Example, PlainTextCorpus
+from spacy.util import make_tempdir
+
+# Intentional newlines to check that they are skipped.
+PLAIN_TEXT_DOC = """
+
+This is a doc. It contains two sentences.
+This is another doc.
+
+A third doc.
+
+"""
+
+PLAIN_TEXT_DOC_TOKENIZED = [
+ [
+ "This",
+ "is",
+ "a",
+ "doc",
+ ".",
+ "It",
+ "contains",
+ "two",
+ "sentences",
+ ".",
+ ],
+ ["This", "is", "another", "doc", "."],
+ ["A", "third", "doc", "."],
+]
+
+
+@pytest.mark.parametrize("min_length", [0, 5])
+@pytest.mark.parametrize("max_length", [0, 5])
+def test_plain_text_reader(min_length, max_length):
+ nlp = English()
+ with _string_to_tmp_file(PLAIN_TEXT_DOC) as file_path:
+ corpus = PlainTextCorpus(
+ file_path, min_length=min_length, max_length=max_length
+ )
+
+ check = [
+ doc
+ for doc in PLAIN_TEXT_DOC_TOKENIZED
+ if len(doc) >= min_length and (max_length == 0 or len(doc) <= max_length)
+ ]
+ reference, predicted = _examples_to_tokens(corpus(nlp))
+
+ assert reference == check
+ assert predicted == check
+
+
+@contextmanager
+def _string_to_tmp_file(s: str) -> Generator[Path, None, None]:
+ with make_tempdir() as d:
+ file_path = Path(d) / "string.txt"
+ with open(file_path, "w", encoding="utf-8") as f:
+ f.write(s)
+ yield file_path
+
+
+def _examples_to_tokens(
+ examples: Iterable[Example],
+) -> Tuple[List[List[str]], List[List[str]]]:
+ reference = []
+ predicted = []
+
+ for eg in examples:
+ reference.append([t.text for t in eg.reference])
+ predicted.append([t.text for t in eg.predicted])
+
+ return reference, predicted
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index d5f3c39ff..c2647558d 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -1,6 +1,7 @@
import numpy
import tempfile
import contextlib
+import re
import srsly
from spacy.tokens import Doc
from spacy.vocab import Vocab
@@ -95,3 +96,7 @@ def assert_packed_msg_equal(b1, b2):
for (k1, v1), (k2, v2) in zip(sorted(msg1.items()), sorted(msg2.items())):
assert k1 == k2
assert v1 == v2
+
+
+def normalize_whitespace(s):
+ return re.sub(r"\s+", " ", s)
diff --git a/spacy/tokens/doc.pyi b/spacy/tokens/doc.pyi
index f0cdaee87..9d45960ab 100644
--- a/spacy/tokens/doc.pyi
+++ b/spacy/tokens/doc.pyi
@@ -108,6 +108,7 @@ class Doc:
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
alignment_mode: str = ...,
+ span_id: Union[int, str] = ...,
) -> Span: ...
def similarity(self, other: Union[Doc, Span, Token, Lexeme]) -> float: ...
@property
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 075bc4d15..7dfe0ca9f 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -528,9 +528,9 @@ cdef class Doc:
doc (Doc): The parent document.
start_idx (int): The index of the first character of the span.
end_idx (int): The index of the first character after the span.
- label (uint64 or string): A label to attach to the Span, e.g. for
+ label (Union[int, str]): A label to attach to the Span, e.g. for
named entities.
- kb_id (uint64 or string): An ID from a KB to capture the meaning of a
+ kb_id (Union[int, str]): An ID from a KB to capture the meaning of a
named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
@@ -539,6 +539,7 @@ cdef class Doc:
with token boundaries), "contract" (span of all tokens completely
within the character span), "expand" (span of all tokens at least
partially covered by the character span). Defaults to "strict".
+ span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object.
DOCS: https://spacy.io/api/doc#char_span
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 9986a90e6..a92f19e20 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -98,6 +98,9 @@ class Span:
label: Union[int, str] = ...,
kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
+ id: Union[int, str] = ...,
+ alignment_mode: str = ...,
+ span_id: Union[int, str] = ...,
) -> Span: ...
@property
def conjuncts(self) -> Tuple[Token]: ...
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index 99a5f43bd..cfe1236df 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -362,7 +362,7 @@ cdef class Span:
result = xp.dot(vector, other.vector) / (self.vector_norm * other.vector_norm)
# ensure we get a scalar back (numpy does this automatically but cupy doesn't)
return result.item()
-
+
cpdef np.ndarray to_array(self, object py_attr_ids):
"""Given a list of M attribute IDs, export the tokens to a numpy
`ndarray` of shape `(N, M)`, where `N` is the length of the document.
@@ -639,21 +639,28 @@ cdef class Span:
else:
return self.doc[root]
- def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0):
+ def char_span(self, int start_idx, int end_idx, label=0, kb_id=0, vector=None, id=0, alignment_mode="strict", span_id=0):
"""Create a `Span` object from the slice `span.text[start : end]`.
start (int): The index of the first character of the span.
end (int): The index of the first character after the span.
- label (uint64 or string): A label to attach to the Span, e.g. for
+ label (Union[int, str]): A label to attach to the Span, e.g. for
named entities.
- kb_id (uint64 or string): An ID from a KB to capture the meaning of a named entity.
+ kb_id (Union[int, str]): An ID from a KB to capture the meaning of a named entity.
vector (ndarray[ndim=1, dtype='float32']): A meaning representation of
the span.
+ id (Union[int, str]): Unused.
+ alignment_mode (str): How character indices are aligned to token
+ boundaries. Options: "strict" (character indices must be aligned
+ with token boundaries), "contract" (span of all tokens completely
+ within the character span), "expand" (span of all tokens at least
+ partially covered by the character span). Defaults to "strict".
+ span_id (Union[int, str]): An identifier to associate with the span.
RETURNS (Span): The newly constructed object.
"""
start_idx += self.c.start_char
end_idx += self.c.start_char
- return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector)
+ return self.doc.char_span(start_idx, end_idx, label=label, kb_id=kb_id, vector=vector, alignment_mode=alignment_mode, span_id=span_id)
@property
def conjuncts(self):
diff --git a/spacy/training/__init__.py b/spacy/training/__init__.py
index 71d1fa775..a6f873f05 100644
--- a/spacy/training/__init__.py
+++ b/spacy/training/__init__.py
@@ -1,4 +1,4 @@
-from .corpus import Corpus, JsonlCorpus # noqa: F401
+from .corpus import Corpus, JsonlCorpus, PlainTextCorpus # noqa: F401
from .example import Example, validate_examples, validate_get_examples # noqa: F401
from .alignment import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 426fddf90..7e2494f5b 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -11,7 +11,7 @@ def create_copy_from_base_model(
) -> Callable[[Language], Language]:
def copy_from_base_model(nlp):
if tokenizer:
- logger.info(f"Copying tokenizer from: {tokenizer}")
+ logger.info("Copying tokenizer from: %s", tokenizer)
base_nlp = load_model(tokenizer)
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@@ -23,7 +23,7 @@ def create_copy_from_base_model(
)
)
if vocab:
- logger.info(f"Copying vocab from: {vocab}")
+ logger.info("Copying vocab from: %s", vocab)
# only reload if the vocab is from a different model
if tokenizer != vocab:
base_nlp = load_model(vocab)
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index b9f929fcd..086ad831c 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -29,7 +29,7 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
- util.logger.debug(f"Loading corpus from path: {path}")
+ util.logger.debug("Loading corpus from path: %s", path)
return Corpus(
path,
gold_preproc=gold_preproc,
@@ -58,6 +58,28 @@ def read_labels(path: Path, *, require: bool = False):
return srsly.read_json(path)
+@util.registry.readers("spacy.PlainTextCorpus.v1")
+def create_plain_text_reader(
+ path: Optional[Path],
+ min_length: int = 0,
+ max_length: int = 0,
+) -> Callable[["Language"], Iterable[Example]]:
+ """Iterate Example objects from a file or directory of plain text
+ UTF-8 files with one line per doc.
+
+ path (Path): The directory or filename to read from.
+ min_length (int): Minimum document length (in tokens). Shorter documents
+ will be skipped. Defaults to 0, which indicates no limit.
+ max_length (int): Maximum document length (in tokens). Longer documents will
+ be skipped. Defaults to 0, which indicates no limit.
+
+ DOCS: https://spacy.io/api/corpus#plaintextcorpus
+ """
+ if path is None:
+ raise ValueError(Errors.E913)
+ return PlainTextCorpus(path, min_length=min_length, max_length=max_length)
+
+
def walk_corpus(path: Union[str, Path], file_type) -> List[Path]:
path = util.ensure_path(path)
if not path.is_dir() and path.parts[-1].endswith(file_type):
@@ -257,3 +279,52 @@ class JsonlCorpus:
# We don't *need* an example here, but it seems nice to
# make it match the Corpus signature.
yield Example(doc, Doc(nlp.vocab, words=words, spaces=spaces))
+
+
+class PlainTextCorpus:
+ """Iterate Example objects from a file or directory of plain text
+ UTF-8 files with one line per doc.
+
+ path (Path): The directory or filename to read from.
+ min_length (int): Minimum document length (in tokens). Shorter documents
+ will be skipped. Defaults to 0, which indicates no limit.
+ max_length (int): Maximum document length (in tokens). Longer documents will
+ be skipped. Defaults to 0, which indicates no limit.
+
+ DOCS: https://spacy.io/api/corpus#plaintextcorpus
+ """
+
+ file_type = "txt"
+
+ def __init__(
+ self,
+ path: Optional[Union[str, Path]],
+ *,
+ min_length: int = 0,
+ max_length: int = 0,
+ ) -> None:
+ self.path = util.ensure_path(path)
+ self.min_length = min_length
+ self.max_length = max_length
+
+ def __call__(self, nlp: "Language") -> Iterator[Example]:
+ """Yield examples from the data.
+
+ nlp (Language): The current nlp object.
+ YIELDS (Example): The example objects.
+
+ DOCS: https://spacy.io/api/corpus#plaintextcorpus-call
+ """
+ for loc in walk_corpus(self.path, ".txt"):
+ with open(loc, encoding="utf-8") as f:
+ for text in f:
+ text = text.rstrip("\r\n")
+ if len(text):
+ doc = nlp.make_doc(text)
+ if self.min_length >= 1 and len(doc) < self.min_length:
+ continue
+ elif self.max_length >= 1 and len(doc) > self.max_length:
+ continue
+ # We don't *need* an example here, but it seems nice to
+ # make it match the Corpus signature.
+ yield Example(doc, doc.copy())
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 6304e4a84..e90617852 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced if p not in frozen_components]
- logger.info(f"Pipeline: {nlp.pipe_names}")
+ logger.info("Pipeline: %s", nlp.pipe_names)
if resume_components:
with nlp.select_pipes(enable=resume_components):
- logger.info(f"Resuming training for: {resume_components}")
+ logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further
nlp._link_components()
@@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
if T["max_epochs"] == -1:
sample_size = 100
logger.debug(
- f"Due to streamed train corpus, using only first {sample_size} "
- f"examples for initialization. If necessary, provide all labels "
- f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+ "Due to streamed train corpus, using only first %s examples for initialization. "
+ "If necessary, provide all labels in [initialize]. "
+ "More info: https://spacy.io/api/cli#init_labels",
+ sample_size,
)
nlp.initialize(
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
)
else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
- logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+ logger.info("Initialized pipeline components: %s", nlp.pipe_names)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
for listener in getattr(
@@ -109,7 +110,7 @@ def init_vocab(
) -> None:
if lookups:
nlp.vocab.lookups = lookups
- logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
+ logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
data_path = ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
@@ -125,11 +126,11 @@ def init_vocab(
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
- logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+ logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
logger.info("Created vocabulary")
if vectors is not None:
load_vectors_into_model(nlp, vectors)
- logger.info(f"Added vectors: {vectors}")
+ logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@@ -191,7 +192,7 @@ def init_tok2vec(
if weights_data is not None:
layer = get_tok2vec_ref(nlp, P)
layer.from_bytes(weights_data)
- logger.info(f"Loaded pretrained weights from {init_tok2vec}")
+ logger.info("Loaded pretrained weights from %s", init_tok2vec)
return True
return False
@@ -216,13 +217,13 @@ def convert_vectors(
nlp.vocab.deduplicate_vectors()
else:
if vectors_loc:
- logger.info(f"Reading vectors from {vectors_loc}")
+ logger.info("Reading vectors from %s", vectors_loc)
vectors_data, vector_keys, floret_settings = read_vectors(
vectors_loc,
truncate,
mode=mode,
)
- logger.info(f"Loaded vectors from {vectors_loc}")
+ logger.info("Loaded vectors from %s", vectors_loc)
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None and mode != VectorsMode.floret:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 885257772..eca40e3d9 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
if subdir.exists():
try:
shutil.rmtree(str(subdir))
- logger.debug(f"Removed existing output directory: {subdir}")
+ logger.debug("Removed existing output directory: %s", subdir)
except Exception as e:
raise IOError(Errors.E901.format(path=path)) from e
diff --git a/website/.dockerignore b/website/.dockerignore
new file mode 100644
index 000000000..e4a88552e
--- /dev/null
+++ b/website/.dockerignore
@@ -0,0 +1,9 @@
+.cache/
+.next/
+public/
+node_modules
+.npm
+logs
+*.log
+npm-debug.log*
+quickstart-training-generator.js
diff --git a/website/.gitignore b/website/.gitignore
index 70ef99fa5..599c0953a 100644
--- a/website/.gitignore
+++ b/website/.gitignore
@@ -1,5 +1,7 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+quickstart-training-generator.js
+
# dependencies
/node_modules
/.pnp
@@ -41,4 +43,4 @@ next-env.d.ts
public/robots.txt
public/sitemap*
public/sw.js*
-public/workbox*
\ No newline at end of file
+public/workbox*
diff --git a/website/Dockerfile b/website/Dockerfile
index f71733e55..9b2f6cac4 100644
--- a/website/Dockerfile
+++ b/website/Dockerfile
@@ -1,16 +1,14 @@
-FROM node:11.15.0
+FROM node:18
-WORKDIR /spacy-io
-
-RUN npm install -g gatsby-cli@2.7.4
-
-COPY package.json .
-COPY package-lock.json .
-
-RUN npm install
+USER node
# This is so the installed node_modules will be up one directory
# from where a user mounts files, so that they don't accidentally mount
# their own node_modules from a different build
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
+COPY --chown=node package.json .
+COPY --chown=node package-lock.json .
+RUN npm install
+
+WORKDIR /home/node/website/
diff --git a/website/README.md b/website/README.md
index e9d7aec26..a434efe9a 100644
--- a/website/README.md
+++ b/website/README.md
@@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
`node_modules` folder**, since there are some dependencies that need to be built
for the image system. Rename it before using.
-```bash
-docker run -it \
- -v $(pwd):/spacy-io/website \
- -p 8000:8000 \
- ghcr.io/explosion/spacy-io \
- gatsby develop -H 0.0.0.0
-```
-
-This will allow you to access the built website at http://0.0.0.0:8000/ in your
-browser, and still edit code in your editor while having the site reflect those
-changes.
-
-**Note**: If you're working on a Mac with an M1 processor, you might see
-segfault errors from `qemu` if you use the default image. To fix this use the
-`arm64` tagged image in the `docker run` command
-(ghcr.io/explosion/spacy-io:arm64).
-
-### Building the Docker image
-
-If you'd like to build the image locally, you can do so like this:
+First build the Docker image. This only needs to be done on the first run
+or when changes are made to `Dockerfile` or the website dependencies:
```bash
docker build -t spacy-io .
```
-This will take some time, so if you want to use the prebuilt image you'll save a
-bit of time.
+You can then build and run the website with:
+
+```bash
+docker run -it \
+ --rm \
+ -v $(pwd):/home/node/website \
+ -p 3000:3000 \
+ spacy-io \
+ npm run dev -- -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://localhost:3000/ in
+your browser, and still edit code in your editor while having the site reflect
+those changes.
## Project structure
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 9fa576fa5..678fe9be8 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -14,6 +14,7 @@ menu:
- ['train', 'train']
- ['pretrain', 'pretrain']
- ['evaluate', 'evaluate']
+ - ['benchmark', 'benchmark']
- ['apply', 'apply']
- ['find-threshold', 'find-threshold']
- ['assemble', 'assemble']
@@ -361,10 +362,10 @@ $ python -m spacy convert [input_file] [output_dir] [--converter] [--file-type]
| `--file-type`, `-t` | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. ~~str (option)~~ |
| `--n-sents`, `-n` | Number of sentences per document. Supported for: `conll`, `conllu`, `iob`, `ner` ~~int (option)~~ |
| `--seg-sents`, `-s` | Segment sentences. Supported for: `conll`, `ner` ~~bool (flag)~~ |
-| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str](option)~~ |
+| `--base`, `-b`, `--model` | Trained spaCy pipeline for sentence segmentation to use as base (for `--seg-sents`). ~~Optional[str] (option)~~ |
| `--morphology`, `-m` | Enable appending morphology to tags. Supported for: `conllu` ~~bool (flag)~~ |
| `--merge-subtokens`, `-T` | Merge CoNLL-U subtokens ~~bool (flag)~~ |
-| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path](option)~~ |
+| `--ner-map`, `-nm` | NER tag mapping (as JSON-encoded dict of entity types). Supported for: `conllu` ~~Optional[Path] (option)~~ |
| `--lang`, `-l` | Language code (if tokenizer required). ~~Optional[str] \(option)~~ |
| `--concatenate`, `-C` | Concatenate output to a single file ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
@@ -1227,8 +1228,19 @@ $ python -m spacy pretrain [config_path] [output_dir] [--code] [--resume-path] [
## evaluate {id="evaluate",version="2",tag="command"}
-Evaluate a trained pipeline. Expects a loadable spaCy pipeline (package name or
-path) and evaluation data in the
+The `evaluate` subcommand is superseded by
+[`spacy benchmark accuracy`](#benchmark-accuracy). `evaluate` is provided as an
+alias to `benchmark accuracy` for compatibility.
+
+## benchmark {id="benchmark", version="3.5"}
+
+The `spacy benchmark` CLI includes commands for benchmarking the accuracy and
+speed of your spaCy pipelines.
+
+### accuracy {id="benchmark-accuracy", version="3.5", tag="command"}
+
+Evaluate the accuracy of a trained pipeline. Expects a loadable spaCy pipeline
+(package name or path) and evaluation data in the
[binary `.spacy` format](/api/data-formats#binary-training). The
`--gold-preproc` option sets up the evaluation examples with gold-standard
sentences and tokens for the predictions. Gold preprocessing helps the
@@ -1239,7 +1251,7 @@ skew. To render a sample of dependency parses in a HTML file using the
`--displacy-path` argument.
```bash
-$ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
+$ python -m spacy benchmark accuracy [model] [data_path] [--output] [--code] [--gold-preproc] [--gpu-id] [--displacy-path] [--displacy-limit]
```
| Name | Description |
@@ -1255,6 +1267,29 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
+### speed {id="benchmark-speed", version="3.5", tag="command"}
+
+Benchmark the speed of a trained pipeline with a 95% confidence interval.
+Expects a loadable spaCy pipeline (package name or path) and benchmark data in
+the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
+warmed up before any measurements are taken.
+
+```bash
+$ python -m spacy benchmark speed [model] [data_path] [--batch-size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
+```
+
+| Name | Description |
+| -------------------- | -------------------------------------------------------------------------------------------------------- |
+| `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
+| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ |
+| `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ |
+| `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3`. ~~Optional[int] \(option)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. |
+
## apply {id="apply", version="3.5", tag="command"}
Applies a trained pipeline to data and stores the resulting annotated documents
@@ -1268,23 +1303,23 @@ input formats are:
When a directory is provided it is traversed recursively to collect all files.
-```cli
+```bash
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```
-| Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
-| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
-| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
-| `--code`, `-c`
+
```markdown
[](https://spacy.io)
@@ -575,8 +578,9 @@ project is using spaCy, you can grab one of our **spaCy badges** here:
```markdown
-[](https://spacy.io)
+[](https://spacy.io)
```
diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx
new file mode 100644
index 000000000..3ca64f8a2
--- /dev/null
+++ b/website/docs/usage/v3-5.mdx
@@ -0,0 +1,230 @@
+---
+title: What's New in v3.5
+teaser: New features and how to upgrade
+menu:
+ - ['New Features', 'features']
+ - ['Upgrading Notes', 'upgrading']
+---
+
+## New features {id="features",hidden="true"}
+
+spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
+`find-threshold`, adds fuzzy matching, provides improvements to our entity
+linking functionality, and includes a range of language updates and bug fixes.
+
+### New CLI commands {id="cli"}
+
+#### apply CLI
+
+The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
+`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
+`.spacy` file.
+
+```bash
+$ spacy apply en_core_web_sm my_texts/ output.spacy
+```
+
+#### benchmark CLI
+
+The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
+`evaluate` functionality with a wider range of profiling subcommands.
+
+The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
+`benchmark speed` CLI performs warmup rounds before measuring the speed in words
+per second on batches of randomly shuffled documents from the provided data.
+
+```bash
+$ spacy benchmark speed my_pipeline data.spacy
+```
+
+The output is the mean performance using batches (`nlp.pipe`) with a 95%
+confidence interval, e.g., profiling `en_core_web_sm` on CPU:
+
+```none
+Outliers: 2.0%, extreme outliers: 0.0%
+Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
+```
+
+#### find-threshold CLI
+
+The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
+across threshold values from `0.0` to `1.0` and identifies the best threshold
+for the provided score metric.
+
+The following command runs 20 trials for the `spancat` component in
+`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
+`[components.spancat.threshold]` from `0.0` to `1.0`:
+
+```bash
+$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
+```
+
+The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
+custom components with thresholds that are applied while predicting or scoring.
+
+### Fuzzy matching {id="fuzzy"}
+
+New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
+with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
+distance of at least 2 and up to 30% of the pattern string length.
+`FUZZY1`..`FUZZY9` can be used to specify the exact number of allowed edits.
+
+```python
+# Match lowercase with fuzzy matching (allows up to 3 edits)
+pattern = [{"LOWER": {"FUZZY": "definitely"}}]
+
+# Match custom attribute values with fuzzy matching (allows up to 3 edits)
+pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
+
+# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
+```
+
+Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
+edit distance, so a transposition like `teh` for `the` counts as two edits, one
+insertion and one deletion.
+
+If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
+custom method to the `Matcher` or as a config option for an entity ruler and
+span ruler.
+
+### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}
+
+The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
+`NOT_IN`:
+
+```python
+pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
+pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?$"]}}}]
+```
+
+### Entity linking generalization {id="el"}
+
+The knowledge base used for entity linking is now easier to customize and has a
+new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).
+
+### Additional features and improvements {id="additional-features-and-improvements"}
+
+- Language updates:
+ - Extended support for Slovenian
+ - Fixed lookup fallback for French and Catalan lemmatizers
+ - Switch Russian and Ukrainian lemmatizers to `pymorphy3`
+ - Support for editorial punctuation in Ancient Greek
+ - Update to Russian tokenizer exceptions
+ - Small fix for Dutch stop words
+- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
+- New `spacy.ConsoleLogger.v3` with expanded progress
+ [tracking](/api/top-level#ConsoleLogger).
+- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
+ `spacy.textcat_multilabel_scorer.v2`.
+- Updates so that downstream components can train properly on a frozen `tok2vec`
+ or `transformer` layer.
+- Allow interpolation of variables in directory names in projects.
+- Support for local file system [remotes](/usage/projects#remote) for projects.
+- Improve UX around `displacy.serve` when the default port is in use.
+- Optional `before_update` callback that is invoked at the start of each
+ [training step](/api/data-formats#config-training).
+- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
+ `Span` objects.
+- Patch a
+ [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
+ extracting tar files.
+- Add equality definition for `Vectors`.
+- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
+ `vectors`.
+- Correctly handle missing annotations in the edit tree lemmatizer.
+
+### Trained pipeline updates {id="pipelines"}
+
+- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
+ `morphologizer` components to improve tagging of non-whitespace vs. whitespace
+ tokens.
+- The transformer pipelines require `spacy-transformers` v1.2, which uses the
+ exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
+ alignment from `spacy-alignments`. For all trained pipelines except
+ `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
+ may be slightly different. More details about the `spacy-transformers` changes
+ in the
+ [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).
+
+## Notes about upgrading from v3.4 {id="upgrading"}
+
+### Validation of textcat values {id="textcat-validation"}
+
+An error is now raised when unsupported values are given as input to train a
+`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
+as explained in the [docs](/api/textcategorizer#assigned-attributes).
+
+### Using the default knowledge base
+
+As `KnowledgeBase` is now an abstract class, you should call the constructor of
+the new `InMemoryLookupKB` instead when you want to use spaCy's default KB
+implementation:
+
+```diff
+- kb = KnowledgeBase()
++ kb = InMemoryLookupKB()
+```
+
+If you've written a custom KB that inherits from `KnowledgeBase`, you'll need to
+implement its abstract methods, or alternatively inherit from `InMemoryLookupKB`
+instead.
+
+### Updated scorers for tokenization and textcat {id="scores"}
+
+We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
+`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
+your tokenization performance has not changed from v3.4.
+
+For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:
+
+- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
+ slightly in v3.5 even though the underlying predictions are unchanged
+- report the performance of only the **final** `textcat` or `textcat_multilabel`
+ component in the pipeline by default
+- allow custom scorers to be used to score multiple `textcat` and
+ `textcat_multilabel` components with `Scorer.score_cats` by restricting the
+ evaluation to the component's provided labels
+
+### Pipeline package version compatibility {id="version-compat"}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with an earlier version of spaCy
+v3, you will see a warning telling you that the pipeline may be incompatible.
+This doesn't necessarily have to be true, but we recommend running your
+pipelines against your test suite or evaluation data to make sure there are no
+unexpected results.
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.4.0,<3.5.0",
++ "spacy_version": ">=3.4.0,<3.6.0",
+```
+
+### Updating v3.4 configs
+
+To update a config from spaCy v3.4 with the new v3.5 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```bash
+$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug-config) and [`debug data`](/api/cli#debug-data).
diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx
index f1ff6dd3d..1d3682af4 100644
--- a/website/docs/usage/visualizers.mdx
+++ b/website/docs/usage/visualizers.mdx
@@ -437,6 +437,6 @@ Alternatively, if you're using [Streamlit](https://streamlit.io), check out the
helps you integrate spaCy visualizations into your apps. It includes a full
embedded visualizer, as well as individual components.
-
+
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 339e4085b..b5c555da6 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -13,7 +13,8 @@
{ "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" },
- { "text": "New in v3.4", "url": "/usage/v3-4" }
+ { "text": "New in v3.4", "url": "/usage/v3-4" },
+ { "text": "New in v3.5", "url": "/usage/v3-5" }
]
},
{
@@ -129,6 +130,7 @@
"items": [
{ "text": "Attributes", "url": "/api/attributes" },
{ "text": "Corpus", "url": "/api/corpus" },
+ { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
{ "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "Lookups", "url": "/api/lookups" },
{ "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
diff --git a/website/meta/site.json b/website/meta/site.json
index 5dcb89443..3d4f2d5ee 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -27,7 +27,7 @@
"indexName": "spacy"
},
"binderUrl": "explosion/spacy-io-binder",
- "binderVersion": "3.4",
+ "binderVersion": "3.5",
"sections": [
{ "id": "usage", "title": "Usage Documentation", "theme": "blue" },
{ "id": "models", "title": "Models Documentation", "theme": "blue" },
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 43a78d609..16e3bc361 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -2377,7 +2377,7 @@
"author": "Nikita Kitaev",
"author_links": {
"github": "nikitakit",
- "website": " http://kitaev.io"
+ "website": "http://kitaev.io"
},
"category": ["research", "pipeline"]
},
diff --git a/website/pages/_app.tsx b/website/pages/_app.tsx
index 8db80a672..a837d9ce8 100644
--- a/website/pages/_app.tsx
+++ b/website/pages/_app.tsx
@@ -17,7 +17,7 @@ export default function App({ Component, pageProps }: AppProps) {
diff --git a/website/pages/index.tsx b/website/pages/index.tsx
index 170bca137..fc0dba378 100644
--- a/website/pages/index.tsx
+++ b/website/pages/index.tsx
@@ -13,7 +13,7 @@ import {
LandingBanner,
} from '../src/components/landing'
import { H2 } from '../src/components/typography'
-import { InlineCode } from '../src/components/code'
+import { InlineCode } from '../src/components/inlineCode'
import { Ul, Li } from '../src/components/list'
import Button from '../src/components/button'
import Link from '../src/components/link'
@@ -89,8 +89,8 @@ const Landing = () => {
-
+
diff --git a/website/src/components/accordion.js b/website/src/components/accordion.js
index 504f415a5..9ff145bd2 100644
--- a/website/src/components/accordion.js
+++ b/website/src/components/accordion.js
@@ -33,7 +33,7 @@ export default function Accordion({ title, id, expanded = false, spaced = false,
event.stopPropagation()}
>
¶
diff --git a/website/src/components/card.js b/website/src/components/card.js
index 9eb597b7b..ef43eb866 100644
--- a/website/src/components/card.js
+++ b/website/src/components/card.js
@@ -1,6 +1,7 @@
import React from 'react'
import PropTypes from 'prop-types'
import classNames from 'classnames'
+import ImageNext from 'next/image'
import Link from './link'
import { H5 } from './typography'
@@ -10,7 +11,7 @@ export default function Card({ title, to, image, header, small, onClick, childre
return (
{image && (
)}
-
+
{children}
+
-
-)
-
-export default CodeBlock
-
-export const Pre = (props) => {
- return
-
{props.children}
-}
-
-export const InlineCode = ({ wrap = false, className, children, ...props }) => {
- const codeClassNames = classNames(classes['inline-code'], className, {
- [classes['wrap']]: wrap || (isString(children) && children.length >= WRAP_THRESHOLD),
- })
- return (
-
- {children}
-
- )
-}
-
-InlineCode.propTypes = {
- wrap: PropTypes.bool,
- className: PropTypes.string,
- children: PropTypes.node,
-}
-
-function linkType(el, showLink = true) {
- if (!isString(el) || !el.length) return el
- const elStr = el.trim()
- if (!elStr) return el
- const typeUrl = CUSTOM_TYPES[elStr]
- const url = typeUrl == true ? DEFAULT_TYPE_URL : typeUrl
- const ws = el[0] == ' '
- return url && showLink ? (
- {props.children}
+}
+
+const CodeBlock = (props) => (
+
+
+)
+export default CodeBlock
diff --git a/website/src/components/codeDynamic.js b/website/src/components/codeDynamic.js
new file mode 100644
index 000000000..8c9483567
--- /dev/null
+++ b/website/src/components/codeDynamic.js
@@ -0,0 +1,5 @@
+import dynamic from 'next/dynamic'
+
+export default dynamic(() => import('./code'), {
+ loading: () =>
+
{data.spacy_version &&
- ))}
+
+