diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index d0db75f9a..ed69f611b 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -52,17 +52,17 @@ steps:
python -W error -c "import spacy"
displayName: "Test import"
-# - script: |
-# python -m spacy download ca_core_news_sm
-# python -m spacy download ca_core_news_md
-# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
-# displayName: 'Test download CLI'
-# condition: eq(variables['python_version'], '3.8')
-#
-# - script: |
-# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
-# displayName: 'Test no warnings on load (#11713)'
-# condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -m spacy download ca_core_news_sm
+ python -m spacy download ca_core_news_md
+ python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
+ displayName: 'Test download CLI'
+ condition: eq(variables['python_version'], '3.8')
+
+ - script: |
+ python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
+ displayName: 'Test no warnings on load (#11713)'
+ condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@@ -86,17 +86,17 @@ steps:
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
-# - script: |
-# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
-# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
-# displayName: 'Test assemble CLI'
-# condition: eq(variables['python_version'], '3.8')
-#
-# - script: |
-# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
-# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
-# displayName: 'Test assemble CLI vectors warning'
-# condition: eq(variables['python_version'], '3.8')
+ - script: |
+ python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
+ PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
+ displayName: 'Test assemble CLI'
+ condition: eq(variables['python_version'], '3.8')
+
+ - script: |
+ python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
+ python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
+ displayName: 'Test assemble CLI vectors warning'
+ condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt
diff --git a/.gitignore b/.gitignore
index ac333f958..af75a4d47 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
-# Website
-website/.cache/
-website/public/
-website/node_modules
-website/.npm
-website/logs
-*.log
-npm-debug.log*
-quickstart-training-generator.js
-
# Cython / C extensions
cythonize.json
spacy/*.html
diff --git a/README.md b/README.md
index 195424551..49aa6796e 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
-💫 **Version 3.4 out now!**
+💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)
diff --git a/requirements.txt b/requirements.txt
index 5bc1c8684..1bd4518af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
-typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
+typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0
diff --git a/setup.cfg b/setup.cfg
index 79dff9e30..cddc5148c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -63,7 +63,7 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
- typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
+ typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index a3cfd96dd..ea6bba2c9 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -106,9 +106,7 @@ def serve(
if is_in_jupyter():
warnings.warn(Warnings.W011)
- render(
- docs, style=style, page=page, minify=minify, options=options, manual=manual
- )
+ render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n")
diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx
index 485e52c2f..edba523cf 100644
--- a/spacy/kb/kb_in_memory.pyx
+++ b/spacy/kb/kb_in_memory.pyx
@@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
- DOCS: https://spacy.io/api/kb_in_memory
+ DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):
diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx
index 0e8cd26da..e823ce99d 100644
--- a/spacy/matcher/levenshtein.pyx
+++ b/spacy/matcher/levenshtein.pyx
@@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
max_edits = fuzzy
else:
# allow at least two edits (to allow at least one transposition) and up
- # to 20% of the pattern string length
+ # to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits
diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi
index 77ea7b7a6..48922865b 100644
--- a/spacy/matcher/matcher.pyi
+++ b/spacy/matcher/matcher.pyi
@@ -5,8 +5,12 @@ from ..vocab import Vocab
from ..tokens import Doc, Span
class Matcher:
- def __init__(self, vocab: Vocab, validate: bool = ...,
- fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
+ def __init__(
+ self,
+ vocab: Vocab,
+ validate: bool = ...,
+ fuzzy_compare: Callable[[str, str, int], bool] = ...,
+ ) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index e83fe63ba..332badd8c 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -5,8 +5,8 @@ from itertools import islice
import numpy as np
import srsly
-from thinc.api import Config, Model, SequenceCategoricalCrossentropy
-from thinc.types import Floats2d, Ints1d, Ints2d
+from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
+from thinc.types import Floats2d, Ints2d
from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
@@ -20,6 +20,10 @@ from ..vocab import Vocab
from .. import util
+# The cutoff value of *top_k* above which an alternative method is used to process guesses.
+TOP_K_GUARDRAIL = 20
+
+
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
+ self.numpy_ops = NumpyOps()
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
+ if self.top_k == 1:
+ scores2guesses = self._scores2guesses_top_k_equals_1
+ elif self.top_k <= TOP_K_GUARDRAIL:
+ scores2guesses = self._scores2guesses_top_k_greater_1
+ else:
+ scores2guesses = self._scores2guesses_top_k_guardrail
+ # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
+ # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
+ # for its principal purpose of lemmatizing tokens. However, the code could also
+ # be used for other purposes, and with very large values of *top_k* the method
+ # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
+ # instead.
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
@@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
return guesses
scores = self.model.predict(docs)
assert len(scores) == n_docs
- guesses = self._scores2guesses(docs, scores)
+ guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return guesses
- def _scores2guesses(self, docs, scores):
+ def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
- if self.top_k == 1:
- doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
- else:
- doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+ doc_guesses = doc_scores.argmax(axis=1)
+ doc_guesses = self.numpy_ops.asarray(doc_guesses)
- if not isinstance(doc_guesses, np.ndarray):
- doc_guesses = doc_guesses.get()
+ doc_compat_guesses = []
+ for i, token in enumerate(doc):
+ tree_id = self.cfg["labels"][doc_guesses[i]]
+ if self.trees.apply(tree_id, token.text) is not None:
+ doc_compat_guesses.append(tree_id)
+ else:
+ doc_compat_guesses.append(-1)
+ guesses.append(np.array(doc_compat_guesses))
+
+ return guesses
+
+ def _scores2guesses_top_k_greater_1(self, docs, scores):
+ guesses = []
+ top_k = min(self.top_k, len(self.labels))
+ for doc, doc_scores in zip(docs, scores):
+ doc_scores = self.numpy_ops.asarray(doc_scores)
+ doc_compat_guesses = []
+ for i, token in enumerate(doc):
+ for _ in range(top_k):
+ candidate = int(doc_scores[i].argmax())
+ candidate_tree_id = self.cfg["labels"][candidate]
+ if self.trees.apply(candidate_tree_id, token.text) is not None:
+ doc_compat_guesses.append(candidate_tree_id)
+ break
+ doc_scores[i, candidate] = np.finfo(np.float32).min
+ else:
+ doc_compat_guesses.append(-1)
+ guesses.append(np.array(doc_compat_guesses))
+
+ return guesses
+
+ def _scores2guesses_top_k_guardrail(self, docs, scores):
+ guesses = []
+ for doc, doc_scores in zip(docs, scores):
+ doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
+ doc_guesses = self.numpy_ops.asarray(doc_guesses)
doc_compat_guesses = []
for token, candidates in zip(doc, doc_guesses):
diff --git a/spacy/schemas.py b/spacy/schemas.py
index 3675c12dd..140592dcd 100644
--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
- FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
- FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
- FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
- FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
- FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
- FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
- FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
- FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
- FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
+ FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy1"
+ )
+ FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy2"
+ )
+ FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy3"
+ )
+ FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy4"
+ )
+ FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy5"
+ )
+ FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy6"
+ )
+ FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy7"
+ )
+ FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy8"
+ )
+ FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
+ None, alias="fuzzy9"
+ )
class Config:
extra = "forbid"
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index c4f9b09f3..128d75680 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -101,14 +101,15 @@ def test_initialize_from_labels():
}
-def test_no_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_no_data(top_k):
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
TEXTCAT_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
nlp = English()
- nlp.add_pipe("trainable_lemmatizer")
+ nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp.add_pipe("textcat")
train_examples = []
@@ -119,10 +120,11 @@ def test_no_data():
nlp.initialize(get_examples=lambda: train_examples)
-def test_incomplete_data():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_incomplete_data(top_k):
# Test that the lemmatizer works with incomplete information
nlp = English()
- lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+ lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in PARTIAL_DATA:
@@ -154,9 +156,10 @@ def test_incomplete_data():
assert xp.count_nonzero(dX[1][1]) == 0
-def test_overfitting_IO():
+@pytest.mark.parametrize("top_k", (1, 5, 30))
+def test_overfitting_IO(top_k):
nlp = English()
- lemmatizer = nlp.add_pipe("trainable_lemmatizer")
+ lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in TRAIN_DATA:
@@ -189,7 +192,7 @@ def test_overfitting_IO():
# Check model after a {to,from}_bytes roundtrip
nlp_bytes = nlp.to_bytes()
nlp3 = English()
- nlp3.add_pipe("trainable_lemmatizer")
+ nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp3.from_bytes(nlp_bytes)
doc3 = nlp3(test_text)
assert doc3[0].lemma_ == "she"
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index c88e20de2..d00f66c60 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
-@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@@ -629,7 +628,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
-@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
diff --git a/website/.dockerignore b/website/.dockerignore
new file mode 100644
index 000000000..e4a88552e
--- /dev/null
+++ b/website/.dockerignore
@@ -0,0 +1,9 @@
+.cache/
+.next/
+public/
+node_modules
+.npm
+logs
+*.log
+npm-debug.log*
+quickstart-training-generator.js
diff --git a/website/.gitignore b/website/.gitignore
index 70ef99fa5..599c0953a 100644
--- a/website/.gitignore
+++ b/website/.gitignore
@@ -1,5 +1,7 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
+quickstart-training-generator.js
+
# dependencies
/node_modules
/.pnp
@@ -41,4 +43,4 @@ next-env.d.ts
public/robots.txt
public/sitemap*
public/sw.js*
-public/workbox*
\ No newline at end of file
+public/workbox*
diff --git a/website/Dockerfile b/website/Dockerfile
index f71733e55..9b2f6cac4 100644
--- a/website/Dockerfile
+++ b/website/Dockerfile
@@ -1,16 +1,14 @@
-FROM node:11.15.0
+FROM node:18
-WORKDIR /spacy-io
-
-RUN npm install -g gatsby-cli@2.7.4
-
-COPY package.json .
-COPY package-lock.json .
-
-RUN npm install
+USER node
# This is so the installed node_modules will be up one directory
# from where a user mounts files, so that they don't accidentally mount
# their own node_modules from a different build
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
-WORKDIR /spacy-io/website/
+WORKDIR /home/node
+COPY --chown=node package.json .
+COPY --chown=node package-lock.json .
+RUN npm install
+
+WORKDIR /home/node/website/
diff --git a/website/README.md b/website/README.md
index e9d7aec26..a434efe9a 100644
--- a/website/README.md
+++ b/website/README.md
@@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
`node_modules` folder**, since there are some dependencies that need to be built
for the image system. Rename it before using.
-```bash
-docker run -it \
- -v $(pwd):/spacy-io/website \
- -p 8000:8000 \
- ghcr.io/explosion/spacy-io \
- gatsby develop -H 0.0.0.0
-```
-
-This will allow you to access the built website at http://0.0.0.0:8000/ in your
-browser, and still edit code in your editor while having the site reflect those
-changes.
-
-**Note**: If you're working on a Mac with an M1 processor, you might see
-segfault errors from `qemu` if you use the default image. To fix this use the
-`arm64` tagged image in the `docker run` command
-(ghcr.io/explosion/spacy-io:arm64).
-
-### Building the Docker image
-
-If you'd like to build the image locally, you can do so like this:
+First build the Docker image. This only needs to be done on the first run
+or when changes are made to `Dockerfile` or the website dependencies:
```bash
docker build -t spacy-io .
```
-This will take some time, so if you want to use the prebuilt image you'll save a
-bit of time.
+You can then build and run the website with:
+
+```bash
+docker run -it \
+ --rm \
+ -v $(pwd):/home/node/website \
+ -p 3000:3000 \
+ spacy-io \
+ npm run dev -- -H 0.0.0.0
+```
+
+This will allow you to access the built website at http://0.0.0.0:3000/ in your
+browser, and still edit code in your editor while having the site reflect those
+changes.
## Project structure
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 5f10c8bc0..ca4023101 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```
-| Name | Description |
-| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
-| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
-| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
-| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
-| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
-| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
-| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
-| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
-| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
-| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
-| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
+| Name | Description |
+| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
+| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
## find-threshold {id="find-threshold",version="3.5",tag="command"}
diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx
index 5c30d252e..bafb2f2da 100644
--- a/website/docs/api/entitylinker.mdx
+++ b/website/docs/api/entitylinker.mdx
@@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local
context of the mention. `EntityLinker` defaults to using the
-[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
+[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.
## Assigned Attributes {id="assigned-attributes"}
diff --git a/website/docs/api/kb_in_memory.mdx b/website/docs/api/inmemorylookupkb.mdx
similarity index 96%
rename from website/docs/api/kb_in_memory.mdx
rename to website/docs/api/inmemorylookupkb.mdx
index e85b63c45..c24fe78d6 100644
--- a/website/docs/api/kb_in_memory.mdx
+++ b/website/docs/api/inmemorylookupkb.mdx
@@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.
Add an entity to the knowledge base, specifying its corpus frequency and entity
vector, which should be of length
-[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
+[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).
> #### Example
>
@@ -79,8 +79,9 @@ frequency and entity vector for each entity.
Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer
-to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
-or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
+to entities previously added with
+[`add_entity`](/api/inmemorylookupkb#add_entity) or
+[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
probabilities should not exceed 1. Note that an empty string cannot be used as
an alias.
@@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). Wraps
-[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
> #### Example
>
@@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
-Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
+Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
will call `get_candidates_batch()` instead of `get_candidates()`, if the config
parameter `candidates_batch_size` is greater than or equal to 1.
@@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.
## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}
-Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
+Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
number of entity IDs.
The default implementation of `get_vectors()` executes `get_vector()` in a loop.
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx
index 887b7fe97..2b0d4d9d6 100644
--- a/website/docs/api/kb.mdx
+++ b/website/docs/api/kb.mdx
@@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
-implementation up to that point is available as `InMemoryLookupKB` from 3.5
-onwards.
+implementation up to that point is available as
+[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.
@@ -110,14 +110,15 @@ to you.
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
-[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
-more flexibility in customizing knowledge bases. Some of its methods were moved
-to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
-being `get_alias_candidates()`. This method is now available as
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
-Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
+[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
+allow more flexibility in customizing knowledge bases. Some of its methods were
+moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
+one of those being `get_alias_candidates()`. This method is now available as
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
+Note:
+[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to
-[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
+[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"}
diff --git a/website/docs/usage/101/_architecture.mdx b/website/docs/usage/101/_architecture.mdx
index 5727c6921..2a63a3741 100644
--- a/website/docs/usage/101/_architecture.mdx
+++ b/website/docs/usage/101/_architecture.mdx
@@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
-| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
+| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. |
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx
index 8c9de0d79..08d2b3b91 100644
--- a/website/docs/usage/rule-based-matching.mdx
+++ b/website/docs/usage/rule-based-matching.mdx
@@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
allowed edit distance directly.
```python
-# Match lowercase with fuzzy matching (allows 2 edits)
+# Match lowercase with fuzzy matching (allows 3 edits)
pattern = [{"LOWER": {"FUZZY": "definitely"}}]
-# Match custom attribute values with fuzzy matching (allows 2 edits)
+# Match custom attribute values with fuzzy matching (allows 3 edits)
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
-# Match with exact Levenshtein edit distance limits (allows 3 edits)
-pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
+# Match with exact Levenshtein edit distance limits (allows 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
```
#### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}
diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx
index 87735fed1..e0daebe35 100644
--- a/website/docs/usage/saving-loading.mdx
+++ b/website/docs/usage/saving-loading.mdx
@@ -684,10 +684,15 @@ If your pipeline includes
[custom components](/usage/processing-pipelines#custom-components), model
architectures or other [code](/usage/training#custom-code), those functions need
to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
-how to create the objects referenced in the config. The
-[`spacy package`](/api/cli#package) command lets you provide one or more paths
-to Python files containing custom registered functions using the `--code`
-argument.
+how to create the objects referenced in the config. If you're loading your own
+pipeline in Python, you can make custom components available just by importing
+the code that defines them before calling
+[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
+argument to CLI commands works.
+
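+As a minimal sketch (the module and pipeline names here are hypothetical):
+
+```python
+import my_custom_components  # noqa: F401 - importing registers the custom factories
+import spacy
+
+nlp = spacy.load("./my_custom_pipeline")
+```
+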
+With the [`spacy package`](/api/cli#package) command, you can provide one or
+more paths to Python files containing custom registered functions using the
+`--code` argument.
> #### \_\_init\_\_.py (excerpt)
>
diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx
new file mode 100644
index 000000000..ac61338e3
--- /dev/null
+++ b/website/docs/usage/v3-5.mdx
@@ -0,0 +1,215 @@
+---
+title: What's New in v3.5
+teaser: New features and how to upgrade
+menu:
+ - ['New Features', 'features']
+ - ['Upgrading Notes', 'upgrading']
+---
+
+## New features {id="features",hidden="true"}
+
+spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
+`find-threshold`, adds fuzzy matching, provides improvements to our entity
+linking functionality, and includes a range of language updates and bug fixes.
+
+### New CLI commands {id="cli"}
+
+#### apply CLI
+
+The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
+`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
+`.spacy` file.
+
+```bash
+$ spacy apply en_core_web_sm my_texts/ output.spacy
+```
+
+#### benchmark CLI
+
+The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
+`evaluate` functionality with a wider range of profiling subcommands.
+
+The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
+`benchmark speed` CLI performs warmup rounds before measuring the speed in words
+per second on batches of randomly shuffled documents from the provided data.
+
+```bash
+$ spacy benchmark speed my_pipeline data.spacy
+```
+
+The output is the mean performance using batches (`nlp.pipe`) with a 95%
+confidence interval, e.g., profiling `en_core_web_sm` on CPU:
+
+```none
+Outliers: 2.0%, extreme outliers: 0.0%
+Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
+```
+
+#### find-threshold CLI
+
+The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
+across threshold values from `0.0` to `1.0` and identifies the best threshold
+for the provided score metric.
+
+The following command runs 20 trials for the `spancat` component in
+`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
+`[components.spancat.threshold]` from `0.0` to `1.0`:
+
+```bash
+$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
+```
+
+The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
+custom components with thresholds that are applied while predicting or scoring.
+
+### Fuzzy matching {id="fuzzy"}
+
+New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
+with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
+distance of at least 2 and up to 30% of the pattern string length.
+`FUZZY1`..`FUZZY9` can be used to specify the exact number of allowed edits.
+
+```python
+# Match lowercase with fuzzy matching (allows up to 3 edits)
+pattern = [{"LOWER": {"FUZZY": "definitely"}}]
+
+# Match custom attribute values with fuzzy matching (allows up to 3 edits)
+pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
+
+# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
+pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
+```
+
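+The default budget follows the rule in `spacy/matcher/levenshtein.pyx`, shown
+here as a sketch:
+
+```python
+# Allow at least two edits (to allow at least one transposition) and up to
+# 30% of the pattern string length.
+max_edits = max(2, round(0.3 * len("definitely")))  # 10 characters -> 3 edits
+```
+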
+Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
+edit distance, so a transposition like `teh` for `the` counts as two edits, one
+insertion and one deletion.
+
+If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
+custom comparison method to the `Matcher` directly, or as a config option for
+the entity ruler and span ruler.
+
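+As a minimal sketch, a custom comparison function can be passed to the
+`Matcher` via its `fuzzy_compare` argument (the case-insensitive comparison
+here is purely illustrative):
+
+```python
+import spacy
+from spacy.matcher import Matcher
+
+def case_insensitive_compare(input_text: str, pattern_text: str, fuzzy: int) -> bool:
+    # Ignore the edit budget and treat strings as equal if they match ignoring case.
+    return input_text.lower() == pattern_text.lower()
+
+nlp = spacy.blank("en")
+matcher = Matcher(nlp.vocab, fuzzy_compare=case_insensitive_compare)
+matcher.add("DEFINITELY", [[{"TEXT": {"FUZZY": "definitely"}}]])
+doc = nlp("I will DEFINITELY be there.")
+print([doc[start:end].text for _, start, end in matcher(doc)])
+```
+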
+### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}
+
+The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
+`NOT_IN`:
+
+```python
+pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
+pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
+```
+
+### Entity linking generalization {id="el"}
+
+The knowledge base used for entity linking is now easier to customize and has a
+new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).
+
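+As a minimal sketch of the lookup-style API (the entity ID, frequency, vector
+and prior probability values here are invented for illustration):
+
+```python
+from spacy.kb import InMemoryLookupKB
+from spacy.vocab import Vocab
+
+kb = InMemoryLookupKB(vocab=Vocab(), entity_vector_length=3)
+kb.add_entity(entity="Q42", freq=12, entity_vector=[1.0, 2.0, 3.0])
+kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[0.9])
+print([c.entity_ for c in kb.get_alias_candidates("Douglas Adams")])
+```
+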
+### Additional features and improvements {id="additional-features-and-improvements"}
+
+- Language updates:
+ - Extended support for Slovenian
+ - Fixed lookup fallback for French and Catalan lemmatizers
+ - Switch Russian and Ukrainian lemmatizers to `pymorphy3`
+ - Support for editorial punctuation in Ancient Greek
+ - Update to Russian tokenizer exceptions
+ - Small fix for Dutch stop words
+- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
+- New `spacy.ConsoleLogger.v3` with expanded progress
+ [tracking](/api/top-level#ConsoleLogger).
+- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
+ `spacy.textcat_multilabel_scorer.v2`.
+- Updates so that downstream components can train properly on a frozen `tok2vec`
+ or `transformer` layer.
+- Allow interpolation of variables in directory names in projects.
+- Support for local file system [remotes](/usage/projects#remote) for projects.
+- Improve UX around `displacy.serve` when the default port is in use.
+- Optional `before_update` callback that is invoked at the start of each
+ [training step](/api/data-formats#config-training).
+- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
+ `Span` objects.
+- Patch a
+ [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
+ extracting tar files.
+- Add equality definition for `Vectors`.
+- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
+ `vectors`.
+- Correctly handle missing annotations in the edit tree lemmatizer.
+
+### Trained pipeline updates {id="pipelines"}
+
+- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
+ `morphologizer` components to improve tagging of non-whitespace vs. whitespace
+ tokens.
+- The transformer pipelines require `spacy-transformers` v1.2, which uses the
+ exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
+ alignment from `spacy-alignments`. For all trained pipelines except
+ `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
+ may be slightly different. More details about the `spacy-transformers` changes
+ can be found in the
+ [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).
+
+## Notes about upgrading from v3.4 {id="upgrading"}
+
+### Validation of textcat values {id="textcat-validation"}
+
+An error is now raised when unsupported values are given as input to train a
+`textcat` or `textcat_multilabel` model. Ensure that values are `0.0` or `1.0`,
+as explained in the [docs](/api/textcategorizer#assigned-attributes).
+
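+For example, annotations in this format remain valid, while intermediate values
+such as `0.5` now raise an error (the labels are illustrative):
+
+```python
+train_data = [
+    ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
+    ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
+]
+```
+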
+### Updated scorers for tokenization and textcat {id="scores"}
+
+We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
+`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
+your tokenization performance has not changed from v3.4.
+
+For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:
+
+- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
+ slightly in v3.5 even though the underlying predictions are unchanged
+- report the performance of only the **final** `textcat` or `textcat_multilabel`
+ component in the pipeline by default
+- allow custom scorers to be used to score multiple `textcat` and
+ `textcat_multilabel` components with `Scorer.score_cats` by restricting the
+ evaluation to the component's provided labels
+
+### Pipeline package version compatibility {id="version-compat"}
+
+> #### Using legacy implementations
+>
+> In spaCy v3, you'll still be able to load and reference legacy implementations
+> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
+> components or architectures change and newer versions are available in the
+> core library.
+
+When you're loading a pipeline package trained with an earlier version of spaCy
+v3, you will see a warning telling you that the pipeline may be incompatible.
+This doesn't necessarily have to be true, but we recommend running your
+pipelines against your test suite or evaluation data to make sure there are no
+unexpected results.
+
+If you're using one of the [trained pipelines](/models) we provide, you should
+run [`spacy download`](/api/cli#download) to update to the latest version. To
+see an overview of all installed packages and their compatibility, you can run
+[`spacy validate`](/api/cli#validate).
+
+If you've trained your own custom pipeline and you've confirmed that it's still
+working as expected, you can update the spaCy version requirements in the
+[`meta.json`](/api/data-formats#meta):
+
+```diff
+- "spacy_version": ">=3.4.0,<3.5.0",
++ "spacy_version": ">=3.4.0,<3.6.0",
+```
+
+### Updating v3.4 configs
+
+To update a config from spaCy v3.4 with the new v3.5 settings, run
+[`init fill-config`](/api/cli#init-fill-config):
+
+```cli
+$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
+```
+
+In many cases ([`spacy train`](/api/cli#train),
+[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
+automatically, but you'll need to fill in the new settings to run
+[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 339e4085b..b5c555da6 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -13,7 +13,8 @@
{ "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" },
- { "text": "New in v3.4", "url": "/usage/v3-4" }
+ { "text": "New in v3.4", "url": "/usage/v3-4" },
+ { "text": "New in v3.5", "url": "/usage/v3-5" }
]
},
{
@@ -129,6 +130,7 @@
"items": [
{ "text": "Attributes", "url": "/api/attributes" },
{ "text": "Corpus", "url": "/api/corpus" },
+ { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
{ "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "Lookups", "url": "/api/lookups" },
{ "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },
diff --git a/website/pages/index.tsx b/website/pages/index.tsx
index 170bca137..4c0932926 100644
--- a/website/pages/index.tsx
+++ b/website/pages/index.tsx
@@ -89,7 +89,7 @@ const Landing = () => {
- In the five years since its release, spaCy has become an industry standard with
+ Since its release in 2015, spaCy has become an industry standard with
a huge ecosystem. Choose from a variety of plugins, integrate with your machine
learning stack and build custom components and workflows.
diff --git a/website/src/components/seo.js b/website/src/components/seo.js
index 5d12ffa04..d338c43f3 100644
--- a/website/src/components/seo.js
+++ b/website/src/components/seo.js
@@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
import siteMetadata from '../../meta/site.json'
import Head from 'next/head'
+import { siteUrl } from '../../meta/dynamicMeta.mjs'
+
function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
if (sectionTitle && title) {
const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''
@@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
if (legacy) return socialImageLegacy
if (section === 'api') return socialImageApi
if (section === 'universe') return socialImageUniverse
- return socialImageDefault
+ return `${siteUrl}${socialImageDefault.src}`
}
export default function SEO({
@@ -46,7 +48,7 @@ export default function SEO({
nightly,
legacy
)
- const socialImage = getImage(section, nightly, legacy).src
+ const socialImage = getImage(section, nightly, legacy)
const meta = [
{
name: 'description',
diff --git a/website/src/styles/list.module.sass b/website/src/styles/list.module.sass
index 1a352d9dd..2fb9ab8ef 100644
--- a/website/src/styles/list.module.sass
+++ b/website/src/styles/list.module.sass
@@ -20,6 +20,10 @@
display: inline-block
margin-bottom: var(--spacing-sm)
+ .ol, .ul
+ margin-top: var(--spacing-xs)
+ margin-bottom: var(--spacing-xs)
+
&:before
content: '\25CF'
position: relative
diff --git a/website/src/templates/index.js b/website/src/templates/index.js
index aa7595ddc..80b5a24d4 100644
--- a/website/src/templates/index.js
+++ b/website/src/templates/index.js
@@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
-
- 💥 Out now: spaCy v3.4
+
+ 💥 Out now: spaCy v3.5
)