Merge branch 'master' into feature/etl

This commit is contained in:
richard@explosion.ai 2023-02-01 17:03:29 +01:00
commit 5a08596f92
30 changed files with 440 additions and 147 deletions

View File

@ -52,17 +52,17 @@ steps:
python -W error -c "import spacy"
displayName: "Test import"
# - script: |
# python -m spacy download ca_core_news_sm
# python -m spacy download ca_core_news_md
# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
# displayName: 'Test download CLI'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
# displayName: 'Test no warnings on load (#11713)'
# condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy download ca_core_news_sm
python -m spacy download ca_core_news_md
python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')"
displayName: 'Test download CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')"
displayName: 'Test no warnings on load (#11713)'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json .
@ -86,17 +86,17 @@ steps:
displayName: 'Test train CLI'
condition: eq(variables['python_version'], '3.8')
# - script: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
# displayName: 'Test assemble CLI'
# condition: eq(variables['python_version'], '3.8')
#
# - script: |
# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
# displayName: 'Test assemble CLI vectors warning'
# condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')"
PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir
displayName: 'Test assemble CLI'
condition: eq(variables['python_version'], '3.8')
- script: |
python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')"
python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113
displayName: 'Test assemble CLI vectors warning'
condition: eq(variables['python_version'], '3.8')
- script: |
python -m pip install -U -r requirements.txt

10
.gitignore vendored
View File

@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg
spacy/tests/package/pyproject.toml
spacy/tests/package/requirements.txt
# Website
website/.cache/
website/public/
website/node_modules
website/.npm
website/logs
*.log
npm-debug.log*
quickstart-training-generator.js
# Cython / C extensions
cythonize.json
spacy/*.html

View File

@ -16,7 +16,7 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.4 out now!**
💫 **Version 3.5 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
[![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8)

View File

@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
# Development dependencies
pre-commit>=2.13.0
cython>=0.25,<3.0

View File

@ -63,7 +63,7 @@ install_requires =
# Official Python utilities
setuptools
packaging>=20.0
typing_extensions>=3.7.4,<4.2.0; python_version < "3.8"
typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8"
langcodes>=3.2.0,<4.0.0
[options.entry_points]

View File

@ -106,9 +106,7 @@ def serve(
if is_in_jupyter():
warnings.warn(Warnings.W011)
render(
docs, style=style, page=page, minify=minify, options=options, manual=manual
)
render(docs, style=style, page=page, minify=minify, options=options, manual=manual)
httpd = simple_server.make_server(host, port, app)
print(f"\nUsing the '{style}' visualizer")
print(f"Serving on http://{host}:{port} ...\n")

View File

@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase):
"""An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases,
to support entity linking of named entities to real-world concepts.
DOCS: https://spacy.io/api/kb_in_memory
DOCS: https://spacy.io/api/inmemorylookupkb
"""
def __init__(self, Vocab vocab, entity_vector_length):

View File

@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int =
max_edits = fuzzy
else:
# allow at least two edits (to allow at least one transposition) and up
# to 20% of the pattern string length
# to 30% of the pattern string length
max_edits = max(2, round(0.3 * len(pattern_text)))
return levenshtein(input_text, pattern_text, max_edits) <= max_edits

View File

@ -5,8 +5,12 @@ from ..vocab import Vocab
from ..tokens import Doc, Span
class Matcher:
def __init__(self, vocab: Vocab, validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ...
def __init__(
self,
vocab: Vocab,
validate: bool = ...,
fuzzy_compare: Callable[[str, str, int], bool] = ...,
) -> None: ...
def __reduce__(self) -> Any: ...
def __len__(self) -> int: ...
def __contains__(self, key: str) -> bool: ...

View File

@ -5,8 +5,8 @@ from itertools import islice
import numpy as np
import srsly
from thinc.api import Config, Model, SequenceCategoricalCrossentropy
from thinc.types import Floats2d, Ints1d, Ints2d
from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps
from thinc.types import Floats2d, Ints2d
from ._edit_tree_internals.edit_trees import EditTrees
from ._edit_tree_internals.schemas import validate_edit_tree
@ -20,6 +20,10 @@ from ..vocab import Vocab
from .. import util
# The cutoff value of *top_k* above which an alternative method is used to process guesses.
TOP_K_GUARDRAIL = 20
default_model_config = """
[model]
@architectures = "spacy.Tagger.v2"
@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe):
self.cfg: Dict[str, Any] = {"labels": []}
self.scorer = scorer
self.numpy_ops = NumpyOps()
def get_loss(
self, examples: Iterable[Example], scores: List[Floats2d]
@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe):
return float(loss), d_scores
def predict(self, docs: Iterable[Doc]) -> List[Ints2d]:
if self.top_k == 1:
scores2guesses = self._scores2guesses_top_k_equals_1
elif self.top_k <= TOP_K_GUARDRAIL:
scores2guesses = self._scores2guesses_top_k_greater_1
else:
scores2guesses = self._scores2guesses_top_k_guardrail
# The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values
# of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used
# for its principal purpose of lemmatizing tokens. However, the code could also
# be used for other purposes, and with very large values of *top_k* the method
# becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used
# instead.
n_docs = len(list(docs))
if not any(len(doc) for doc in docs):
# Handle cases where there are no tokens in any docs.
@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe):
return guesses
scores = self.model.predict(docs)
assert len(scores) == n_docs
guesses = self._scores2guesses(docs, scores)
guesses = scores2guesses(docs, scores)
assert len(guesses) == n_docs
return guesses
def _scores2guesses(self, docs, scores):
def _scores2guesses_top_k_equals_1(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
if self.top_k == 1:
doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1)
else:
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = doc_scores.argmax(axis=1)
doc_guesses = self.numpy_ops.asarray(doc_guesses)
if not isinstance(doc_guesses, np.ndarray):
doc_guesses = doc_guesses.get()
doc_compat_guesses = []
for i, token in enumerate(doc):
tree_id = self.cfg["labels"][doc_guesses[i]]
if self.trees.apply(tree_id, token.text) is not None:
doc_compat_guesses.append(tree_id)
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))
return guesses
def _scores2guesses_top_k_greater_1(self, docs, scores):
guesses = []
top_k = min(self.top_k, len(self.labels))
for doc, doc_scores in zip(docs, scores):
doc_scores = self.numpy_ops.asarray(doc_scores)
doc_compat_guesses = []
for i, token in enumerate(doc):
for _ in range(top_k):
candidate = int(doc_scores[i].argmax())
candidate_tree_id = self.cfg["labels"][candidate]
if self.trees.apply(candidate_tree_id, token.text) is not None:
doc_compat_guesses.append(candidate_tree_id)
break
doc_scores[i, candidate] = np.finfo(np.float32).min
else:
doc_compat_guesses.append(-1)
guesses.append(np.array(doc_compat_guesses))
return guesses
def _scores2guesses_top_k_guardrail(self, docs, scores):
guesses = []
for doc, doc_scores in zip(docs, scores):
doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1]
doc_guesses = self.numpy_ops.asarray(doc_guesses)
doc_compat_guesses = []
for token, candidates in zip(doc, doc_guesses):

View File

@ -163,15 +163,33 @@ class TokenPatternString(BaseModel):
IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset")
INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects")
FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1")
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2")
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3")
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4")
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5")
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6")
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7")
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8")
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9")
FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy1"
)
FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy2"
)
FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy3"
)
FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy4"
)
FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy5"
)
FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy6"
)
FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy7"
)
FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy8"
)
FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(
None, alias="fuzzy9"
)
class Config:
extra = "forbid"

View File

@ -101,14 +101,15 @@ def test_initialize_from_labels():
}
def test_no_data():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_no_data(top_k):
# Test that the lemmatizer provides a nice error when there's no tagging data / labels
TEXTCAT_DATA = [
("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
nlp = English()
nlp.add_pipe("trainable_lemmatizer")
nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp.add_pipe("textcat")
train_examples = []
@ -119,10 +120,11 @@ def test_no_data():
nlp.initialize(get_examples=lambda: train_examples)
def test_incomplete_data():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_incomplete_data(top_k):
# Test that the lemmatizer works with incomplete information
nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in PARTIAL_DATA:
@ -154,9 +156,10 @@ def test_incomplete_data():
assert xp.count_nonzero(dX[1][1]) == 0
def test_overfitting_IO():
@pytest.mark.parametrize("top_k", (1, 5, 30))
def test_overfitting_IO(top_k):
nlp = English()
lemmatizer = nlp.add_pipe("trainable_lemmatizer")
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
lemmatizer.min_tree_freq = 1
train_examples = []
for t in TRAIN_DATA:
@ -189,7 +192,7 @@ def test_overfitting_IO():
# Check model after a {to,from}_bytes roundtrip
nlp_bytes = nlp.to_bytes()
nlp3 = English()
nlp3.add_pipe("trainable_lemmatizer")
nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k})
nlp3.from_bytes(nlp_bytes)
doc3 = nlp3(test_text)
assert doc3[0].lemma_ == "she"

View File

@ -618,7 +618,6 @@ def test_string_to_list_intify(value):
assert string_to_list(value, intify=True) == [1, 2, 3]
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_download_compatibility():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False
@ -629,7 +628,6 @@ def test_download_compatibility():
assert get_minor_version(about.__version__) == get_minor_version(version)
@pytest.mark.skip(reason="Temporarily skip for dev version")
def test_validate_compatibility_table():
spec = SpecifierSet("==" + about.__version__)
spec.prereleases = False

9
website/.dockerignore Normal file
View File

@ -0,0 +1,9 @@
.cache/
.next/
public/
node_modules
.npm
logs
*.log
npm-debug.log*
quickstart-training-generator.js

2
website/.gitignore vendored
View File

@ -1,5 +1,7 @@
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
quickstart-training-generator.js
# dependencies
/node_modules
/.pnp

View File

@ -1,16 +1,14 @@
FROM node:11.15.0
FROM node:18
WORKDIR /spacy-io
RUN npm install -g gatsby-cli@2.7.4
COPY package.json .
COPY package-lock.json .
RUN npm install
USER node
# This is so the installed node_modules will be up one directory
# from where a user mounts files, so that they don't accidentally mount
# their own node_modules from a different build
# https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders
WORKDIR /spacy-io/website/
WORKDIR /home/node
COPY --chown=node package.json .
COPY --chown=node package-lock.json .
RUN npm install
WORKDIR /home/node/website/

View File

@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local
`node_modules` folder**, since there are some dependencies that need to be built
for the image system. Rename it before using.
```bash
docker run -it \
-v $(pwd):/spacy-io/website \
-p 8000:8000 \
ghcr.io/explosion/spacy-io \
gatsby develop -H 0.0.0.0
```
This will allow you to access the built website at http://0.0.0.0:8000/ in your
browser, and still edit code in your editor while having the site reflect those
changes.
**Note**: If you're working on a Mac with an M1 processor, you might see
segfault errors from `qemu` if you use the default image. To fix this use the
`arm64` tagged image in the `docker run` command
(ghcr.io/explosion/spacy-io:arm64).
### Building the Docker image
If you'd like to build the image locally, you can do so like this:
First build the Docker image. This only needs to be done on the first run
or when changes are made to `Dockerfile` or the website dependencies:
```bash
docker build -t spacy-io .
```
This will take some time, so if you want to use the prebuilt image you'll save a
bit of time.
You can then build and run the website with:
```bash
docker run -it \
--rm \
-v $(pwd):/home/node/website \
-p 3000:3000 \
spacy-io \
npm run dev -- -H 0.0.0.0
```
This will allow you to access the built website at http://0.0.0.0:3000/ in your
browser, and still edit code in your editor while having the site reflect those
changes.
## Project structure

View File

@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files.
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
```
| Name | Description |
| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` <Tag variant="new">3</Tag> | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
| Name | Description |
| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
## find-threshold {id="find-threshold",version="3.5",tag="command"}

View File

@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate
plausible candidates from that `KnowledgeBase` given a certain textual mention,
and a machine learning model to pick the right candidate, given the local
context of the mention. `EntityLinker` defaults to using the
[`InMemoryLookupKB`](/api/kb_in_memory) implementation.
[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation.
## Assigned Attributes {id="assigned-attributes"}

View File

@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base.
Add an entity to the knowledge base, specifying its corpus frequency and entity
vector, which should be of length
[`entity_vector_length`](/api/kb_in_memory#entity_vector_length).
[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length).
> #### Example
>
@ -79,8 +79,9 @@ frequency and entity vector for each entity.
Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity)
or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior
to entities previously added with
[`add_entity`](/api/inmemorylookupkb#add_entity) or
[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior
probabilities should not exceed 1. Note that an empty string can not be used as
alias.
@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base.
Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb#candidate). Wraps
[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
> #### Example
>
@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps
## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"}
Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an
Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an
arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component
will call `get_candidates_batch()` instead of `get_candidates()`, if the config
parameter `candidates_batch_size` is greater or equal than 1.
@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector.
## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"}
Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary
Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary
number of entity IDs.
The default implementation of `get_vectors()` executes `get_vector()` in a loop.

View File

@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component.
<Infobox variant="warning">
This class was not abstract up to spaCy version 3.5. The `KnowledgeBase`
implementation up to that point is available as `InMemoryLookupKB` from 3.5
onwards.
implementation up to that point is available as
[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards.
</Infobox>
@ -110,14 +110,15 @@ to you.
</Infobox>
From spaCy 3.5 on `KnowledgeBase` is an abstract class (with
[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow
more flexibility in customizing knowledge bases. Some of its methods were moved
to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those
being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates)
[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to
allow more flexibility in customizing knowledge bases. Some of its methods were
moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring,
one of those being `get_alias_candidates()`. This method is now available as
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
Note:
[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates)
defaults to
[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates).
[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates).
## KnowledgeBase.get_vector {id="get_vector",tag="method"}

View File

@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
| ------------------------------------------------ | -------------------------------------------------------------------------------------------------- |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
| [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. |
| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. |
| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. |
| [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |

View File

@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum
allowed edit distance directly.
```python
# Match lowercase with fuzzy matching (allows 2 edits)
# Match lowercase with fuzzy matching (allows 3 edits)
pattern = [{"LOWER": {"FUZZY": "definitely"}}]
# Match custom attribute values with fuzzy matching (allows 2 edits)
# Match custom attribute values with fuzzy matching (allows 3 edits)
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
# Match with exact Levenshtein edit distance limits (allows 3 edits)
pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}]
# Match with exact Levenshtein edit distance limits (allows 4 edits)
pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
```
#### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"}

View File

@ -684,10 +684,15 @@ If your pipeline includes
[custom components](/usage/processing-pipelines#custom-components), model
architectures or other [code](/usage/training#custom-code), those functions need
to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
how to create the objects referenced in the config. The
[`spacy package`](/api/cli#package) command lets you provide one or more paths
to Python files containing custom registered functions using the `--code`
argument.
how to create the objects referenced in the config. If you're loading your own
pipeline in Python, you can make custom components available just by importing
the code that defines them before calling
[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code`
argument to CLI commands works.
With the [`spacy package`](/api/cli#package) command, you can provide one or
more paths to Python files containing custom registered functions using the
`--code` argument.
> #### \_\_init\_\_.py (excerpt)
>

215
website/docs/usage/v3-5.mdx Normal file
View File

@ -0,0 +1,215 @@
---
title: What's New in v3.5
teaser: New features and how to upgrade
menu:
- ['New Features', 'features']
- ['Upgrading Notes', 'upgrading']
---
## New features {id="features",hidden="true"}
spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and
`find-threshold`, adds fuzzy matching, provides improvements to our entity
linking functionality, and includes a range of language updates and bug fixes.
### New CLI commands {id="cli"}
#### apply CLI
The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more
`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single
`.spacy` file.
```bash
$ spacy apply en_core_web_sm my_texts/ output.spacy
```
#### benchmark CLI
The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing
`evaluate` functionality with a wider range of profiling subcommands.
The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new
`benchmark speed` CLI performs warmup rounds before measuring the speed in words
per second on batches of randomly shuffled documents from the provided data.
```bash
$ spacy benchmark speed my_pipeline data.spacy
```
The output is the mean performance using batches (`nlp.pipe`) with a 95%
confidence interval, e.g., profiling `en_core_web_sm` on CPU:
```none
Outliers: 2.0%, extreme outliers: 0.0%
Mean: 18904.1 words/s (95% CI: -256.9 +244.1)
```
#### find-threshold CLI
The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials
across threshold values from `0.0` to `1.0` and identifies the best threshold
for the provided score metric.
The following command runs 20 trials for the `spancat` component in
`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold
`[components.spancat.threshold]` from `0.0` to `1.0`:
```bash
$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20
```
The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and
custom components with thresholds that are applied while predicting or scoring.
### Fuzzy matching {id="fuzzy"}
New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy)
with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit
distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can
be used to specify the exact number of allowed edits.
```python
# Match lowercase with fuzzy matching (allows up to 3 edits)
pattern = [{"LOWER": {"FUZZY": "definitely"}}]
# Match custom attribute values with fuzzy matching (allows up to 3 edits)
pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}]
# Match with exact Levenshtein edit distance limits (allows up to 4 edits)
pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}]
```
Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein
edit distance, so a transposition like `teh` for `the` counts as two edits, one
insertion and one deletion.
If you'd prefer an alternate fuzzy matching algorithm, you can provide your own
custom method to the `Matcher` or as a config option for an entity ruler and
span ruler.
### FUZZY and REGEX with lists {id="fuzzy-regex-lists"}
The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and
`NOT_IN`:
```python
pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}]
pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}]
```
### Entity linking generalization {id="el"}
The knowledge base used for entity linking is now easier to customize and has a
new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb).
### Additional features and improvements {id="additional-features-and-improvements"}
- Language updates:
- Extended support for Slovenian
- Fixed lookup fallback for French and Catalan lemmatizers
- Switch Russian and Ukrainian lemmatizers to `pymorphy3`
- Support for editorial punctuation in Ancient Greek
- Update to Russian tokenizer exceptions
- Small fix for Dutch stop words
- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x.
- New `spacy.ConsoleLogger.v3` with expanded progress
[tracking](/api/top-level#ConsoleLogger).
- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and
`spacy.textcat_multilabel_scorer.v2`.
- Updates so that downstream components can train properly on a frozen `tok2vec`
or `transformer` layer.
- Allow interpolation of variables in directory names in projects.
- Support for local file system [remotes](/usage/projects#remote) for projects.
- Improve UX around `displacy.serve` when the default port is in use.
- Optional `before_update` callback that is invoked at the start of each
[training step](/api/data-formats#config-training).
- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and
`Span` objects.
- Patch a
[security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in
extracting tar files.
- Add equality definition for `Vectors`.
- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and
`vectors`.
- Correctly handle missing annotations in the edit tree lemmatizer.
### Trained pipeline updates {id="pipelines"}
- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and
`morphologizer` components to improve tagging of non-whitespace vs. whitespace
tokens.
- The transformer pipelines require `spacy-transformers` v1.2, which uses the
exact alignment from `tokenizers` for fast tokenizers instead of the heuristic
alignment from `spacy-alignments`. For all trained pipelines except
`ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens
may be slightly different. More details about the `spacy-transformers` changes
in the
[v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0).
## Notes about upgrading from v3.4 {id="upgrading"}
### Validation of textcat values {id="textcat-validation"}
An error is now raised when unsupported values are given as input to train a
`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0`
as explained in the [docs](/api/textcategorizer#assigned-attributes).
### Updated scorers for tokenization and textcat {id="scores"}
We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported
`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same,
your tokenization performance has not changed from v3.4.
For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers:
- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase
slightly in v3.5 even though the underlying predictions are unchanged
- report the performance of only the **final** `textcat` or `textcat_multilabel`
component in the pipeline by default
- allow custom scorers to be used to score multiple `textcat` and
`textcat_multilabel` components with `Scorer.score_cats` by restricting the
evaluation to the component's provided labels
### Pipeline package version compatibility {id="version-compat"}
> #### Using legacy implementations
>
> In spaCy v3, you'll still be able to load and reference legacy implementations
> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the
> components or architectures change and newer versions are available in the
> core library.
When you're loading a pipeline package trained with an earlier version of spaCy
v3, you will see a warning telling you that the pipeline may be incompatible.
This doesn't necessarily have to be true, but we recommend running your
pipelines against your test suite or evaluation data to make sure there are no
unexpected results.
If you're using one of the [trained pipelines](/models) we provide, you should
run [`spacy download`](/api/cli#download) to update to the latest version. To
see an overview of all installed packages and their compatibility, you can run
[`spacy validate`](/api/cli#validate).
If you've trained your own custom pipeline and you've confirmed that it's still
working as expected, you can update the spaCy version requirements in the
[`meta.json`](/api/data-formats#meta):
```diff
- "spacy_version": ">=3.4.0,<3.5.0",
+ "spacy_version": ">=3.4.0,<3.6.0",
```
### Updating v3.4 configs
To update a config from spaCy v3.4 with the new v3.5 settings, run
[`init fill-config`](/api/cli#init-fill-config):
```cli
$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg
```
In many cases ([`spacy train`](/api/cli#train),
[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in
automatically, but you'll need to fill in the new settings to run
[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data).

View File

@ -13,7 +13,8 @@
{ "text": "New in v3.1", "url": "/usage/v3-1" },
{ "text": "New in v3.2", "url": "/usage/v3-2" },
{ "text": "New in v3.3", "url": "/usage/v3-3" },
{ "text": "New in v3.4", "url": "/usage/v3-4" }
{ "text": "New in v3.4", "url": "/usage/v3-4" },
{ "text": "New in v3.5", "url": "/usage/v3-5" }
]
},
{
@ -129,6 +130,7 @@
"items": [
{ "text": "Attributes", "url": "/api/attributes" },
{ "text": "Corpus", "url": "/api/corpus" },
{ "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" },
{ "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "Lookups", "url": "/api/lookups" },
{ "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" },

View File

@ -89,7 +89,7 @@ const Landing = () => {
</LandingCard>
<LandingCard title="Awesome ecosystem" url="/usage/projects" button="Read more">
In the five years since its release, spaCy has become an industry standard with
Since its release in 2015, spaCy has become an industry standard with
a huge ecosystem. Choose from a variety of plugins, integrate with your machine
learning stack and build custom components and workflows.
</LandingCard>

View File

@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg'
import siteMetadata from '../../meta/site.json'
import Head from 'next/head'
import { siteUrl } from '../../meta/dynamicMeta.mjs'
function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) {
if (sectionTitle && title) {
const suffix = nightly ? ' (nightly)' : legacy ? ' (legacy)' : ''
@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) {
if (legacy) return socialImageLegacy
if (section === 'api') return socialImageApi
if (section === 'universe') return socialImageUniverse
return socialImageDefault
return `${siteUrl}${socialImageDefault.src}`
}
export default function SEO({
@ -46,7 +48,7 @@ export default function SEO({
nightly,
legacy
)
const socialImage = getImage(section, nightly, legacy).src
const socialImage = getImage(section, nightly, legacy)
const meta = [
{
name: 'description',

View File

@ -20,6 +20,10 @@
display: inline-block
margin-bottom: var(--spacing-sm)
.ol, .ul
margin-top: var(--spacing-xs)
margin-bottom: var(--spacing-xs)
&:before
content: '\25CF'
position: relative

View File

@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => {
}
const navAlert = (
<Link to="/usage/v3-4" hidden>
<strong>💥 Out now:</strong> spaCy v3.4
<Link to="/usage/v3-5" hidden>
<strong>💥 Out now:</strong> spaCy v3.5
</Link>
)