diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index d0db75f9a..ed69f611b 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -52,17 +52,17 @@ steps: python -W error -c "import spacy" displayName: "Test import" -# - script: | -# python -m spacy download ca_core_news_sm -# python -m spacy download ca_core_news_md -# python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" -# displayName: 'Test download CLI' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" -# displayName: 'Test no warnings on load (#11713)' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -m spacy download ca_core_news_sm + python -m spacy download ca_core_news_md + python -c "import spacy; nlp=spacy.load('ca_core_news_sm'); doc=nlp('test')" + displayName: 'Test download CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" + displayName: 'Test no warnings on load (#11713)' + condition: eq(variables['python_version'], '3.8') - script: | python -m spacy convert extra/example_data/ner_example_data/ner-token-per-line-conll2003.json . @@ -86,17 +86,17 @@ steps: displayName: 'Test train CLI' condition: eq(variables['python_version'], '3.8') -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" -# PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir -# displayName: 'Test assemble CLI' -# condition: eq(variables['python_version'], '3.8') -# -# - script: | -# python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" -# python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 -# displayName: 'Test assemble CLI vectors warning' -# condition: eq(variables['python_version'], '3.8') + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_sm'}; config.to_disk('ner_source_sm.cfg')" + PYTHONWARNINGS="error,ignore::DeprecationWarning" python -m spacy assemble ner_source_sm.cfg output_dir + displayName: 'Test assemble CLI' + condition: eq(variables['python_version'], '3.8') + + - script: | + python -c "import spacy; config = spacy.util.load_config('ner.cfg'); config['components']['ner'] = {'source': 'ca_core_news_md'}; config.to_disk('ner_source_md.cfg')" + python -m spacy assemble ner_source_md.cfg output_dir 2>&1 | grep -q W113 + displayName: 'Test assemble CLI vectors warning' + condition: eq(variables['python_version'], '3.8') - script: | python -m pip install -U -r requirements.txt diff --git a/.gitignore b/.gitignore index ac333f958..af75a4d47 100644 --- a/.gitignore +++ b/.gitignore @@ -10,16 +10,6 @@ spacy/tests/package/setup.cfg spacy/tests/package/pyproject.toml spacy/tests/package/requirements.txt -# Website -website/.cache/ -website/public/ -website/node_modules -website/.npm -website/logs -*.log -npm-debug.log* -quickstart-training-generator.js - # Cython / C extensions cythonize.json spacy/*.html diff --git a/README.md b/README.md index 195424551..49aa6796e 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ production-ready [**training 
system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). -💫 **Version 3.4 out now!** +💫 **Version 3.5 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) [![Azure Pipelines](https://img.shields.io/azure-devops/build/explosion-ai/public/8/master.svg?logo=azure-pipelines&style=flat-square&label=build)](https://dev.azure.com/explosion-ai/public/_build?definitionId=8) diff --git a/requirements.txt b/requirements.txt index 5bc1c8684..1bd4518af 100644 --- a/requirements.txt +++ b/requirements.txt @@ -22,7 +22,7 @@ langcodes>=3.2.0,<4.0.0 # Official Python utilities setuptools packaging>=20.0 -typing_extensions>=3.7.4.1,<4.2.0; python_version < "3.8" +typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" # Development dependencies pre-commit>=2.13.0 cython>=0.25,<3.0 diff --git a/setup.cfg b/setup.cfg index 79dff9e30..cddc5148c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -63,7 +63,7 @@ install_requires = # Official Python utilities setuptools packaging>=20.0 - typing_extensions>=3.7.4,<4.2.0; python_version < "3.8" + typing_extensions>=3.7.4.1,<4.5.0; python_version < "3.8" langcodes>=3.2.0,<4.0.0 [options.entry_points] diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py index a3cfd96dd..ea6bba2c9 100644 --- a/spacy/displacy/__init__.py +++ b/spacy/displacy/__init__.py @@ -106,9 +106,7 @@ def serve( if is_in_jupyter(): warnings.warn(Warnings.W011) - render( - docs, style=style, page=page, minify=minify, options=options, manual=manual - ) + render(docs, style=style, page=page, minify=minify, options=options, manual=manual) httpd = simple_server.make_server(host, port, app) print(f"\nUsing the '{style}' visualizer") print(f"Serving on http://{host}:{port} ...\n") diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index 485e52c2f..edba523cf 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -25,7 +25,7 @@ cdef class InMemoryLookupKB(KnowledgeBase): """An `InMemoryLookupKB` instance stores unique identifiers for entities and their textual aliases, to support entity linking of named entities to real-world concepts. - DOCS: https://spacy.io/api/kb_in_memory + DOCS: https://spacy.io/api/inmemorylookupkb """ def __init__(self, Vocab vocab, entity_vector_length): diff --git a/spacy/matcher/levenshtein.pyx b/spacy/matcher/levenshtein.pyx index 0e8cd26da..e823ce99d 100644 --- a/spacy/matcher/levenshtein.pyx +++ b/spacy/matcher/levenshtein.pyx @@ -22,7 +22,7 @@ cpdef bint levenshtein_compare(input_text: str, pattern_text: str, fuzzy: int = max_edits = fuzzy else: # allow at least two edits (to allow at least one transposition) and up - # to 20% of the pattern string length + # to 30% of the pattern string length max_edits = max(2, round(0.3 * len(pattern_text))) return levenshtein(input_text, pattern_text, max_edits) <= max_edits diff --git a/spacy/matcher/matcher.pyi b/spacy/matcher/matcher.pyi index 77ea7b7a6..48922865b 100644 --- a/spacy/matcher/matcher.pyi +++ b/spacy/matcher/matcher.pyi @@ -5,8 +5,12 @@ from ..vocab import Vocab from ..tokens import Doc, Span class Matcher: - def __init__(self, vocab: Vocab, validate: bool = ..., - fuzzy_compare: Callable[[str, str, int], bool] = ...) -> None: ... 
+ def __init__( + self, + vocab: Vocab, + validate: bool = ..., + fuzzy_compare: Callable[[str, str, int], bool] = ..., + ) -> None: ... def __reduce__(self) -> Any: ... def __len__(self) -> int: ... def __contains__(self, key: str) -> bool: ... diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py index e83fe63ba..332badd8c 100644 --- a/spacy/pipeline/edit_tree_lemmatizer.py +++ b/spacy/pipeline/edit_tree_lemmatizer.py @@ -5,8 +5,8 @@ from itertools import islice import numpy as np import srsly -from thinc.api import Config, Model, SequenceCategoricalCrossentropy -from thinc.types import Floats2d, Ints1d, Ints2d +from thinc.api import Config, Model, SequenceCategoricalCrossentropy, NumpyOps +from thinc.types import Floats2d, Ints2d from ._edit_tree_internals.edit_trees import EditTrees from ._edit_tree_internals.schemas import validate_edit_tree @@ -20,6 +20,10 @@ from ..vocab import Vocab from .. import util +# The cutoff value of *top_k* above which an alternative method is used to process guesses. +TOP_K_GUARDRAIL = 20 + + default_model_config = """ [model] @architectures = "spacy.Tagger.v2" @@ -115,6 +119,7 @@ class EditTreeLemmatizer(TrainablePipe): self.cfg: Dict[str, Any] = {"labels": []} self.scorer = scorer + self.numpy_ops = NumpyOps() def get_loss( self, examples: Iterable[Example], scores: List[Floats2d] @@ -144,6 +149,18 @@ class EditTreeLemmatizer(TrainablePipe): return float(loss), d_scores def predict(self, docs: Iterable[Doc]) -> List[Ints2d]: + if self.top_k == 1: + scores2guesses = self._scores2guesses_top_k_equals_1 + elif self.top_k <= TOP_K_GUARDRAIL: + scores2guesses = self._scores2guesses_top_k_greater_1 + else: + scores2guesses = self._scores2guesses_top_k_guardrail + # The behaviour of *_scores2guesses_top_k_greater_1()* is efficient for values + # of *top_k>1* that are likely to be useful when the edit tree lemmatizer is used + # for its principal purpose of lemmatizing tokens. However, the code could also + # be used for other purposes, and with very large values of *top_k* the method + # becomes inefficient. In such cases, *_scores2guesses_top_k_guardrail()* is used + # instead. n_docs = len(list(docs)) if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
@@ -153,20 +170,52 @@ class EditTreeLemmatizer(TrainablePipe): return guesses scores = self.model.predict(docs) assert len(scores) == n_docs - guesses = self._scores2guesses(docs, scores) + guesses = scores2guesses(docs, scores) assert len(guesses) == n_docs return guesses - def _scores2guesses(self, docs, scores): + def _scores2guesses_top_k_equals_1(self, docs, scores): guesses = [] for doc, doc_scores in zip(docs, scores): - if self.top_k == 1: - doc_guesses = doc_scores.argmax(axis=1).reshape(-1, 1) - else: - doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1] + doc_guesses = doc_scores.argmax(axis=1) + doc_guesses = self.numpy_ops.asarray(doc_guesses) - if not isinstance(doc_guesses, np.ndarray): - doc_guesses = doc_guesses.get() + doc_compat_guesses = [] + for i, token in enumerate(doc): + tree_id = self.cfg["labels"][doc_guesses[i]] + if self.trees.apply(tree_id, token.text) is not None: + doc_compat_guesses.append(tree_id) + else: + doc_compat_guesses.append(-1) + guesses.append(np.array(doc_compat_guesses)) + + return guesses + + def _scores2guesses_top_k_greater_1(self, docs, scores): + guesses = [] + top_k = min(self.top_k, len(self.labels)) + for doc, doc_scores in zip(docs, scores): + doc_scores = self.numpy_ops.asarray(doc_scores) + doc_compat_guesses = [] + for i, token in enumerate(doc): + for _ in range(top_k): + candidate = int(doc_scores[i].argmax()) + candidate_tree_id = self.cfg["labels"][candidate] + if self.trees.apply(candidate_tree_id, token.text) is not None: + doc_compat_guesses.append(candidate_tree_id) + break + doc_scores[i, candidate] = np.finfo(np.float32).min + else: + doc_compat_guesses.append(-1) + guesses.append(np.array(doc_compat_guesses)) + + return guesses + + def _scores2guesses_top_k_guardrail(self, docs, scores): + guesses = [] + for doc, doc_scores in zip(docs, scores): + doc_guesses = np.argsort(doc_scores)[..., : -self.top_k - 1 : -1] + doc_guesses = self.numpy_ops.asarray(doc_guesses) doc_compat_guesses = [] for token, candidates in zip(doc, doc_guesses): diff --git a/spacy/schemas.py b/spacy/schemas.py index 3675c12dd..140592dcd 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -163,15 +163,33 @@ class TokenPatternString(BaseModel): IS_SUPERSET: Optional[List[StrictStr]] = Field(None, alias="is_superset") INTERSECTS: Optional[List[StrictStr]] = Field(None, alias="intersects") FUZZY: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy") - FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy1") - FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy2") - FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy3") - FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy4") - FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy5") - FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy6") - FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy7") - FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy8") - FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field(None, alias="fuzzy9") + FUZZY1: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy1" + ) + FUZZY2: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy2" + ) + FUZZY3: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy3" + 
) + FUZZY4: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy4" + ) + FUZZY5: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy5" + ) + FUZZY6: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy6" + ) + FUZZY7: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy7" + ) + FUZZY8: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy8" + ) + FUZZY9: Optional[Union[StrictStr, "TokenPatternString"]] = Field( + None, alias="fuzzy9" + ) class Config: extra = "forbid" diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py index c4f9b09f3..128d75680 100644 --- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py +++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py @@ -101,14 +101,15 @@ def test_initialize_from_labels(): } -def test_no_data(): +@pytest.mark.parametrize("top_k", (1, 5, 30)) +def test_no_data(top_k): # Test that the lemmatizer provides a nice error when there's no tagging data / labels TEXTCAT_DATA = [ ("I'm so happy.", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}), ("I'm so angry", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}), ] nlp = English() - nlp.add_pipe("trainable_lemmatizer") + nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) nlp.add_pipe("textcat") train_examples = [] @@ -119,10 +120,11 @@ def test_no_data(): nlp.initialize(get_examples=lambda: train_examples) -def test_incomplete_data(): +@pytest.mark.parametrize("top_k", (1, 5, 30)) +def test_incomplete_data(top_k): # Test that the lemmatizer works with incomplete information nlp = English() - lemmatizer = nlp.add_pipe("trainable_lemmatizer") + lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) lemmatizer.min_tree_freq = 1 train_examples = [] for t in PARTIAL_DATA: @@ -154,9 +156,10 @@ def test_incomplete_data(): assert xp.count_nonzero(dX[1][1]) == 0 -def test_overfitting_IO(): +@pytest.mark.parametrize("top_k", (1, 5, 30)) +def test_overfitting_IO(top_k): nlp = English() - lemmatizer = nlp.add_pipe("trainable_lemmatizer") + lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) lemmatizer.min_tree_freq = 1 train_examples = [] for t in TRAIN_DATA: @@ -189,7 +192,7 @@ def test_overfitting_IO(): # Check model after a {to,from}_bytes roundtrip nlp_bytes = nlp.to_bytes() nlp3 = English() - nlp3.add_pipe("trainable_lemmatizer") + nlp3.add_pipe("trainable_lemmatizer", config={"top_k": top_k}) nlp3.from_bytes(nlp_bytes) doc3 = nlp3(test_text) assert doc3[0].lemma_ == "she" diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index c88e20de2..d00f66c60 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -618,7 +618,6 @@ def test_string_to_list_intify(value): assert string_to_list(value, intify=True) == [1, 2, 3] -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_download_compatibility(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False @@ -629,7 +628,6 @@ def test_download_compatibility(): assert get_minor_version(about.__version__) == get_minor_version(version) -@pytest.mark.skip(reason="Temporarily skip for dev version") def test_validate_compatibility_table(): spec = SpecifierSet("==" + about.__version__) spec.prereleases = False diff --git a/website/.dockerignore b/website/.dockerignore new file mode 100644 index 000000000..e4a88552e --- /dev/null +++ b/website/.dockerignore @@ -0,0 +1,9 @@ +.cache/ 
+.next/ +public/ +node_modules +.npm +logs +*.log +npm-debug.log* +quickstart-training-generator.js diff --git a/website/.gitignore b/website/.gitignore index 70ef99fa5..599c0953a 100644 --- a/website/.gitignore +++ b/website/.gitignore @@ -1,5 +1,7 @@ # See https://help.github.com/articles/ignoring-files/ for more about ignoring files. +quickstart-training-generator.js + # dependencies /node_modules /.pnp @@ -41,4 +43,4 @@ next-env.d.ts public/robots.txt public/sitemap* public/sw.js* -public/workbox* \ No newline at end of file +public/workbox* diff --git a/website/Dockerfile b/website/Dockerfile index f71733e55..9b2f6cac4 100644 --- a/website/Dockerfile +++ b/website/Dockerfile @@ -1,16 +1,14 @@ -FROM node:11.15.0 +FROM node:18 -WORKDIR /spacy-io - -RUN npm install -g gatsby-cli@2.7.4 - -COPY package.json . -COPY package-lock.json . - -RUN npm install +USER node # This is so the installed node_modules will be up one directory # from where a user mounts files, so that they don't accidentally mount # their own node_modules from a different build # https://nodejs.org/api/modules.html#modules_loading_from_node_modules_folders -WORKDIR /spacy-io/website/ +WORKDIR /home/node +COPY --chown=node package.json . +COPY --chown=node package-lock.json . +RUN npm install + +WORKDIR /home/node/website/ diff --git a/website/README.md b/website/README.md index e9d7aec26..a434efe9a 100644 --- a/website/README.md +++ b/website/README.md @@ -41,33 +41,27 @@ If you'd like to do this, **be sure you do _not_ include your local `node_modules` folder**, since there are some dependencies that need to be built for the image system. Rename it before using. -```bash -docker run -it \ - -v $(pwd):/spacy-io/website \ - -p 8000:8000 \ - ghcr.io/explosion/spacy-io \ - gatsby develop -H 0.0.0.0 -``` - -This will allow you to access the built website at http://0.0.0.0:8000/ in your -browser, and still edit code in your editor while having the site reflect those -changes. - -**Note**: If you're working on a Mac with an M1 processor, you might see -segfault errors from `qemu` if you use the default image. To fix this use the -`arm64` tagged image in the `docker run` command -(ghcr.io/explosion/spacy-io:arm64). - -### Building the Docker image - -If you'd like to build the image locally, you can do so like this: +First build the Docker image. This only needs to be done on the first run +or when changes are made to `Dockerfile` or the website dependencies: ```bash docker build -t spacy-io . ``` -This will take some time, so if you want to use the prebuilt image you'll save a -bit of time. +You can then build and run the website with: + +```bash +docker run -it \ + --rm \ + -v $(pwd):/home/node/website \ + -p 3000:3000 \ + spacy-io \ + npm run dev -- -H 0.0.0.0 +``` + +This will allow you to access the built website at http://0.0.0.0:3000/ in your +browser, and still edit code in your editor while having the site reflect those +changes. ## Project structure diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 5f10c8bc0..ca4023101 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1215,19 +1215,19 @@ When a directory is provided it is traversed recursively to collect all files. 
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] ``` -| Name | Description | -| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | -| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ | -| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | -| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | -| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ | -| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ | -| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | -| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. | +| Name | Description | +| ------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ | +| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ | +| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ | +| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. 
| ## find-threshold {id="find-threshold",version="3.5",tag="command"} diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index 5c30d252e..bafb2f2da 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -15,7 +15,7 @@ world". It requires a `KnowledgeBase`, as well as a function to generate plausible candidates from that `KnowledgeBase` given a certain textual mention, and a machine learning model to pick the right candidate, given the local context of the mention. `EntityLinker` defaults to using the -[`InMemoryLookupKB`](/api/kb_in_memory) implementation. +[`InMemoryLookupKB`](/api/inmemorylookupkb) implementation. ## Assigned Attributes {id="assigned-attributes"} diff --git a/website/docs/api/kb_in_memory.mdx b/website/docs/api/inmemorylookupkb.mdx similarity index 96% rename from website/docs/api/kb_in_memory.mdx rename to website/docs/api/inmemorylookupkb.mdx index e85b63c45..c24fe78d6 100644 --- a/website/docs/api/kb_in_memory.mdx +++ b/website/docs/api/inmemorylookupkb.mdx @@ -43,7 +43,7 @@ The length of the fixed-size entity vectors in the knowledge base. Add an entity to the knowledge base, specifying its corpus frequency and entity vector, which should be of length -[`entity_vector_length`](/api/kb_in_memory#entity_vector_length). +[`entity_vector_length`](/api/inmemorylookupkb#entity_vector_length). > #### Example > @@ -79,8 +79,9 @@ frequency and entity vector for each entity. Add an alias or mention to the knowledge base, specifying its potential KB identifiers and their prior probabilities. The entity identifiers should refer -to entities previously added with [`add_entity`](/api/kb_in_memory#add_entity) -or [`set_entities`](/api/kb_in_memory#set_entities). The sum of the prior +to entities previously added with +[`add_entity`](/api/inmemorylookupkb#add_entity) or +[`set_entities`](/api/inmemorylookupkb#set_entities). The sum of the prior probabilities should not exceed 1. Note that an empty string can not be used as alias. @@ -156,7 +157,7 @@ Get a list of all aliases in the knowledge base. Given a certain textual mention as input, retrieve a list of candidate entities of type [`Candidate`](/api/kb#candidate). Wraps -[`get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). +[`get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). > #### Example > @@ -174,7 +175,7 @@ of type [`Candidate`](/api/kb#candidate). Wraps ## InMemoryLookupKB.get_candidates_batch {id="get_candidates_batch",tag="method"} -Same as [`get_candidates()`](/api/kb_in_memory#get_candidates), but for an +Same as [`get_candidates()`](/api/inmemorylookupkb#get_candidates), but for an arbitrary number of mentions. The [`EntityLinker`](/api/entitylinker) component will call `get_candidates_batch()` instead of `get_candidates()`, if the config parameter `candidates_batch_size` is greater or equal than 1. @@ -231,7 +232,7 @@ Given a certain entity ID, retrieve its pretrained entity vector. ## InMemoryLookupKB.get_vectors {id="get_vectors",tag="method"} -Same as [`get_vector()`](/api/kb_in_memory#get_vector), but for an arbitrary +Same as [`get_vector()`](/api/inmemorylookupkb#get_vector), but for an arbitrary number of entity IDs. The default implementation of `get_vectors()` executes `get_vector()` in a loop. 
diff --git a/website/docs/api/kb.mdx b/website/docs/api/kb.mdx index 887b7fe97..2b0d4d9d6 100644 --- a/website/docs/api/kb.mdx +++ b/website/docs/api/kb.mdx @@ -21,8 +21,8 @@ functions called by the [`EntityLinker`](/api/entitylinker) component. This class was not abstract up to spaCy version 3.5. The `KnowledgeBase` -implementation up to that point is available as `InMemoryLookupKB` from 3.5 -onwards. +implementation up to that point is available as +[`InMemoryLookupKB`](/api/inmemorylookupkb) from 3.5 onwards. @@ -110,14 +110,15 @@ to you. From spaCy 3.5 on `KnowledgeBase` is an abstract class (with -[`InMemoryLookupKB`](/api/kb_in_memory) being a drop-in replacement) to allow -more flexibility in customizing knowledge bases. Some of its methods were moved -to [`InMemoryLookupKB`](/api/kb_in_memory) during this refactoring, one of those -being `get_alias_candidates()`. This method is now available as -[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). -Note: [`InMemoryLookupKB.get_candidates()`](/api/kb_in_memory#get_candidates) +[`InMemoryLookupKB`](/api/inmemorylookupkb) being a drop-in replacement) to +allow more flexibility in customizing knowledge bases. Some of its methods were +moved to [`InMemoryLookupKB`](/api/inmemorylookupkb) during this refactoring, +one of those being `get_alias_candidates()`. This method is now available as +[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). +Note: +[`InMemoryLookupKB.get_candidates()`](/api/inmemorylookupkb#get_candidates) defaults to -[`InMemoryLookupKB.get_alias_candidates()`](/api/kb_in_memory#get_alias_candidates). +[`InMemoryLookupKB.get_alias_candidates()`](/api/inmemorylookupkb#get_alias_candidates). ## KnowledgeBase.get_vector {id="get_vector",tag="method"} diff --git a/website/docs/usage/101/_architecture.mdx b/website/docs/usage/101/_architecture.mdx index 5727c6921..2a63a3741 100644 --- a/website/docs/usage/101/_architecture.mdx +++ b/website/docs/usage/101/_architecture.mdx @@ -79,7 +79,7 @@ operates on a `Doc` and gives you access to the matched tokens **in context**. | ------------------------------------------------ | -------------------------------------------------------------------------------------------------- | | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | | [`KnowledgeBase`](/api/kb) | Abstract base class for storage and retrieval of data for entity linking. | -| [`InMemoryLookupKB`](/api/kb_in_memory) | Implementation of `KnowledgeBase` storing all data in memory. | +| [`InMemoryLookupKB`](/api/inmemorylookupkb) | Implementation of `KnowledgeBase` storing all data in memory. | | [`Candidate`](/api/kb#candidate) | Object associating a textual mention with a specific entity contained in a `KnowledgeBase`. | | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. | diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 8c9de0d79..08d2b3b91 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -384,14 +384,14 @@ the more specific attributes `FUZZY1`..`FUZZY9` you can specify the maximum allowed edit distance directly. 
```python -# Match lowercase with fuzzy matching (allows 2 edits) +# Match lowercase with fuzzy matching (allows 3 edits) pattern = [{"LOWER": {"FUZZY": "definitely"}}] -# Match custom attribute values with fuzzy matching (allows 2 edits) +# Match custom attribute values with fuzzy matching (allows 3 edits) pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] -# Match with exact Levenshtein edit distance limits (allows 3 edits) -pattern = [{"_": {"country": {"FUZZY3": "Kyrgyzstan"}}}] +# Match with exact Levenshtein edit distance limits (allows 4 edits) +pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] ``` #### Regex and fuzzy matching with lists {id="regex-fuzzy-lists", version="3.5"} diff --git a/website/docs/usage/saving-loading.mdx b/website/docs/usage/saving-loading.mdx index 87735fed1..e0daebe35 100644 --- a/website/docs/usage/saving-loading.mdx +++ b/website/docs/usage/saving-loading.mdx @@ -684,10 +684,15 @@ If your pipeline includes [custom components](/usage/processing-pipelines#custom-components), model architectures or other [code](/usage/training#custom-code), those functions need to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know -how to create the objects referenced in the config. The -[`spacy package`](/api/cli#package) command lets you provide one or more paths -to Python files containing custom registered functions using the `--code` -argument. +how to create the objects referenced in the config. If you're loading your own +pipeline in Python, you can make custom components available just by importing +the code that defines them before calling +[`spacy.load`](/api/top-level#spacy.load). This is also how the `--code` +argument to CLI commands works. + +With the [`spacy package`](/api/cli#package) command, you can provide one or +more paths to Python files containing custom registered functions using the +`--code` argument. > #### \_\_init\_\_.py (excerpt) > diff --git a/website/docs/usage/v3-5.mdx b/website/docs/usage/v3-5.mdx new file mode 100644 index 000000000..ac61338e3 --- /dev/null +++ b/website/docs/usage/v3-5.mdx @@ -0,0 +1,215 @@ +--- +title: What's New in v3.5 +teaser: New features and how to upgrade +menu: + - ['New Features', 'features'] + - ['Upgrading Notes', 'upgrading'] +--- + +## New features {id="features",hidden="true"} + +spaCy v3.5 introduces three new CLI commands, `apply`, `benchmark` and +`find-threshold`, adds fuzzy matching, provides improvements to our entity +linking functionality, and includes a range of language updates and bug fixes. + +### New CLI commands {id="cli"} + +#### apply CLI + +The [`apply` CLI](/api/cli#apply) can be used to apply a pipeline to one or more +`.txt`, `.jsonl` or `.spacy` input files, saving the annotated docs in a single +`.spacy` file. + +```bash +$ spacy apply en_core_web_sm my_texts/ output.spacy +``` + +#### benchmark CLI + +The [`benchmark` CLI](/api/cli#benchmark) has been added to extend the existing +`evaluate` functionality with a wider range of profiling subcommands. + +The `benchmark accuracy` CLI is introduced as an alias for `evaluate`. The new +`benchmark speed` CLI performs warmup rounds before measuring the speed in words +per second on batches of randomly shuffled documents from the provided data. 
+ +```bash +$ spacy benchmark speed my_pipeline data.spacy +``` + +The output is the mean performance using batches (`nlp.pipe`) with a 95% +confidence interval, e.g., profiling `en_core_web_sm` on CPU: + +```none +Outliers: 2.0%, extreme outliers: 0.0% +Mean: 18904.1 words/s (95% CI: -256.9 +244.1) +``` + +#### find-threshold CLI + +The [`find-threshold` CLI](/api/cli#find-threshold) runs a series of trials +across threshold values from `0.0` to `1.0` and identifies the best threshold +for the provided score metric. + +The following command runs 20 trials for the `spancat` component in +`my_pipeline`, recording the `spans_sc_f` score for each value of the threshold +`[components.spancat.threshold]` from `0.0` to `1.0`: + +```bash +$ spacy find-threshold my_pipeline data.spacy spancat threshold spans_sc_f --n_trials 20 +``` + +The `find-threshold` CLI can be used with `textcat_multilabel`, `spancat` and +custom components with thresholds that are applied while predicting or scoring. + +### Fuzzy matching {id="fuzzy"} + +New `FUZZY` operators support [fuzzy matching](/usage/rule-based-matching#fuzzy) +with the `Matcher`. By default, the `FUZZY` operator allows a Levenshtein edit +distance of 2 and up to 30% of the pattern string length. `FUZZY1`..`FUZZY9` can +be used to specify the exact number of allowed edits. + +```python +# Match lowercase with fuzzy matching (allows up to 3 edits) +pattern = [{"LOWER": {"FUZZY": "definitely"}}] + +# Match custom attribute values with fuzzy matching (allows up to 3 edits) +pattern = [{"_": {"country": {"FUZZY": "Kyrgyzstan"}}}] + +# Match with exact Levenshtein edit distance limits (allows up to 4 edits) +pattern = [{"_": {"country": {"FUZZY4": "Kyrgyzstan"}}}] +``` + +Note that `FUZZY` uses Levenshtein edit distance rather than Damerau-Levenshtein +edit distance, so a transposition like `teh` for `the` counts as two edits, one +insertion and one deletion. + +If you'd prefer an alternate fuzzy matching algorithm, you can provide your own +custom method to the `Matcher` or as a config option for an entity ruler and +span ruler. + +### FUZZY and REGEX with lists {id="fuzzy-regex-lists"} + +The `FUZZY` and `REGEX` operators are also now supported for lists with `IN` and +`NOT_IN`: + +```python +pattern = [{"TEXT": {"FUZZY": {"IN": ["awesome", "cool", "wonderful"]}}}] +pattern = [{"TEXT": {"REGEX": {"NOT_IN": ["^awe(some)?$", "^wonder(ful)?"]}}}] +``` + +### Entity linking generalization {id="el"} + +The knowledge base used for entity linking is now easier to customize and has a +new default implementation [`InMemoryLookupKB`](/api/inmemorylookupkb). + +### Additional features and improvements {id="additional-features-and-improvements"} + +- Language updates: + - Extended support for Slovenian + - Fixed lookup fallback for French and Catalan lemmatizers + - Switch Russian and Ukrainian lemmatizers to `pymorphy3` + - Support for editorial punctuation in Ancient Greek + - Update to Russian tokenizer exceptions + - Small fix for Dutch stop words +- Allow up to `typer` v0.7.x, `mypy` 0.990 and `typing_extensions` v4.4.x. +- New `spacy.ConsoleLogger.v3` with expanded progress + [tracking](/api/top-level#ConsoleLogger). +- Improved scoring behavior for `textcat` with `spacy.textcat_scorer.v2` and + `spacy.textcat_multilabel_scorer.v2`. +- Updates so that downstream components can train properly on a frozen `tok2vec` + or `transformer` layer. +- Allow interpolation of variables in directory names in projects. 
+- Support for local file system [remotes](/usage/projects#remote) for projects. +- Improve UX around `displacy.serve` when the default port is in use. +- Optional `before_update` callback that is invoked at the start of each + [training step](/api/data-formats#config-training). +- Improve performance of `SpanGroup` and fix typing issues for `SpanGroup` and + `Span` objects. +- Patch a + [security vulnerability](https://github.com/advisories/GHSA-gw9q-c7gh-j9vm) in + extracting tar files. +- Add equality definition for `Vectors`. +- Ensure `Vocab.to_disk` respects the exclude setting for `lookups` and + `vectors`. +- Correctly handle missing annotations in the edit tree lemmatizer. + +### Trained pipeline updates {id="pipelines"} + +- The CNN pipelines add `IS_SPACE` as a `tok2vec` feature for `tagger` and + `morphologizer` components to improve tagging of non-whitespace vs. whitespace + tokens. +- The transformer pipelines require `spacy-transformers` v1.2, which uses the + exact alignment from `tokenizers` for fast tokenizers instead of the heuristic + alignment from `spacy-alignments`. For all trained pipelines except + `ja_core_news_trf`, the alignments between spaCy tokens and transformer tokens + may be slightly different. More details about the `spacy-transformers` changes + in the + [v1.2.0 release notes](https://github.com/explosion/spacy-transformers/releases/tag/v1.2.0). + +## Notes about upgrading from v3.4 {id="upgrading"} + +### Validation of textcat values {id="textcat-validation"} + +An error is now raised when unsupported values are given as input to train a +`textcat` or `textcat_multilabel` model - ensure that values are `0.0` or `1.0` +as explained in the [docs](/api/textcategorizer#assigned-attributes). + +### Updated scorers for tokenization and textcat {id="scores"} + +We fixed a bug that inflated the `token_acc` scores in v3.0-v3.4. The reported +`token_acc` will drop from v3.4 to v3.5, but if `token_p/r/f` stay the same, +your tokenization performance has not changed from v3.4. + +For new `textcat` or `textcat_multilabel` configs, the new default `v2` scorers: + +- ignore `threshold` for `textcat`, so the reported `cats_p/r/f` may increase + slightly in v3.5 even though the underlying predictions are unchanged +- report the performance of only the **final** `textcat` or `textcat_multilabel` + component in the pipeline by default +- allow custom scorers to be used to score multiple `textcat` and + `textcat_multilabel` components with `Scorer.score_cats` by restricting the + evaluation to the component's provided labels + +### Pipeline package version compatibility {id="version-compat"} + +> #### Using legacy implementations +> +> In spaCy v3, you'll still be able to load and reference legacy implementations +> via [`spacy-legacy`](https://github.com/explosion/spacy-legacy), even if the +> components or architectures change and newer versions are available in the +> core library. + +When you're loading a pipeline package trained with an earlier version of spaCy +v3, you will see a warning telling you that the pipeline may be incompatible. +This doesn't necessarily have to be true, but we recommend running your +pipelines against your test suite or evaluation data to make sure there are no +unexpected results. + +If you're using one of the [trained pipelines](/models) we provide, you should +run [`spacy download`](/api/cli#download) to update to the latest version. 
To +see an overview of all installed packages and their compatibility, you can run +[`spacy validate`](/api/cli#validate). + +If you've trained your own custom pipeline and you've confirmed that it's still +working as expected, you can update the spaCy version requirements in the +[`meta.json`](/api/data-formats#meta): + +```diff +- "spacy_version": ">=3.4.0,<3.5.0", ++ "spacy_version": ">=3.4.0,<3.6.0", +``` + +### Updating v3.4 configs + +To update a config from spaCy v3.4 with the new v3.5 settings, run +[`init fill-config`](/api/cli#init-fill-config): + +```cli +$ python -m spacy init fill-config config-v3.4.cfg config-v3.5.cfg +``` + +In many cases ([`spacy train`](/api/cli#train), +[`spacy.load`](/api/top-level#spacy.load)), the new defaults will be filled in +automatically, but you'll need to fill in the new settings to run +[`debug config`](/api/cli#debug) and [`debug data`](/api/cli#debug-data). diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 339e4085b..b5c555da6 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -13,7 +13,8 @@ { "text": "New in v3.1", "url": "/usage/v3-1" }, { "text": "New in v3.2", "url": "/usage/v3-2" }, { "text": "New in v3.3", "url": "/usage/v3-3" }, - { "text": "New in v3.4", "url": "/usage/v3-4" } + { "text": "New in v3.4", "url": "/usage/v3-4" }, + { "text": "New in v3.5", "url": "/usage/v3-5" } ] }, { @@ -129,6 +130,7 @@ "items": [ { "text": "Attributes", "url": "/api/attributes" }, { "text": "Corpus", "url": "/api/corpus" }, + { "text": "InMemoryLookupKB", "url": "/api/inmemorylookupkb" }, { "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "MorphAnalysis", "url": "/api/morphology#morphanalysis" }, diff --git a/website/pages/index.tsx b/website/pages/index.tsx index 170bca137..4c0932926 100644 --- a/website/pages/index.tsx +++ b/website/pages/index.tsx @@ -89,7 +89,7 @@ const Landing = () => { - In the five years since its release, spaCy has become an industry standard with + Since its release in 2015, spaCy has become an industry standard with a huge ecosystem. Choose from a variety of plugins, integrate with your machine learning stack and build custom components and workflows. diff --git a/website/src/components/seo.js b/website/src/components/seo.js index 5d12ffa04..d338c43f3 100644 --- a/website/src/components/seo.js +++ b/website/src/components/seo.js @@ -9,6 +9,8 @@ import socialImageLegacy from '../images/social_legacy.jpg' import siteMetadata from '../../meta/site.json' import Head from 'next/head' +import { siteUrl } from '../../meta/dynamicMeta.mjs' + function getPageTitle(title, sitename, slogan, sectionTitle, nightly, legacy) { if (sectionTitle && title) { const suffix = nightly ? ' (nightly)' : legacy ? 
' (legacy)' : '' @@ -25,7 +27,7 @@ function getImage(section, nightly, legacy) { if (legacy) return socialImageLegacy if (section === 'api') return socialImageApi if (section === 'universe') return socialImageUniverse - return socialImageDefault + return `${siteUrl}${socialImageDefault.src}` } export default function SEO({ @@ -46,7 +48,7 @@ export default function SEO({ nightly, legacy ) - const socialImage = getImage(section, nightly, legacy).src + const socialImage = getImage(section, nightly, legacy) const meta = [ { name: 'description', diff --git a/website/src/styles/list.module.sass b/website/src/styles/list.module.sass index 1a352d9dd..2fb9ab8ef 100644 --- a/website/src/styles/list.module.sass +++ b/website/src/styles/list.module.sass @@ -20,6 +20,10 @@ display: inline-block margin-bottom: var(--spacing-sm) + .ol, .ul + margin-top: var(--spacing-xs) + margin-bottom: var(--spacing-xs) + &:before content: '\25CF' position: relative diff --git a/website/src/templates/index.js b/website/src/templates/index.js index aa7595ddc..80b5a24d4 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -58,8 +58,8 @@ const AlertSpace = ({ nightly, legacy }) => { } const navAlert = ( - - 💥 Out now: spaCy v3.4 + + 💥 Out now: spaCy v3.5 )
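
For reference, a minimal sketch of the fuzzy matching behaviour this patch documents in `rule-based-matching.mdx` and the new `v3-5.mdx`. It assumes spaCy v3.5 with the `FUZZY` operators available; the misspelled sample text and rule names are illustrative only.

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# FUZZY allows at least 2 edits and up to 30% of the pattern length,
# i.e. max(2, round(0.3 * len("definitely"))) == 3 edits here.
matcher.add("fuzzy_default", [[{"LOWER": {"FUZZY": "definitely"}}]])

# FUZZY4 caps the Levenshtein edit distance at exactly 4 edits.
matcher.add("fuzzy_exact", [[{"LOWER": {"FUZZY4": "kyrgyzstan"}}]])

doc = nlp("I will definitly visit Kyrgystan.")
for match_id, start, end in matcher(doc):
    # Both misspellings are within the allowed edit distance (1 edit each).
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```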
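Likewise, a hedged sketch of the `top_k` setting that the new `_scores2guesses_*` paths in `edit_tree_lemmatizer.py` dispatch on (`top_k == 1`, `1 < top_k <= 20`, and values above the `TOP_K_GUARDRAIL` of 20). The training example and chosen `top_k` value are placeholders, mirroring the parametrized tests in this diff.

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
# top_k > 1 lets the lemmatizer fall back to lower-ranked edit trees when the
# best-scoring tree cannot be applied to a token; top_k > 20 switches to the
# argsort-based guardrail method.
lemmatizer = nlp.add_pipe("trainable_lemmatizer", config={"top_k": 5})

train_examples = [
    Example.from_dict(
        nlp.make_doc("She likes green eggs"),
        {"lemmas": ["she", "like", "green", "egg"]},
    ),
]
nlp.initialize(get_examples=lambda: train_examples)
for _ in range(10):
    nlp.update(train_examples)
```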