diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml
index 66e0707e0..c6ea98f76 100644
--- a/.github/workflows/gputests.yml
+++ b/.github/workflows/gputests.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, v4]
+        branch: [master, main]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml
index f9fd3e817..4a4f08005 100644
--- a/.github/workflows/slowtests.yml
+++ b/.github/workflows/slowtests.yml
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, v4]
+        branch: [master, main]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 3c0b27c1d..b85ea8fcc 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -452,10 +452,9 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
   spaCy website. If you're sharing your project on Twitter, feel free to tag
   [@spacy_io](https://twitter.com/spacy_io) so we can check it out.

-- Once your extension is published, you can open an issue on the
-  [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
-  [resources directory](https://spacy.io/usage/resources#extensions) on the
-  website.
+- Once your extension is published, you can open a
+  [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
+  [Universe](https://spacy.io/universe) page.

 📖 **For more tips and best practices, see the
 [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**
diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py
index c7fd771c3..4dd10049c 100644
--- a/spacy/cli/benchmark_speed.py
+++ b/spacy/cli/benchmark_speed.py
@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu


 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
     warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
+    import_code(code_path)
     setup_gpu(use_gpu=use_gpu, silent=False)

     nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
     nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
     return annotate(nlp, docs, batch_size)
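The `warmup` fix above matters because spaCy annotates `Doc` objects in place: with plain repetition (`warmup_epochs * docs`), every epoch after the first re-runs the pipeline over the same, already-annotated objects. A minimal sketch of the difference, assuming an installed `en_core_web_sm` pipeline (any loadable pipeline works; variable names are illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
docs = [nlp.make_doc(t) for t in ["A first text.", "A second text."]]
warmup_epochs = 3

# Plain repetition aliases the same Doc objects, so later epochs would
# process docs that already carry annotations from earlier passes:
repeated = docs * warmup_epochs
assert repeated[0] is repeated[2]

# Copying gives every warmup pass a fresh, unannotated Doc:
copied = [doc.copy() for doc in docs * warmup_epochs]
assert copied[0] is not copied[2]
list(nlp.pipe(copied))  # each copy is annotated exactly once
```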
diff --git a/spacy/language.py b/spacy/language.py
index 66cdbb05e..59d25586b 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1830,6 +1830,7 @@ class Language:
             # is done, so that they can exit gracefully.
             for q in texts_q:
                 q.put(_WORK_DONE_SENTINEL)
+                q.close()

             # Otherwise, we are stopping because the error handler raised an
             # exception. The sentinel will be last to go out of the queue.
@@ -2464,7 +2465,9 @@ def _apply_pipes(

             # Stop working if we encounter the end-of-work sentinel.
             if isinstance(texts_with_ctx, _WorkDoneSentinel):
-                return
+                sender.close()
+                receiver.close()
+                return

             docs = (
                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
@@ -2488,7 +2491,9 @@ def _apply_pipes(
                 # Parent has closed the pipe prematurely. This happens when a
                 # worker encounters an error and the error handler is set to
                 # stop processing.
-                return
+                sender.close()
+                receiver.close()
+                return


 class _Sender:
diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py
index f3b6cb000..2fb56c848 100644
--- a/spacy/tests/serialize/test_serialize_extension_attrs.py
+++ b/spacy/tests/serialize/test_serialize_extension_attrs.py
@@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
     Token.set_extension("_test_token", default="t0")
     doc[1]._._test_token = "t1"

-    return doc
+    yield doc
+
+    Doc.remove_extension("_test_attr")
+    Doc.remove_extension("_test_prop")
+    Doc.remove_extension("_test_method")
+    Token.remove_extension("_test_token")


 def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
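The fixture change above swaps `return` for `yield` so that cleanup runs after each test: `set_extension` registers state globally on `Doc` and `Token`, which would otherwise leak into later tests or fail on re-registration. A standalone sketch of the same pattern, with illustrative names:

```python
import pytest
from spacy.tokens import Doc
from spacy.vocab import Vocab


@pytest.fixture
def doc_with_ext():
    # set_extension mutates a global registry shared by all tests.
    Doc.set_extension("demo_attr", default=None)
    doc = Doc(Vocab(), words=["hello", "world"])
    doc._.demo_attr = "value"
    yield doc
    # Teardown runs after the test body, keeping the registry clean.
    Doc.remove_extension("demo_attr")


def test_demo(doc_with_ext):
    assert doc_with_ext._.demo_attr == "value"
```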
diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx
index 973053fce..85243b436 100644
--- a/website/docs/api/cli.mdx
+++ b/website/docs/api/cli.mdx
@@ -1267,20 +1267,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline
 is warmed up before any measurements are taken.

 ```cli
-$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
+$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch-size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 ```

-| Name                 | Description                                                                                                |
-| -------------------- | ---------------------------------------------------------------------------------------------------------- |
-| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~    |
-| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~    |
-| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~              |
-| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                             |
-| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                              |
-| `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                            |
-| `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~                  |
-| `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                                  |
-| **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                          |
+| Name                 | Description                                                                                                                                                                           |
+| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                              |
+| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                              |
+| `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                        |
+| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                       |
+| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                        |
+| `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                                                                                                      |
+| `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3`. ~~Optional[int] \(option)~~                                                                                           |
+| `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                                                                                                            |
+| **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                                                                                                    |

 ## apply {id="apply", version="3.5", tag="command"}

@@ -1295,6 +1296,9 @@ input formats are:
 When a directory is provided it is traversed recursively to collect all files.

+When loading a .spacy file, any annotations already stored on the `Doc` that are not overwritten by the pipeline will be preserved.
+If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
+
 ```bash
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx
index b0ef4c9f9..cefd5c66e 100644
--- a/website/docs/api/large-language-models.mdx
+++ b/website/docs/api/large-language-models.mdx
@@ -1507,7 +1507,7 @@ These models all take the same parameters:
 > ```ini
 > [components.llm.model]
 > @llm_models = "spacy.Llama2.v1"
-> name = "llama2-7b-hf"
+> name = "Llama-2-7b-hf"
 > ```

 Currently, these models are provided as part of the core library:
diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx
index 1e32eb118..319ce88b8 100644
--- a/website/docs/api/vocab.mdx
+++ b/website/docs/api/vocab.mdx
@@ -13,7 +13,7 @@ between `Doc` objects.

 Note that a `Vocab` instance is not static. It increases in size as texts with
-new tokens are processed.
+new tokens are processed. Some models may have an empty vocab at initialization.
@@ -93,6 +93,7 @@ given string, you need to look it up in
 > #### Example
 >
 > ```python
+> nlp("I'm eating an apple")
 > apple = nlp.vocab.strings["apple"]
 > oov = nlp.vocab.strings["dskfodkfos"]
 > assert apple in nlp.vocab
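The `nlp("I'm eating an apple")` line added to the example is what interns `"apple"` before the lookup, in line with the new note that the vocab may start out empty. A short sketch of the documented behaviour, assuming a blank English pipeline:

```python
import spacy

nlp = spacy.blank("en")
nlp("I'm eating an apple")  # tokenizing interns the token strings

apple = nlp.vocab.strings["apple"]          # string -> hash
assert nlp.vocab.strings[apple] == "apple"  # hash -> string works once interned

# Hashing never fails, even for unseen strings, but only strings the
# pipeline has actually seen get an entry in the vocab:
oov = nlp.vocab.strings["dskfodkfos"]
assert apple in nlp.vocab
assert oov not in nlp.vocab
```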