From a493981163002d0cd2409950512eeeccb6fa4690 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 24 Jan 2024 09:29:57 +0100 Subject: [PATCH 1/8] fix typo (#13254) --- website/docs/api/large-language-models.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/api/large-language-models.mdx b/website/docs/api/large-language-models.mdx index b0ef4c9f9..cefd5c66e 100644 --- a/website/docs/api/large-language-models.mdx +++ b/website/docs/api/large-language-models.mdx @@ -1507,7 +1507,7 @@ These models all take the same parameters: > ```ini > [components.llm.model] > @llm_models = "spacy.Llama2.v1" -> name = "llama2-7b-hf" +> name = "Llama-2-7b-hf" > ``` Currently, these models are provided as part of the core library: From 7496e03a2c18c24454af924347af667e6df0ac70 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 26 Jan 2024 10:58:48 +0100 Subject: [PATCH 2/8] Clarify vocab docs (#13273) * add line to ensure that apple is in fact in the vocab * add that the vocab may be empty --- website/docs/api/vocab.mdx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/website/docs/api/vocab.mdx b/website/docs/api/vocab.mdx index fe774d1a8..57618397d 100644 --- a/website/docs/api/vocab.mdx +++ b/website/docs/api/vocab.mdx @@ -13,7 +13,7 @@ between `Doc` objects. Note that a `Vocab` instance is not static. It increases in size as texts with -new tokens are processed. +new tokens are processed. Some models may have an empty vocab at initialization. 
@@ -93,6 +93,7 @@ given string, you need to look it up in > #### Example > > ```python +> nlp("I'm eating an apple") > apple = nlp.vocab.strings["apple"] > oov = nlp.vocab.strings["dskfodkfos"] > assert apple in nlp.vocab From 68b85ea950492e4f83d9b1552806ab4a9631236e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 26 Jan 2024 12:10:05 +0100 Subject: [PATCH 3/8] Clarify data_path loading for apply CLI command (#13272) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * attempt to clarify additional annotations on .spacy file * suggestion by Daniël * pipeline instead of pipe --- website/docs/api/cli.mdx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index 51cae960b..db91e1062 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1296,6 +1296,9 @@ input formats are: When a directory is provided it is traversed recursively to collect all files. +When loading a .spacy file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved. +If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations. 
+ ```bash $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process] ``` From 00e938a7c3a74c559d0cc5c33437b698f3b3e770 Mon Sep 17 00:00:00 2001 From: Eliana Vornov Date: Fri, 26 Jan 2024 07:29:22 -0500 Subject: [PATCH 4/8] add custom code support to CLI speed benchmark (#13247) * add custom code support to CLI speed benchmark * sort imports * better copying for warmup docs --- spacy/cli/benchmark_speed.py | 6 ++++-- website/docs/api/cli.mdx | 25 +++++++++++++------------ 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/spacy/cli/benchmark_speed.py b/spacy/cli/benchmark_speed.py index c7fd771c3..4dd10049c 100644 --- a/spacy/cli/benchmark_speed.py +++ b/spacy/cli/benchmark_speed.py @@ -13,7 +13,7 @@ from .. import util from ..language import Language from ..tokens import Doc from ..training import Corpus -from ._util import Arg, Opt, benchmark_cli, setup_gpu +from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu @benchmark_cli.command( @@ -30,12 +30,14 @@ def benchmark_speed_cli( use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"), n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,), warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"), + code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"), # fmt: on ): """ Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark data in the binary .spacy format. 
""" + import_code(code_path) setup_gpu(use_gpu=use_gpu, silent=False) nlp = util.load_model(model) @@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray): def warmup( nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int] ) -> numpy.ndarray: - docs = warmup_epochs * docs + docs = [doc.copy() for doc in docs * warmup_epochs] return annotate(nlp, docs, batch_size) diff --git a/website/docs/api/cli.mdx b/website/docs/api/cli.mdx index db91e1062..950d98c1f 100644 --- a/website/docs/api/cli.mdx +++ b/website/docs/api/cli.mdx @@ -1268,20 +1268,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is warmed up before any measurements are taken. ```cli -$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] +$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup] ``` -| Name | Description | -| -------------------- | -------------------------------------------------------------------------------------------------------- | -| `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ | -| `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | -| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ | -| `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ | -| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | -| `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ | -| `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~ | -| `--help`, `-h` | Show help message and available arguments. 
~~bool (flag)~~ | -| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. | +| Name | Description | +| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ | +| `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ | +| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ | +| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ | +| `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ | +| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ | +| `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ | +| `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~ | +| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | +| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. | ## apply {id="apply", version="3.5", tag="command"} From 68d7841df593986655d07f9840fcd35e79b28c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Mon, 29 Jan 2024 13:51:56 +0100 Subject: [PATCH 5/8] Extension serialization attr tests: add teardown (#13284) The doc/token extension serialization tests add extensions that are not serializable with pickle. This didn't cause issues before due to the implicit run order of tests. 
However, test ordering has changed with pytest 8.0.0, leading to failed tests in test_language. Update the fixtures in the extension serialization tests to do proper teardown and remove the extensions. --- spacy/tests/serialize/test_serialize_extension_attrs.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/spacy/tests/serialize/test_serialize_extension_attrs.py b/spacy/tests/serialize/test_serialize_extension_attrs.py index f3b6cb000..2fb56c848 100644 --- a/spacy/tests/serialize/test_serialize_extension_attrs.py +++ b/spacy/tests/serialize/test_serialize_extension_attrs.py @@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer): Token.set_extension("_test_token", default="t0") doc[1]._._test_token = "t1" - return doc + yield doc + + Doc.remove_extension("_test_attr") + Doc.remove_extension("_test_prop") + Doc.remove_extension("_test_method") + Token.remove_extension("_test_token") def test_serialize_ext_attrs_from_bytes(doc_w_attrs): From 89a43f39b775c27af724f90a65e210ecfb94dba2 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 30 Jan 2024 13:49:49 +0100 Subject: [PATCH 6/8] update universe description (#13291) --- CONTRIBUTING.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f6f6dab59..ed75e1fd8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -452,10 +452,9 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it! spaCy website. If you're sharing your project on Twitter, feel free to tag [@spacy_io](https://twitter.com/spacy_io) so we can check it out. -- Once your extension is published, you can open an issue on the - [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the - [resources directory](https://spacy.io/usage/resources#extensions) on the - website. 
+- Once your extension is published, you can open a + [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the + [Universe](https://spacy.io/universe) page. 📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).** From d84068e460d4ff3f91280368c3c2f8b8dcd1d5bc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?= Date: Tue, 30 Jan 2024 13:58:28 +0100 Subject: [PATCH 7/8] Run slow tests: v4 -> main (#13290) * Run slow tests: v4 -> main * Also update the branch in GPU tests --- .github/workflows/gputests.yml | 2 +- .github/workflows/slowtests.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/gputests.yml b/.github/workflows/gputests.yml index 66e0707e0..c6ea98f76 100644 --- a/.github/workflows/gputests.yml +++ b/.github/workflows/gputests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, v4] + branch: [master, main] if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/slowtests.yml b/.github/workflows/slowtests.yml index f9fd3e817..4a4f08005 100644 --- a/.github/workflows/slowtests.yml +++ b/.github/workflows/slowtests.yml @@ -9,7 +9,7 @@ jobs: strategy: fail-fast: false matrix: - branch: [master, v4] + branch: [master, main] if: github.repository_owner == 'explosion' runs-on: ubuntu-latest steps: From 2d4067d021323e91ab80d40019f8e9792e2c8d36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Danie=CC=88l=20de=20Kok?= Date: Fri, 2 Feb 2024 11:39:07 +0100 Subject: [PATCH 8/8] Test if closing explicitly solves recursive lock issues --- spacy/language.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 0287549db..568d2d4fa 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1716,6 +1716,7 @@ class Language: # is done, so that they can exit gracefully. 
for q in texts_q: q.put(_WORK_DONE_SENTINEL) + q.close() # Otherwise, we are stopping because the error handler raised an # exception. The sentinel will be last to go out of the queue. @@ -2347,7 +2348,9 @@ def _apply_pipes( # Stop working if we encounter the end-of-work sentinel. if isinstance(texts_with_ctx, _WorkDoneSentinel): - return + sender.close() + receiver.close() + return docs = ( ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx @@ -2371,7 +2374,9 @@ def _apply_pipes( # Parent has closed the pipe prematurely. This happens when a # worker encounters an error and the error handler is set to # stop processing. - return + sender.close() + receiver.close() + return class _Sender: