Merge branch 'upstream_main' into feature/docwise-generator-batching

commit af336ac754
svlandeg committed 2024-02-06 13:57:41 +01:00
9 changed files with 39 additions and 25 deletions

View File

@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, v4]
+        branch: [master, main]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:

View File

@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        branch: [master, v4]
+        branch: [master, main]
     if: github.repository_owner == 'explosion'
     runs-on: ubuntu-latest
     steps:

View File

@@ -452,10 +452,9 @@ and plugins in spaCy v3.0, and we can't wait to see what you build with it!
   spaCy website. If you're sharing your project on Twitter, feel free to tag
   [@spacy_io](https://twitter.com/spacy_io) so we can check it out.
-- Once your extension is published, you can open an issue on the
-  [issue tracker](https://github.com/explosion/spacy/issues) to suggest it for the
-  [resources directory](https://spacy.io/usage/resources#extensions) on the
-  website.
+- Once your extension is published, you can open a
+  [PR](https://github.com/explosion/spaCy/pulls) to suggest it for the
+  [Universe](https://spacy.io/universe) page.

 📖 **For more tips and best practices, see the [checklist for developing spaCy extensions](https://spacy.io/usage/processing-pipelines#extensions).**

View File

@@ -13,7 +13,7 @@ from .. import util
 from ..language import Language
 from ..tokens import Doc
 from ..training import Corpus
-from ._util import Arg, Opt, benchmark_cli, setup_gpu
+from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu


 @benchmark_cli.command(
@@ -30,12 +30,14 @@ def benchmark_speed_cli(
     use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
     n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
     warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
+    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
     # fmt: on
 ):
     """
     Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
     data in the binary .spacy format.
     """
+    import_code(code_path)
     setup_gpu(use_gpu=use_gpu, silent=False)

     nlp = util.load_model(model)
@@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 def warmup(
     nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 ) -> numpy.ndarray:
-    docs = warmup_epochs * docs
+    docs = [doc.copy() for doc in docs * warmup_epochs]
     return annotate(nlp, docs, batch_size)
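
The `warmup` change above is worth unpacking: `warmup_epochs * docs` only repeats references to the same `Doc` objects, so after the first warmup epoch the pipeline would be fed documents it has already annotated. Copying gives every epoch fresh docs. A minimal sketch of the distinction, using a blank pipeline rather than the CLI's loaded model:

```python
import spacy

nlp = spacy.blank("en")
docs = [nlp.make_doc(t) for t in ["One text.", "Another text."]]

# List multiplication repeats references: both "epochs" contain
# the very same Doc objects.
repeated = docs * 2
assert repeated[0] is repeated[2]

# The fixed warmup() copies instead, so each epoch gets
# independent, unannotated Doc objects.
copied = [doc.copy() for doc in docs * 2]
assert copied[0] is not copied[2]
```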

View File

@@ -1830,6 +1830,7 @@ class Language:
             # is done, so that they can exit gracefully.
             for q in texts_q:
                 q.put(_WORK_DONE_SENTINEL)
+                q.close()

             # Otherwise, we are stopping because the error handler raised an
             # exception. The sentinel will be last to go out of the queue.
@@ -2464,7 +2465,8 @@ def _apply_pipes(
             # Stop working if we encounter the end-of-work sentinel.
             if isinstance(texts_with_ctx, _WorkDoneSentinel):
-                return
+                sender.close()
+                receiver.close()

             docs = (
                 ensure_doc(doc_like, context) for doc_like, context in texts_with_ctx
@@ -2488,7 +2490,8 @@ def _apply_pipes(
                 # Parent has closed the pipe prematurely. This happens when a
                 # worker encounters an error and the error handler is set to
                 # stop processing.
-                return
+                sender.close()
+                receiver.close()


 class _Sender:
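
Both `_apply_pipes` changes follow the same idea: once the end-of-work sentinel arrives, the worker closes its channel endpoints so the parent observes EOF instead of blocking on a half-open pipe. A rough standalone sketch of the pattern with `multiprocessing.Pipe` (illustrative only, not spaCy's actual worker):

```python
import multiprocessing as mp

SENTINEL = None  # stand-in for spaCy's _WORK_DONE_SENTINEL


def worker(receiver, sender):
    while True:
        item = receiver.recv()
        if item is SENTINEL:
            # Close both endpoints so the parent sees EOF instead of
            # blocking forever on a half-open pipe.
            sender.close()
            receiver.close()
            return
        sender.send(item.upper())


if __name__ == "__main__":
    parent_in, child_out = mp.Pipe(duplex=False)
    child_in, parent_out = mp.Pipe(duplex=False)
    proc = mp.Process(target=worker, args=(child_in, child_out))
    proc.start()
    parent_out.send("hello")
    print(parent_in.recv())  # "HELLO"
    parent_out.send(SENTINEL)
    proc.join()
```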

View File

@@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
     Token.set_extension("_test_token", default="t0")
     doc[1]._._test_token = "t1"

-    return doc
+    yield doc
+
+    Doc.remove_extension("_test_attr")
+    Doc.remove_extension("_test_prop")
+    Doc.remove_extension("_test_method")
+    Token.remove_extension("_test_token")


 def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
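
Switching the fixture from `return` to `yield` is the standard pytest teardown pattern: everything after the `yield` runs once the test finishes, so the globally registered extensions no longer leak into later tests. A generic sketch of the pattern (the `_demo` extension and fixture name are made up; `en_tokenizer` is assumed to be the suite's existing fixture):

```python
import pytest
from spacy.tokens import Doc


@pytest.fixture
def doc_with_demo_ext(en_tokenizer):
    # Setup: register a (hypothetical) process-global extension.
    Doc.set_extension("_demo", default=None)
    yield en_tokenizer("hello world")
    # Teardown: runs after each test that used the fixture, keeping the
    # global extension registry clean.
    Doc.remove_extension("_demo")
```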

View File

@@ -1267,20 +1267,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
 warmed up before any measurements are taken.

 ```cli
-$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
+$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 ```

 | Name                 | Description                                                                                               |
 | -------------------- | --------------------------------------------------------------------------------------------------------- |
 | `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~  |
 | `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~  |
+| `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~            |
 | `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                           |
 | `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                            |
 | `--batches`          | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~                          |
 | `--warmup`, `-w`     | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~                |
 | `--help`, `-h`       | Show help message and available arguments. ~~bool (flag)~~                                                |
 | **PRINTS**           | Pipeline speed in words per second with a 95% confidence interval.                                        |
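
The new `--code` option works like its counterpart on `spacy train`: the file is imported via `import_code` before the pipeline is loaded, so pipelines that depend on registered custom functions can be benchmarked. A small sketch of what such a file might contain (the component name is hypothetical):

```python
# functions.py -- used as:
#   python -m spacy benchmark speed ./my_model ./data.spacy --code functions.py
from spacy.language import Language


@Language.component("my_custom_component")  # hypothetical component name
def my_custom_component(doc):
    # A pipeline saved with this component can only be loaded once this
    # module has been imported and the component re-registered.
    return doc
```
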
## apply {id="apply", version="3.5", tag="command"}
@@ -1295,6 +1296,9 @@ input formats are:
 When a directory is provided it is traversed recursively to collect all files.

+When loading a .spacy file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved.
+If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
+
 ```bash
 $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 ```
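
Per the note added above, `spacy apply` keeps any annotations already stored in a `.spacy` file. To evaluate a pipeline against raw text only, one can build an annotation-free file from unprocessed docs; a sketch (paths and texts are placeholders):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")  # blank pipeline: tokenizes, adds no annotations
texts = ["First text.", "Second text."]
doc_bin = DocBin(docs=(nlp.make_doc(text) for text in texts))
doc_bin.to_disk("raw_texts.spacy")  # placeholder output path
```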

View File

@@ -1507,7 +1507,7 @@ These models all take the same parameters:
 > ```ini
 > [components.llm.model]
 > @llm_models = "spacy.Llama2.v1"
-> name = "llama2-7b-hf"
+> name = "Llama-2-7b-hf"
 > ```

 Currently, these models are provided as part of the core library:

View File

@@ -13,7 +13,7 @@ between `Doc` objects.
 <Infobox variant="warning">

 Note that a `Vocab` instance is not static. It increases in size as texts with
-new tokens are processed.
+new tokens are processed. Some models may have an empty vocab at initialization.

 </Infobox>
@@ -93,6 +93,7 @@ given string, you need to look it up in
 > #### Example
 >
 > ```python
+> nlp("I'm eating an apple")
 > apple = nlp.vocab.strings["apple"]
 > oov = nlp.vocab.strings["dskfodkfos"]
 > assert apple in nlp.vocab
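
The line added to the example makes it self-contained: strings and lexemes only enter the `Vocab` once a text containing them has been processed, which also ties back to the warning above about models starting with an empty vocab. A runnable sketch of that behavior:

```python
import spacy

nlp = spacy.blank("en")
assert "apple" not in nlp.vocab  # nothing processed yet

nlp("I'm eating an apple")  # tokenization adds the new lexemes
assert "apple" in nlp.vocab
apple_hash = nlp.vocab.strings["apple"]
print(apple_hash)  # the 64-bit hash spaCy uses in place of the string
```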