mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Merge pull request #13282 from danieldk/maintenance/merge-main-into-develop-20240129
Merge main into `develop`
This commit is contained in:
commit
c32c1289a9
|
@ -13,7 +13,7 @@ from .. import util
|
|||
from ..language import Language
|
||||
from ..tokens import Doc
|
||||
from ..training import Corpus
|
||||
from ._util import Arg, Opt, benchmark_cli, setup_gpu
|
||||
from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
|
||||
|
||||
|
||||
@benchmark_cli.command(
|
||||
|
@ -30,12 +30,14 @@ def benchmark_speed_cli(
|
|||
use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
|
||||
n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
|
||||
warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
|
||||
code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
|
||||
# fmt: on
|
||||
):
|
||||
"""
|
||||
Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
|
||||
data in the binary .spacy format.
|
||||
"""
|
||||
import_code(code_path)
|
||||
setup_gpu(use_gpu=use_gpu, silent=False)
|
||||
|
||||
nlp = util.load_model(model)
|
||||
|
@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
|
|||
def warmup(
|
||||
nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
|
||||
) -> numpy.ndarray:
|
||||
docs = warmup_epochs * docs
|
||||
docs = [doc.copy() for doc in docs * warmup_epochs]
|
||||
return annotate(nlp, docs, batch_size)
|
||||
|
|
|
@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
|
|||
Token.set_extension("_test_token", default="t0")
|
||||
doc[1]._._test_token = "t1"
|
||||
|
||||
return doc
|
||||
yield doc
|
||||
|
||||
Doc.remove_extension("_test_attr")
|
||||
Doc.remove_extension("_test_prop")
|
||||
Doc.remove_extension("_test_method")
|
||||
Token.remove_extension("_test_token")
|
||||
|
||||
|
||||
def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
|
||||
|
|
|
@ -1268,20 +1268,21 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
|
|||
warmed up before any measurements are taken.
|
||||
|
||||
```cli
|
||||
$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
|
||||
$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch-size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
|
||||
```
|
||||
|
||||
| Name | Description |
|
||||
| -------------------- | -------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ |
|
||||
| `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ |
|
||||
| `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3` ~~Optional[int] \(option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. |
|
||||
| Name | Description |
|
||||
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~ |
|
||||
| `data_path` | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~ |
|
||||
| `--code`, `-c` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
|
||||
| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~ |
|
||||
| `--no-shuffle` | Do not shuffle documents in the benchmark data. ~~bool (flag)~~ |
|
||||
| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
|
||||
| `--batches` | Number of batches to benchmark on. Defaults to `50`. ~~Optional[int] \(option)~~ |
|
||||
| `--warmup`, `-w` | Iterations over the benchmark data for warmup. Defaults to `3`. ~~Optional[int] \(option)~~ |
|
||||
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
|
||||
| **PRINTS** | Pipeline speed in words per second with a 95% confidence interval. |
|
||||
|
||||
## apply {id="apply", version="3.5", tag="command"}
|
||||
|
||||
|
@ -1296,6 +1297,9 @@ input formats are:
|
|||
|
||||
When a directory is provided it is traversed recursively to collect all files.
|
||||
|
||||
When loading a `.spacy` file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved.
|
||||
If you want to evaluate the pipeline on raw text only, make sure that the `.spacy` file does not contain any annotations.
|
||||
|
||||
```bash
|
||||
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
|
||||
```
|
||||
|
|
|
@ -13,7 +13,7 @@ between `Doc` objects.
|
|||
<Infobox variant="warning">
|
||||
|
||||
Note that a `Vocab` instance is not static. It increases in size as texts with
|
||||
new tokens are processed.
|
||||
new tokens are processed. Some models may have an empty vocab at initialization.
|
||||
|
||||
</Infobox>
|
||||
|
||||
|
@ -93,6 +93,7 @@ given string, you need to look it up in
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp("I'm eating an apple")
|
||||
> apple = nlp.vocab.strings["apple"]
|
||||
> oov = nlp.vocab.strings["dskfodkfos"]
|
||||
> assert apple in nlp.vocab
|
||||
|
|
Loading…
Reference in New Issue
Block a user