mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Merge remote-tracking branch 'upstream/master' into maintenance/merge-main-into-develop-20240129
This commit is contained in:
		
						commit
						a36474600f
					
				| 
						 | 
					@ -13,7 +13,7 @@ from .. import util
 | 
				
			||||||
from ..language import Language
 | 
					from ..language import Language
 | 
				
			||||||
from ..tokens import Doc
 | 
					from ..tokens import Doc
 | 
				
			||||||
from ..training import Corpus
 | 
					from ..training import Corpus
 | 
				
			||||||
from ._util import Arg, Opt, benchmark_cli, setup_gpu
 | 
					from ._util import Arg, Opt, benchmark_cli, import_code, setup_gpu
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
@benchmark_cli.command(
 | 
					@benchmark_cli.command(
 | 
				
			||||||
| 
						 | 
					@ -30,12 +30,14 @@ def benchmark_speed_cli(
 | 
				
			||||||
    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 | 
					    use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU"),
 | 
				
			||||||
    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
 | 
					    n_batches: int = Opt(50, "--batches", help="Minimum number of batches to benchmark", min=30,),
 | 
				
			||||||
    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
 | 
					    warmup_epochs: int = Opt(3, "--warmup", "-w", min=0, help="Number of iterations over the data for warmup"),
 | 
				
			||||||
 | 
					    code_path: Optional[Path] = Opt(None, "--code", "-c", help="Path to Python file with additional code (registered functions) to be imported"),
 | 
				
			||||||
    # fmt: on
 | 
					    # fmt: on
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
 | 
					    Benchmark a pipeline. Expects a loadable spaCy pipeline and benchmark
 | 
				
			||||||
    data in the binary .spacy format.
 | 
					    data in the binary .spacy format.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
 | 
					    import_code(code_path)
 | 
				
			||||||
    setup_gpu(use_gpu=use_gpu, silent=False)
 | 
					    setup_gpu(use_gpu=use_gpu, silent=False)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    nlp = util.load_model(model)
 | 
					    nlp = util.load_model(model)
 | 
				
			||||||
| 
						 | 
					@ -171,5 +173,5 @@ def print_outliers(sample: numpy.ndarray):
 | 
				
			||||||
def warmup(
 | 
					def warmup(
 | 
				
			||||||
    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 | 
					    nlp: Language, docs: List[Doc], warmup_epochs: int, batch_size: Optional[int]
 | 
				
			||||||
) -> numpy.ndarray:
 | 
					) -> numpy.ndarray:
 | 
				
			||||||
    docs = warmup_epochs * docs
 | 
					    docs = [doc.copy() for doc in docs * warmup_epochs]
 | 
				
			||||||
    return annotate(nlp, docs, batch_size)
 | 
					    return annotate(nlp, docs, batch_size)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -15,7 +15,12 @@ def doc_w_attrs(en_tokenizer):
 | 
				
			||||||
    Token.set_extension("_test_token", default="t0")
 | 
					    Token.set_extension("_test_token", default="t0")
 | 
				
			||||||
    doc[1]._._test_token = "t1"
 | 
					    doc[1]._._test_token = "t1"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    return doc
 | 
					    yield doc
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    Doc.remove_extension("_test_attr")
 | 
				
			||||||
 | 
					    Doc.remove_extension("_test_prop")
 | 
				
			||||||
 | 
					    Doc.remove_extension("_test_method")
 | 
				
			||||||
 | 
					    Token.remove_extension("_test_token")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
 | 
					def test_serialize_ext_attrs_from_bytes(doc_w_attrs):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1268,13 +1268,14 @@ the [binary `.spacy` format](/api/data-formats#binary-training). The pipeline is
 | 
				
			||||||
warmed up before any measurements are taken.
 | 
					warmed up before any measurements are taken.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```cli
 | 
					```cli
 | 
				
			||||||
$ python -m spacy benchmark speed [model] [data_path] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 | 
					$ python -m spacy benchmark speed [model] [data_path] [--code] [--batch_size] [--no-shuffle] [--gpu-id] [--batches] [--warmup]
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                 | Description                                                                                                                                                                          |
 | 
					| Name                 | Description                                                                                                                                                                          |
 | 
				
			||||||
| -------------------- | -------------------------------------------------------------------------------------------------------- |
 | 
					| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             |
 | 
					| `model`              | Pipeline to benchmark the speed of. Can be a package or a path to a data directory. ~~str (positional)~~                                                                             |
 | 
				
			||||||
| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             |
 | 
					| `data_path`          | Location of benchmark data in spaCy's [binary format](/api/data-formats#training). ~~Path (positional)~~                                                                             |
 | 
				
			||||||
 | 
					| `--code`, `-c`       | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
 | 
				
			||||||
| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       |
 | 
					| `--batch-size`, `-b` | Set the batch size. If not set, the pipeline's batch size is used. ~~Optional[int] \(option)~~                                                                                       |
 | 
				
			||||||
| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      |
 | 
					| `--no-shuffle`       | Do not shuffle documents in the benchmark data. ~~bool (flag)~~                                                                                                                      |
 | 
				
			||||||
| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
 | 
					| `--gpu-id`, `-g`     | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~                                                                                                                       |
 | 
				
			||||||
| 
						 | 
					@ -1296,6 +1297,9 @@ input formats are:
 | 
				
			||||||
 | 
					
 | 
				
			||||||
When a directory is provided it is traversed recursively to collect all files.
 | 
					When a directory is provided it is traversed recursively to collect all files.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					When loading a .spacy file, any potential annotations stored on the `Doc` that are not overwritten by the pipeline will be preserved.
 | 
				
			||||||
 | 
					If you want to evaluate the pipeline on raw text only, make sure that the .spacy file does not contain any annotations.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
```bash
 | 
					```bash
 | 
				
			||||||
$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 | 
					$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
 | 
				
			||||||
```
 | 
					```
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -13,7 +13,7 @@ between `Doc` objects.
 | 
				
			||||||
<Infobox variant ="warning">
 | 
					<Infobox variant ="warning">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Note that a `Vocab` instance is not static. It increases in size as texts with
 | 
					Note that a `Vocab` instance is not static. It increases in size as texts with
 | 
				
			||||||
new tokens are processed.
 | 
					new tokens are processed. Some models may have an empty vocab at initialization.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
</Infobox>
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -93,6 +93,7 @@ given string, you need to look it up in
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
> ```python
 | 
					> ```python
 | 
				
			||||||
 | 
					> nlp("I'm eating an apple")
 | 
				
			||||||
> apple = nlp.vocab.strings["apple"]
 | 
					> apple = nlp.vocab.strings["apple"]
 | 
				
			||||||
> oov = nlp.vocab.strings["dskfodkfos"]
 | 
					> oov = nlp.vocab.strings["dskfodkfos"]
 | 
				
			||||||
> assert apple in nlp.vocab
 | 
					> assert apple in nlp.vocab
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user