mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	Add nlp.batch_size setting
Add a default `batch_size` setting for `Language.pipe` and `Language.evaluate` as `nlp.batch_size`.
This commit is contained in:
		
							parent
							
								
									e09588e6ca
								
							
						
					
					
						commit
						fa8fa474a3
					
				| 
						 | 
				
			
			@ -19,6 +19,7 @@ lang = "{{ lang }}"
 | 
			
		|||
{%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
 | 
			
		||||
pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
 | 
			
		||||
tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
 | 
			
		||||
batch_size = {{ 128 if hardware == "gpu" else 1000 }}
 | 
			
		||||
 | 
			
		||||
[components]
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -20,6 +20,8 @@ disabled = []
 | 
			
		|||
before_creation = null
 | 
			
		||||
after_creation = null
 | 
			
		||||
after_pipeline_creation = null
 | 
			
		||||
# Default batch size to use with nlp.pipe and nlp.evaluate
 | 
			
		||||
batch_size = 1000
 | 
			
		||||
 | 
			
		||||
[nlp.tokenizer]
 | 
			
		||||
@tokenizers = "spacy.Tokenizer.v1"
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -121,6 +121,7 @@ class Language:
 | 
			
		|||
        max_length: int = 10 ** 6,
 | 
			
		||||
        meta: Dict[str, Any] = {},
 | 
			
		||||
        create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
 | 
			
		||||
        batch_size: int = 1000,
 | 
			
		||||
        **kwargs,
 | 
			
		||||
    ) -> None:
 | 
			
		||||
        """Initialise a Language object.
 | 
			
		||||
| 
						 | 
				
			
			@ -138,6 +139,7 @@ class Language:
 | 
			
		|||
            100,000 characters in one text.
 | 
			
		||||
        create_tokenizer (Callable): Function that takes the nlp object and
 | 
			
		||||
            returns a tokenizer.
 | 
			
		||||
        batch_size (int): Default batch size for pipe and evaluate.
 | 
			
		||||
 | 
			
		||||
        DOCS: https://nightly.spacy.io/api/language#init
 | 
			
		||||
        """
 | 
			
		||||
| 
						 | 
				
			
			@ -173,6 +175,7 @@ class Language:
 | 
			
		|||
            tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
 | 
			
		||||
            create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
 | 
			
		||||
        self.tokenizer = create_tokenizer(self)
 | 
			
		||||
        self.batch_size = batch_size
 | 
			
		||||
 | 
			
		||||
    def __init_subclass__(cls, **kwargs):
 | 
			
		||||
        super().__init_subclass__(**kwargs)
 | 
			
		||||
| 
						 | 
				
			
			@ -1268,7 +1271,7 @@ class Language:
 | 
			
		|||
        self,
 | 
			
		||||
        examples: Iterable[Example],
 | 
			
		||||
        *,
 | 
			
		||||
        batch_size: int = 256,
 | 
			
		||||
        batch_size: Optional[int] = None,
 | 
			
		||||
        scorer: Optional[Scorer] = None,
 | 
			
		||||
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
 | 
			
		||||
        scorer_cfg: Optional[Dict[str, Any]] = None,
 | 
			
		||||
| 
						 | 
				
			
			@ -1288,6 +1291,8 @@ class Language:
 | 
			
		|||
        DOCS: https://nightly.spacy.io/api/language#evaluate
 | 
			
		||||
        """
 | 
			
		||||
        validate_examples(examples, "Language.evaluate")
 | 
			
		||||
        if batch_size is None:
 | 
			
		||||
            batch_size = self.batch_size
 | 
			
		||||
        if component_cfg is None:
 | 
			
		||||
            component_cfg = {}
 | 
			
		||||
        if scorer_cfg is None:
 | 
			
		||||
| 
						 | 
				
			
			@ -1366,7 +1371,7 @@ class Language:
 | 
			
		|||
        texts: Iterable[str],
 | 
			
		||||
        *,
 | 
			
		||||
        as_tuples: bool = False,
 | 
			
		||||
        batch_size: int = 1000,
 | 
			
		||||
        batch_size: Optional[int] = None,
 | 
			
		||||
        disable: Iterable[str] = SimpleFrozenList(),
 | 
			
		||||
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
 | 
			
		||||
        n_process: int = 1,
 | 
			
		||||
| 
						 | 
				
			
			@ -1377,7 +1382,7 @@ class Language:
 | 
			
		|||
        as_tuples (bool): If set to True, inputs should be a sequence of
 | 
			
		||||
            (text, context) tuples. Output will then be a sequence of
 | 
			
		||||
            (doc, context) tuples. Defaults to False.
 | 
			
		||||
        batch_size (int): The number of texts to buffer.
 | 
			
		||||
        batch_size (Optional[int]): The number of texts to buffer.
 | 
			
		||||
        disable (List[str]): Names of the pipeline components to disable.
 | 
			
		||||
        component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
 | 
			
		||||
            arguments for specific components.
 | 
			
		||||
| 
						 | 
				
			
			@ -1404,6 +1409,8 @@ class Language:
 | 
			
		|||
            return
 | 
			
		||||
        if component_cfg is None:
 | 
			
		||||
            component_cfg = {}
 | 
			
		||||
        if batch_size is None:
 | 
			
		||||
            batch_size = self.batch_size
 | 
			
		||||
 | 
			
		||||
        pipes = (
 | 
			
		||||
            []
 | 
			
		||||
| 
						 | 
				
			
			@ -1618,6 +1625,7 @@ class Language:
 | 
			
		|||
                    nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
 | 
			
		||||
        disabled_pipes = [*config["nlp"]["disabled"], *disable]
 | 
			
		||||
        nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
 | 
			
		||||
        nlp.batch_size = config["nlp"]["batch_size"]
 | 
			
		||||
        nlp.config = filled if auto_fill else config
 | 
			
		||||
        if after_pipeline_creation is not None:
 | 
			
		||||
            nlp = after_pipeline_creation(nlp)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -329,6 +329,7 @@ class ConfigSchemaNlp(BaseModel):
 | 
			
		|||
    before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
 | 
			
		||||
    after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
 | 
			
		||||
    after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
 | 
			
		||||
    batch_size: Optional[int] = Field(..., title="Default batch size")
 | 
			
		||||
    # fmt: on
 | 
			
		||||
 | 
			
		||||
    class Config:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -42,6 +42,7 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
 | 
			
		|||
| `max_length`       | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~                                    |
 | 
			
		||||
| `meta`             | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~                                                        |
 | 
			
		||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
 | 
			
		||||
| `batch_size`       | Default batch size for `pipe` and `evaluate`. Defaults to `1000`. ~~int~~                                                |
 | 
			
		||||
 | 
			
		||||
## Language.from_config {#from_config tag="classmethod" new="3"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -195,7 +196,7 @@ more efficient than processing texts one-by-one.
 | 
			
		|||
| `texts`                                    | A sequence of strings. ~~Iterable[str]~~                                                                                                                            |
 | 
			
		||||
| _keyword-only_                             |                                                                                                                                                                     |
 | 
			
		||||
| `as_tuples`                                | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
 | 
			
		||||
| `batch_size`                               | The number of texts to buffer. ~~int~~                                                                                                                              |
 | 
			
		||||
| `batch_size`                               | The number of texts to buffer. If `None`, defaults to `nlp.batch_size`. ~~Optional[int]~~                                                                           |
 | 
			
		||||
| `disable`                                  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~                                                                     |
 | 
			
		||||
| `cleanup`                                  | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~                                                                                 |
 | 
			
		||||
| `component_cfg`                            | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~                      |
 | 
			
		||||
| 
						 | 
				
			
			@ -357,7 +358,7 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
 | 
			
		|||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `examples`      | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~                                                              |
 | 
			
		||||
| _keyword-only_  |                                                                                                                                                |
 | 
			
		||||
| `batch_size`    | The batch size to use. ~~int~~                                                                                                                 |
 | 
			
		||||
| `batch_size`    | The batch size to use. If `None`, defaults to `nlp.batch_size`. ~~Optional[int]~~                                                              |
 | 
			
		||||
| `scorer`        | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~                                     |
 | 
			
		||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | 
			
		||||
| `scorer_cfg`    | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~                                    |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in New Issue
	
	Block a user