diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 37983cb1a..78e17c516 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -19,6 +19,7 @@ lang = "{{ lang }}" {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %} pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }} tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"} +batch_size = {{ 128 if hardware == "gpu" else 1000 }} [components] diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index d7fc46ea0..c9f82caa0 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -20,6 +20,8 @@ disabled = [] before_creation = null after_creation = null after_pipeline_creation = null +# Default batch size to use with nlp.pipe and nlp.evaluate +batch_size = 1000 [nlp.tokenizer] @tokenizers = "spacy.Tokenizer.v1" diff --git a/spacy/language.py b/spacy/language.py index 7530fa5df..995acbd7b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -121,6 +121,7 @@ class Language: max_length: int = 10 ** 6, meta: Dict[str, Any] = {}, create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None, + batch_size: int = 1000, **kwargs, ) -> None: """Initialise a Language object. @@ -138,6 +139,7 @@ class Language: 100,000 characters in one text. create_tokenizer (Callable): Function that takes the nlp object and returns a tokenizer. + batch_size (int): Default batch size for pipe and evaluate. 
DOCS: https://nightly.spacy.io/api/language#init """ @@ -173,6 +175,7 @@ class Language: tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]} create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"] self.tokenizer = create_tokenizer(self) + self.batch_size = batch_size def __init_subclass__(cls, **kwargs): super().__init_subclass__(**kwargs) @@ -1268,7 +1271,7 @@ class Language: self, examples: Iterable[Example], *, - batch_size: int = 256, + batch_size: Optional[int] = None, scorer: Optional[Scorer] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, scorer_cfg: Optional[Dict[str, Any]] = None, @@ -1276,7 +1279,7 @@ class Language: """Evaluate a model's pipeline components. examples (Iterable[Example]): `Example` objects. - batch_size (int): Batch size to use. + batch_size (Optional[int]): Batch size to use. scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one will be created. component_cfg (dict): An optional dictionary with extra keyword @@ -1288,6 +1291,8 @@ class Language: DOCS: https://nightly.spacy.io/api/language#evaluate """ validate_examples(examples, "Language.evaluate") + if batch_size is None: + batch_size = self.batch_size if component_cfg is None: component_cfg = {} if scorer_cfg is None: @@ -1366,7 +1371,7 @@ class Language: texts: Iterable[str], *, as_tuples: bool = False, - batch_size: int = 1000, + batch_size: Optional[int] = None, disable: Iterable[str] = SimpleFrozenList(), component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, n_process: int = 1, @@ -1377,7 +1382,7 @@ class Language: as_tuples (bool): If set to True, inputs should be a sequence of (text, context) tuples. Output will then be a sequence of (doc, context) tuples. Defaults to False. - batch_size (int): The number of texts to buffer. + batch_size (Optional[int]): The number of texts to buffer. disable (List[str]): Names of the pipeline components to disable. 
component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword arguments for specific components. @@ -1404,6 +1409,8 @@ class Language: return if component_cfg is None: component_cfg = {} + if batch_size is None: + batch_size = self.batch_size pipes = ( [] @@ -1618,6 +1625,7 @@ class Language: nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name) disabled_pipes = [*config["nlp"]["disabled"], *disable] nlp._disabled = set(p for p in disabled_pipes if p not in exclude) + nlp.batch_size = config["nlp"]["batch_size"] nlp.config = filled if auto_fill else config if after_pipeline_creation is not None: nlp = after_pipeline_creation(nlp) diff --git a/spacy/schemas.py b/spacy/schemas.py index 6f154a1ae..26d7166eb 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -329,6 +329,7 @@ class ConfigSchemaNlp(BaseModel): before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization") after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed") after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed") + batch_size: Optional[int] = Field(..., title="Default batch size") # fmt: on class Config: diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index c4cc5b1e4..67375de5b 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -63,6 +63,7 @@ your config and check that it's valid, you can run the > before_creation = null > after_creation = null > after_pipeline_creation = null +> batch_size = 1000 > > [nlp.tokenizer] > @tokenizers = "spacy.Tokenizer.v1" @@ -80,6 +81,7 @@ Defines the `nlp` object, its tokenizer and | `after_creation` | Optional 
[callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | | `tokenizer` | The tokenizer to use. Defaults to [`Tokenizer`](/api/tokenizer). ~~Callable[[str], Doc]~~ | +| `batch_size` | Default batch size for [`Language.pipe`](/api/language#pipe) and [`Language.evaluate`](/api/language#evaluate). Defaults to `1000`. ~~int~~ | ### components {#config-components tag="section"} diff --git a/website/docs/api/language.md b/website/docs/api/language.md index b2a5a776e..382415416 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -42,6 +42,7 @@ information in [`Language.meta`](/api/language#meta) and not to configure the | `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ | | `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ | | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ | +| `batch_size` | Default batch size for [`pipe`](#pipe) and [`evaluate`](#evaluate). Defaults to `1000`. ~~int~~ | ## Language.from_config {#from_config tag="classmethod" new="3"} @@ -195,7 +196,7 @@ more efficient than processing texts one-by-one. | `texts` | A sequence of strings. ~~Iterable[str]~~ | | _keyword-only_ | | | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ | -| `batch_size` | The number of texts to buffer. ~~int~~ | +| `batch_size` | The number of texts to buffer. If `None`, the default `nlp.batch_size` is used. 
~~Optional[int]~~ | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ | | `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | @@ -357,7 +358,7 @@ objects instead of tuples of `Doc` and `GoldParse` objects. | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ | | _keyword-only_ | | -| `batch_size` | The batch size to use. ~~int~~ | +| `batch_size` | The batch size to use. If `None`, the default `nlp.batch_size` is used. ~~Optional[int]~~ | | `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ | | `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |