Add nlp.batch_size setting

Add a default `batch_size` setting for `Language.pipe` and
`Language.evaluate` as `nlp.batch_size`.
This commit is contained in:
Adriane Boyd 2020-12-09 09:13:26 +01:00
parent e09588e6ca
commit fa8fa474a3
5 changed files with 18 additions and 5 deletions

View File

@@ -19,6 +19,7 @@ lang = "{{ lang }}"
 {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components %}
 pipeline = {{ full_pipeline|pprint()|replace("'", '"')|safe }}
 tokenizer = {"@tokenizers": "spacy.Tokenizer.v1"}
+batch_size = {{ 128 if hardware == "gpu" else 1000 }}
 
 [components]

View File

@@ -20,6 +20,8 @@ disabled = []
 before_creation = null
 after_creation = null
 after_pipeline_creation = null
+# Default batch size to use with nlp.pipe and nlp.evaluate
+batch_size = 1000
 
 [nlp.tokenizer]
 @tokenizers = "spacy.Tokenizer.v1"

View File

@@ -121,6 +121,7 @@ class Language:
         max_length: int = 10 ** 6,
         meta: Dict[str, Any] = {},
         create_tokenizer: Optional[Callable[["Language"], Callable[[str], Doc]]] = None,
+        batch_size: int = 1000,
         **kwargs,
     ) -> None:
         """Initialise a Language object.
@@ -138,6 +139,7 @@ class Language:
             100,000 characters in one text.
         create_tokenizer (Callable): Function that takes the nlp object and
             returns a tokenizer.
+        batch_size (int): Default batch size for pipe and evaluate.
 
         DOCS: https://nightly.spacy.io/api/language#init
         """
@@ -173,6 +175,7 @@ class Language:
             tokenizer_cfg = {"tokenizer": self._config["nlp"]["tokenizer"]}
             create_tokenizer = registry.resolve(tokenizer_cfg)["tokenizer"]
         self.tokenizer = create_tokenizer(self)
+        self.batch_size = batch_size
 
     def __init_subclass__(cls, **kwargs):
         super().__init_subclass__(**kwargs)
@@ -1268,7 +1271,7 @@ class Language:
         self,
         examples: Iterable[Example],
         *,
-        batch_size: int = 256,
+        batch_size: Optional[int] = None,
         scorer: Optional[Scorer] = None,
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         scorer_cfg: Optional[Dict[str, Any]] = None,
@@ -1288,6 +1291,8 @@ class Language:
         DOCS: https://nightly.spacy.io/api/language#evaluate
         """
         validate_examples(examples, "Language.evaluate")
+        if batch_size is None:
+            batch_size = self.batch_size
         if component_cfg is None:
             component_cfg = {}
         if scorer_cfg is None:
@@ -1366,7 +1371,7 @@ class Language:
         texts: Iterable[str],
         *,
         as_tuples: bool = False,
-        batch_size: int = 1000,
+        batch_size: Optional[int] = None,
         disable: Iterable[str] = SimpleFrozenList(),
         component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
         n_process: int = 1,
@@ -1377,7 +1382,7 @@ class Language:
         as_tuples (bool): If set to True, inputs should be a sequence of
             (text, context) tuples. Output will then be a sequence of
             (doc, context) tuples. Defaults to False.
-        batch_size (int): The number of texts to buffer.
+        batch_size (Optional[int]): The number of texts to buffer.
         disable (List[str]): Names of the pipeline components to disable.
         component_cfg (Dict[str, Dict]): An optional dictionary with extra keyword
             arguments for specific components.
@@ -1404,6 +1409,8 @@ class Language:
                 return
         if component_cfg is None:
             component_cfg = {}
+        if batch_size is None:
+            batch_size = self.batch_size
 
         pipes = (
             []
@@ -1618,6 +1625,7 @@ class Language:
                 nlp.add_pipe(source_name, source=source_nlps[model], name=pipe_name)
         disabled_pipes = [*config["nlp"]["disabled"], *disable]
         nlp._disabled = set(p for p in disabled_pipes if p not in exclude)
+        nlp.batch_size = config["nlp"]["batch_size"]
         nlp.config = filled if auto_fill else config
         if after_pipeline_creation is not None:
             nlp = after_pipeline_creation(nlp)

View File

@@ -329,6 +329,7 @@ class ConfigSchemaNlp(BaseModel):
     before_creation: Optional[Callable[[Type["Language"]], Type["Language"]]] = Field(..., title="Optional callback to modify Language class before initialization")
     after_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after creation and before the pipeline is constructed")
     after_pipeline_creation: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after the pipeline is constructed")
+    batch_size: Optional[int] = Field(..., title="Default batch size")
     # fmt: on
 
     class Config:

View File

@@ -42,6 +42,7 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
 | `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
 | `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
+| `batch_size` | Default batch size for `pipe` and `evaluate`. Defaults to `1000`. ~~int~~ |
 
 ## Language.from_config {#from_config tag="classmethod" new="3"}
@@ -195,7 +196,7 @@ more efficient than processing texts one-by-one.
 | `texts` | A sequence of strings. ~~Iterable[str]~~ |
 | _keyword-only_ | |
 | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
-| `batch_size` | The number of texts to buffer. ~~int~~ |
+| `batch_size` | The number of texts to buffer. ~~Optional[int]~~ |
 | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
 | `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
@@ -357,7 +358,7 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
 | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
 | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
 | _keyword-only_ | |
-| `batch_size` | The batch size to use. ~~int~~ |
+| `batch_size` | The batch size to use. ~~Optional[int]~~ |
 | `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
 | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
 | `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |