diff --git a/spacy/__init__.py b/spacy/__init__.py
index 5c286ed80..7334b4149 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -18,6 +18,7 @@ from .util import registry, logger  # noqa: F401
 from .errors import Errors
 from .language import Language
+from .vocab import Vocab
 from . import util
@@ -46,12 +47,22 @@ def load(
     return util.load_model(name, disable=disable, exclude=exclude, config=config)
 
 
-def blank(name: str, **overrides) -> Language:
+def blank(
+    name: str,
+    *,
+    vocab: Union[Vocab, bool] = True,
+    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
+    meta: Dict[str, Any] = util.SimpleFrozenDict()
+) -> Language:
     """Create a blank nlp object for a given language code.
 
     name (str): The language code, e.g. "en".
-    **overrides: Keyword arguments passed to language subclass on init.
+    vocab (Vocab): A Vocab object. If True, a vocab is created.
+    config (Dict[str, Any] / Config): Optional config overrides.
+    meta (Dict[str, Any]): Overrides for nlp.meta.
     RETURNS (Language): The nlp object.
     """
     LangClass = util.get_lang_class(name)
+    # We should accept both dot notation and nested dict here for consistency
+    config = util.dot_to_dict(config)
-    return LangClass(**overrides)
+    return LangClass.from_config(config, vocab=vocab, meta=meta)
diff --git a/spacy/errors.py b/spacy/errors.py
index f857bea52..84593bede 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -480,6 +480,9 @@ class Errors:
     E201 = ("Span index out of range.")
 
     # TODO: fix numbering after merging develop into master
+    E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
+            "values are an instance of spacy.vocab.Vocab or True to create one"
+            " (default).")
     E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
             "data that does not appear to be a binary classification problem "
             "with two labels. Labels found: {labels}")
diff --git a/spacy/language.py b/spacy/language.py
index 905cdca36..543bcd8bc 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -144,6 +144,8 @@ class Language:
         self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
         self._pipe_configs: Dict[str, Config] = {}  # config by component
+        if not isinstance(vocab, Vocab) and vocab is not True:
+            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
             vocab = create_vocab(
@@ -1458,6 +1460,7 @@ class Language:
         vocab: Union[Vocab, bool] = True,
         disable: Iterable[str] = SimpleFrozenList(),
         exclude: Iterable[str] = SimpleFrozenList(),
+        meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
     ) -> "Language":
@@ -1472,6 +1475,7 @@ class Language:
             explicitly enable them by calling nlp.enable_pipe.
         exclude (Iterable[str]): Names of pipeline components to exclude.
             Excluded components won't be loaded.
+        meta (Dict[str, Any]): Meta overrides for nlp.meta.
         auto_fill (bool): Automatically fill in missing values in config based
             on defaults and function argument annotations.
         validate (bool): Validate the component config and arguments against
@@ -1525,7 +1529,7 @@ class Language:
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 840d878c2..4c689e524 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -6,6 +6,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.util import registry
+import spacy
 
 from .util import add_vecs_to_vocab, assert_docs_equal
 
@@ -266,3 +267,24 @@ def test_language_custom_tokenizer():
     assert [t.text for t in doc] == ["_hello", "_world"]
     doc = list(nlp.pipe(["hello world"]))[0]
     assert [t.text for t in doc] == ["_hello", "_world"]
+
+
+def test_spacy_blank():
+    nlp = spacy.blank("en")
+    assert nlp.config["training"]["dropout"] == 0.1
+    config = {"training": {"dropout": 0.2}}
+    meta = {"name": "my_custom_model"}
+    nlp = spacy.blank("en", config=config, meta=meta)
+    assert nlp.config["training"]["dropout"] == 0.2
+    assert nlp.meta["name"] == "my_custom_model"
+
+
+@pytest.mark.parametrize(
+    "value",
+    [False, None, ["x", "y"], Language, Vocab],
+)
+def test_language_init_invalid_vocab(value):
+    err_fragment = "invalid value"
+    with pytest.raises(ValueError) as e:
+        Language(value)
+    assert err_fragment in str(e.value)
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index c24023177..ffdae9ec6 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -17,7 +17,10 @@ return it.
 
 ## Language.\_\_init\_\_ {#init tag="method"}
 
-Initialize a `Language` object.
+Initialize a `Language` object. Note that the `meta` is only used for meta
+information in [`Language.meta`](/api/language#meta) and not to configure the
+`nlp` object or to override the config. To initialize from a config, use
+[`Language.from_config`](/api/language#from_config) instead.
 
 > #### Example
 >
@@ -37,7 +40,7 @@ Initialize a `Language` object.
 | `vocab`            | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
 | _keyword-only_     |             |
 | `max_length`       | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
-| `meta`             | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
+| `meta`             | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
 
 ## Language.from_config {#from_config tag="classmethod" new="3"}
 
 model under the hood based on its [`config.cfg`](/api/data-formats#config).
 
 > #### Example
 >
 > ```python
 > nlp = Language.from_config(config)
 > ```
 
-| Name           | Description |
-| -------------- | ----------- |
-| `config`       | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
-| _keyword-only_ |             |
-| `disable`      | List of pipeline component names to disable. ~~Iterable[str]~~ |
-| `auto_fill`    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
-| `validate`     | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
-| **RETURNS**    | The initialized object. ~~Language~~ |
+| Name           | Description |
+| -------------- | ----------- |
+| `config`       | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
+| _keyword-only_ |             |
+| `vocab`        | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
+| `disable`      | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
+| `exclude`      | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
+| `meta`         | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
+| `auto_fill`    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
+| `validate`     | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
+| **RETURNS**    | The initialized object. ~~Language~~ |
 
 ## Language.component {#component tag="classmethod" new="3"}
@@ -797,10 +803,19 @@ token.ent_iob, token.ent_type
 
 ## Language.meta {#meta tag="property"}
 
-Custom meta data for the Language class. If a trained pipeline is loaded, this
+Meta data for the `Language` class, including name, version, data sources,
+license, author information and more. If a trained pipeline is loaded, this
 contains meta data of the pipeline. The `Language.meta` is also what's
-serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
-object to disk.
+serialized as the `meta.json` when you save an `nlp` object to disk. See the
+[meta data format](/api/data-formats#meta) for more details.
+
+<Infobox title="Changed in v3.0" variant="warning">
+
+As of v3.0, the meta only contains **meta information** about the pipeline and
+isn't used to construct the language class and pipeline components. This
+information is expressed in the [`config.cfg`](/api/data-formats#config).
+
+</Infobox>
 
 > #### Example
 >
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index deae39f3d..f52c63f18 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -78,10 +78,14 @@ Create a blank pipeline of a given language class. This function is the twin of
 > nlp_de = spacy.blank("de")  # equivalent to German()
 > ```
 
-| Name        | Description |
-| ----------- | ----------- |
-| `name`      | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
-| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
+| Name           | Description |
+| -------------- | ----------- |
+| `name`         | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| _keyword-only_ |             |
+| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
+| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
+| `meta` <Tag variant="new">3</Tag> | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ |
+| **RETURNS**    | An empty `Language` object of the appropriate subclass. ~~Language~~ |
 
 ### spacy.info {#spacy.info tag="function"}
@@ -744,14 +748,14 @@ and create a `Language` object. The model data will then be loaded in via
 > nlp = util.load_model("/path/to/data")
 > ```
 
-| Name | Description |
-| ---- | ----------- |
-| `name` | Package name or path. ~~str~~ |
-| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
-| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
-| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
-| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
-| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
+| Name | Description |
+| ---- | ----------- |
+| `name` | Package name or path. ~~str~~ |
+| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
+| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
+| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
+| **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |
 
 ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
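
Reviewer note: a quick usage sketch of the new `spacy.blank` keyword arguments, mirroring the behavior exercised by `test_spacy_blank` in the patch above. This is not part of the patch itself; the dropout values and the `"my_custom_model"` name are illustrative only.

```python
import spacy

# Default behavior is unchanged: a blank English pipeline with a fresh Vocab.
nlp = spacy.blank("en")

# New keyword-only arguments: config overrides (nested dict or dot notation,
# expanded via util.dot_to_dict) and meta overrides for nlp.meta.
nlp = spacy.blank(
    "en",
    config={"training": {"dropout": 0.2}},  # equivalent: {"training.dropout": 0.2}
    meta={"name": "my_custom_model"},
)
assert nlp.config["training"]["dropout"] == 0.2
assert nlp.meta["name"] == "my_custom_model"

# Anything that isn't a Vocab instance or True now raises the new E918 error:
# spacy.blank("en", vocab=False)  # ValueError: Received invalid value for vocab ...
```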