Merge pull request #6067 from explosion/feature/spacy-blank-from-config

commit 2214d1bb7b, authored by Ines Montani on 2020-09-15 14:18:33 +02:00, committed by GitHub
6 changed files with 88 additions and 29 deletions


@@ -18,6 +18,7 @@ from .util import registry, logger  # noqa: F401
 from .errors import Errors
 from .language import Language
+from .vocab import Vocab
 from . import util
@@ -46,12 +47,22 @@ def load(
     return util.load_model(name, disable=disable, exclude=exclude, config=config)


-def blank(name: str, **overrides) -> Language:
+def blank(
+    name: str,
+    *,
+    vocab: Union[Vocab, bool] = True,
+    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
+    meta: Dict[str, Any] = util.SimpleFrozenDict()
+) -> Language:
     """Create a blank nlp object for a given language code.

     name (str): The language code, e.g. "en".
-    **overrides: Keyword arguments passed to language subclass on init.
+    vocab (Vocab): A Vocab object. If True, a vocab is created.
+    config (Dict[str, Any] / Config): Optional config overrides.
+    meta (Dict[str, Any]): Overrides for nlp.meta.
     RETURNS (Language): The nlp object.
     """
     LangClass = util.get_lang_class(name)
-    return LangClass(**overrides)
+    # We should accept both dot notation and nested dict here for consistency
+    config = util.dot_to_dict(config)
+    return LangClass.from_config(config, meta=meta)
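To make the new call signature concrete, here is a minimal usage sketch (not part of the diff; the dropout value and model name are only illustrative and mirror the test added further down):

```python
import spacy

# Config overrides can be passed as a nested dict ...
nlp = spacy.blank("en", config={"training": {"dropout": 0.2}})
assert nlp.config["training"]["dropout"] == 0.2

# ... or flat, keyed in dot notation (util.dot_to_dict normalizes this form)
nlp = spacy.blank("en", config={"training.dropout": 0.2})

# meta only overrides entries in nlp.meta; it doesn't configure the pipeline
nlp = spacy.blank("en", meta={"name": "my_custom_model"})
assert nlp.meta["name"] == "my_custom_model"
```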


@@ -480,6 +480,9 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
+            "values are an instance of spacy.vocab.Vocab or True to create one"
+            " (default).")
     E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
            "data that does not appear to be a binary classification problem "
            "with two labels. Labels found: {labels}")


@@ -144,6 +144,8 @@ class Language:
         self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
         self._pipe_configs: Dict[str, Config] = {}  # config by component
+        if not isinstance(vocab, Vocab) and vocab is not True:
+            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
             vocab = create_vocab(
@@ -1458,6 +1460,7 @@ class Language:
         vocab: Union[Vocab, bool] = True,
         disable: Iterable[str] = SimpleFrozenList(),
         exclude: Iterable[str] = SimpleFrozenList(),
+        meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
     ) -> "Language":
@@ -1472,6 +1475,7 @@
             explicitly enable them by calling nlp.enable_pipe.
         exclude (Iterable[str]): Names of pipeline components to exclude.
             Excluded components won't be loaded.
+        meta (Dict[str, Any]): Meta overrides for nlp.meta.
         auto_fill (bool): Automatically fill in missing values in config based
             on defaults and function argument annotations.
         validate (bool): Validate the component config and arguments against
@@ -1525,7 +1529,7 @@ class Language:
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):
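A short sketch of how the changed pieces fit together (not part of the diff; assumes the default auto-filled config): `meta` passed to `from_config` is forwarded to the constructor and only affects `nlp.meta`, and the new E918 check rejects anything that is not a `Vocab` instance or `True`.

```python
from spacy.lang.en import English
from spacy.vocab import Vocab

# meta is forwarded through from_config to the constructor; with auto_fill
# (the default) an empty config is filled in from the language defaults
nlp = English.from_config({}, meta={"name": "my_custom_model"})
assert nlp.meta["name"] == "my_custom_model"

# The new validation: vocab must be a Vocab instance or True
English(vocab=Vocab())   # OK: use a shared vocab
English(vocab=True)      # OK: create a new vocab (the default)
# English(vocab=None)    # would now raise ValueError with E918
```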


@@ -6,6 +6,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.util import registry
+import spacy

 from .util import add_vecs_to_vocab, assert_docs_equal
@@ -266,3 +267,24 @@ def test_language_custom_tokenizer():
     assert [t.text for t in doc] == ["_hello", "_world"]
     doc = list(nlp.pipe(["hello world"]))[0]
     assert [t.text for t in doc] == ["_hello", "_world"]
+
+
+def test_spacy_blank():
+    nlp = spacy.blank("en")
+    assert nlp.config["training"]["dropout"] == 0.1
+    config = {"training": {"dropout": 0.2}}
+    meta = {"name": "my_custom_model"}
+    nlp = spacy.blank("en", config=config, meta=meta)
+    assert nlp.config["training"]["dropout"] == 0.2
+    assert nlp.meta["name"] == "my_custom_model"
+
+
+@pytest.mark.parametrize(
+    "value",
+    [False, None, ["x", "y"], Language, Vocab],
+)
+def test_language_init_invalid_vocab(value):
+    err_fragment = "invalid value"
+    with pytest.raises(ValueError) as e:
+        Language(value)
+    assert err_fragment in str(e)


@@ -17,7 +17,10 @@ return it.
 ## Language.\_\_init\_\_ {#init tag="method"}

-Initialize a `Language` object.
+Initialize a `Language` object. Note that the `meta` is only used for meta
+information in [`Language.meta`](/api/language#meta) and not to configure the
+`nlp` object or to override the config. To initialize from a config, use
+[`Language.from_config`](/api/language#from_config) instead.

 > #### Example
 >
@@ -37,7 +40,7 @@ Initialize a `Language` object.
 | `vocab`            | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
 | _keyword-only_     | |
 | `max_length`       | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
-| `meta`             | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
+| `meta`             | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |

 ## Language.from_config {#from_config tag="classmethod" new="3"}
@@ -59,10 +62,13 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config).
 > ```

 | Name           | Description |
 | -------------- | ----------- |
 | `config`       | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
 | _keyword-only_ | |
-| `disable`      | List of pipeline component names to disable. ~~Iterable[str]~~ |
+| `vocab`        | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
+| `disable`      | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
+| `exclude`      | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
+| `meta`         | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `auto_fill`    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
 | `validate`     | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
 | **RETURNS**    | The initialized object. ~~Language~~ |
@@ -797,10 +803,19 @@ token.ent_iob, token.ent_type
 ## Language.meta {#meta tag="property"}

-Custom meta data for the Language class. If a trained pipeline is loaded, this
-contains meta data of the pipeline. The `Language.meta` is also what's
-serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
-object to disk.
+Meta data for the `Language` class, including name, version, data sources,
+license, author information and more. If a trained pipeline is loaded, this
+contains meta data of the pipeline. The `Language.meta` is also what's
+serialized as the `meta.json` when you save an `nlp` object to disk. See the
+[meta data format](/api/data-formats#meta) for more details.
+
+<Infobox variant="warning" title="Changed in v3.0">
+
+As of v3.0, the meta only contains **meta information** about the pipeline and
+isn't used to construct the language class and pipeline components. This
+information is expressed in the [`config.cfg`](/api/data-formats#config).
+
+</Infobox>

 > #### Example
 >
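To make the updated wording concrete, a small sketch contrasting `nlp.meta` with `nlp.config` (not part of the diff; the model name is only illustrative):

```python
import spacy

nlp = spacy.blank("en", meta={"name": "my_custom_model"})

# nlp.meta only carries descriptive information (name, version, sources, ...)
print(nlp.meta["name"])               # "my_custom_model"

# ... while everything used to construct the pipeline lives in the config
print(nlp.config["nlp"]["lang"])      # "en"
print(nlp.config["nlp"]["pipeline"])  # [] for a blank pipeline
```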


@@ -79,8 +79,12 @@ Create a blank pipeline of a given language class. This function is the twin of
 > ```

 | Name                                 | Description |
 | ------------------------------------ | ----------- |
 | `name`                               | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| _keyword-only_                       | |
+| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
+| `config` <Tag variant="new">3</Tag>  | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
+| `meta` <Tag variant="new">3</Tag>    | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ |
 | **RETURNS**                          | An empty `Language` object of the appropriate subclass. ~~Language~~ |

 ### spacy.info {#spacy.info tag="function"}
@@ -745,10 +749,10 @@ and create a `Language` object. The model data will then be loaded in via
 > ```

 | Name                                 | Description |
 | ------------------------------------ | ----------- |
 | `name`                               | Package name or path. ~~str~~ |
 | `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
-| `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
+| `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
 | `config` <Tag variant="new">3</Tag>  | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
 | **RETURNS**                          | `Language` class with the loaded pipeline. ~~Language~~ |
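Finally, a hedged example of the `spacy.load` options referenced in the table above (assumes a trained package such as `en_core_web_sm` is installed; any pipeline name would do):

```python
import spacy

# Disabled components are loaded but skipped; re-enable them on demand
nlp = spacy.load("en_core_web_sm", disable=["ner"])
nlp.enable_pipe("ner")

# Excluded components are not loaded at all
nlp = spacy.load("en_core_web_sm", exclude=["parser"])
```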