Merge pull request #6067 from explosion/feature/spacy-blank-from-config
Commit 2214d1bb7b
@@ -18,6 +18,7 @@ from .util import registry, logger  # noqa: F401
 from .errors import Errors
 from .language import Language
+from .vocab import Vocab
 from . import util

@@ -46,12 +47,22 @@ def load(
     return util.load_model(name, disable=disable, exclude=exclude, config=config)


-def blank(name: str, **overrides) -> Language:
+def blank(
+    name: str,
+    *,
+    vocab: Union[Vocab, bool] = True,
+    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
+    meta: Dict[str, Any] = util.SimpleFrozenDict()
+) -> Language:
     """Create a blank nlp object for a given language code.

     name (str): The language code, e.g. "en".
-    **overrides: Keyword arguments passed to language subclass on init.
+    vocab (Vocab): A Vocab object. If True, a vocab is created.
+    config (Dict[str, Any] / Config): Optional config overrides.
+    meta (Dict[str, Any]): Overrides for nlp.meta.
     RETURNS (Language): The nlp object.
     """
     LangClass = util.get_lang_class(name)
-    return LangClass(**overrides)
+    # We should accept both dot notation and nested dict here for consistency
+    config = util.dot_to_dict(config)
+    return LangClass.from_config(config, meta=meta)

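A minimal usage sketch of the reworked `spacy.blank` signature (the override values are illustrative and mirror the test added further down):

```python
import spacy

# As before: a blank English pipeline with a freshly created Vocab.
nlp = spacy.blank("en")

# New here: config overrides (nested dict or dot notation) and meta overrides.
nlp = spacy.blank(
    "en",
    config={"training": {"dropout": 0.2}},
    meta={"name": "my_custom_model"},
)
assert nlp.config["training"]["dropout"] == 0.2
assert nlp.meta["name"] == "my_custom_model"
```
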
@@ -480,6 +480,9 @@ class Errors:
     E201 = ("Span index out of range.")

     # TODO: fix numbering after merging develop into master
+    E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
+            "values are an instance of spacy.vocab.Vocab or True to create one"
+            " (default).")
     E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
             "data that does not appear to be a binary classification problem "
             "with two labels. Labels found: {labels}")

@@ -144,6 +144,8 @@ class Language:
         self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
         self._pipe_configs: Dict[str, Config] = {}  # config by component

+        if not isinstance(vocab, Vocab) and vocab is not True:
+            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
         if vocab is True:
             vectors_name = meta.get("vectors", {}).get("name")
             vocab = create_vocab(

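A quick sketch of what the new check accepts and rejects (illustrative values):

```python
from spacy.lang.en import English
from spacy.vocab import Vocab

English(vocab=True)     # default: a new Vocab is created
English(vocab=Vocab())  # an existing Vocab instance is also valid

# Anything else now fails fast with E918 instead of breaking later:
try:
    English(vocab=["not", "a", "vocab"])
except ValueError as err:
    print(err)  # "Received invalid value for vocab: ..."
```
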
@@ -1458,6 +1460,7 @@ class Language:
         vocab: Union[Vocab, bool] = True,
         disable: Iterable[str] = SimpleFrozenList(),
         exclude: Iterable[str] = SimpleFrozenList(),
+        meta: Dict[str, Any] = SimpleFrozenDict(),
         auto_fill: bool = True,
         validate: bool = True,
     ) -> "Language":

@@ -1472,6 +1475,7 @@ class Language:
             explicitly enable them by calling nlp.enable_pipe.
         exclude (Iterable[str]): Names of pipeline components to exclude.
             Excluded components won't be loaded.
+        meta (Dict[str, Any]): Meta overrides for nlp.meta.
         auto_fill (bool): Automatically fill in missing values in config based
             on defaults and function argument annotations.
         validate (bool): Validate the component config and arguments against

@@ -1525,7 +1529,7 @@ class Language:
         # inside stuff like the spacy train function. If we loaded them here,
         # then we would load them twice at runtime: once when we make from config,
         # and then again when we load from disk.
-        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
+        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
         if after_creation is not None:
             nlp = after_creation(nlp)
         if not isinstance(nlp, cls):

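A rough sketch of how the new `meta` argument flows through `Language.from_config`; here an existing `nlp.config` stands in for a config loaded from disk:

```python
from spacy.lang.en import English

config = English().config  # placeholder for a loaded config.cfg
nlp = English.from_config(config, meta={"name": "my_custom_model"})
assert nlp.meta["name"] == "my_custom_model"
```
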
@@ -6,6 +6,7 @@ from spacy.vocab import Vocab
 from spacy.training import Example
 from spacy.lang.en import English
 from spacy.util import registry
+import spacy

 from .util import add_vecs_to_vocab, assert_docs_equal

@@ -266,3 +267,24 @@ def test_language_custom_tokenizer():
     assert [t.text for t in doc] == ["_hello", "_world"]
     doc = list(nlp.pipe(["hello world"]))[0]
     assert [t.text for t in doc] == ["_hello", "_world"]
+
+
+def test_spacy_blank():
+    nlp = spacy.blank("en")
+    assert nlp.config["training"]["dropout"] == 0.1
+    config = {"training": {"dropout": 0.2}}
+    meta = {"name": "my_custom_model"}
+    nlp = spacy.blank("en", config=config, meta=meta)
+    assert nlp.config["training"]["dropout"] == 0.2
+    assert nlp.meta["name"] == "my_custom_model"
+
+
+@pytest.mark.parametrize(
+    "value",
+    [False, None, ["x", "y"], Language, Vocab],
+)
+def test_language_init_invalid_vocab(value):
+    err_fragment = "invalid value"
+    with pytest.raises(ValueError) as e:
+        Language(value)
+    assert err_fragment in str(e)

@@ -17,7 +17,10 @@ return it.

 ## Language.\_\_init\_\_ {#init tag="method"}

-Initialize a `Language` object.
+Initialize a `Language` object. Note that the `meta` is only used for meta
+information in [`Language.meta`](/api/language#meta) and not to configure the
+`nlp` object or to override the config. To initialize from a config, use
+[`Language.from_config`](/api/language#from_config) instead.

 > #### Example
 >

@@ -37,7 +40,7 @@ Initialize a `Language` object.
 | `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
 | _keyword-only_ | |
 | `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
-| `meta` | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~ |
+| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |

 ## Language.from_config {#from_config tag="classmethod" new="3"}

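A short sketch of the behaviour the updated `meta` row (and the note above) describes: values passed here only surface in `nlp.meta` and do not configure the pipeline:

```python
from spacy.lang.en import English

nlp = English(meta={"name": "my_custom_model"})
assert nlp.meta["name"] == "my_custom_model"
# The config is untouched; to build an nlp object from a config,
# use Language.from_config instead.
```
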
@@ -59,10 +62,13 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config).
 > ```

 | Name | Description |
-| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
 | _keyword-only_ | |
-| `disable` | List of pipeline component names to disable. ~~Iterable[str]~~ |
+| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
+| `exclude` | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
+| `meta` | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~ |
 | `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
 | `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
 | **RETURNS** | The initialized object. ~~Language~~ |

@@ -797,10 +803,19 @@ token.ent_iob, token.ent_type

 ## Language.meta {#meta tag="property"}

-Custom meta data for the Language class. If a trained pipeline is loaded, this
+Meta data for the `Language` class, including name, version, data sources,
+license, author information and more. If a trained pipeline is loaded, this
 contains meta data of the pipeline. The `Language.meta` is also what's
-serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
-object to disk.
+serialized as the `meta.json` when you save an `nlp` object to disk. See the
+[meta data format](/api/data-formats#meta) for more details.
+
+<Infobox variant="warning" title="Changed in v3.0">
+
+As of v3.0, the meta only contains **meta information** about the pipeline and
+isn't used to construct the language class and pipeline components. This
+information is expressed in the [`config.cfg`](/api/data-formats#config).
+
+</Infobox>

 > #### Example
 >

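A rough sketch of the round trip this section describes (the path and meta values are illustrative):

```python
import spacy

nlp = spacy.blank("en", meta={"name": "my_custom_model", "author": "Jane Doe"})
print(nlp.meta["name"])       # "my_custom_model"
nlp.to_disk("/tmp/my_model")  # meta is written out as meta.json
```
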
@@ -79,8 +79,12 @@ Create a blank pipeline of a given language class. This function is the twin of
 > ```

 | Name | Description |
-| ----------- | -------------------------------------------------------------------------------------------------------- |
+| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
+| _keyword-only_ | |
+| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
+| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
+| `meta` <Tag variant="new">3</Tag> | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~ |
 | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |

 ### spacy.info {#spacy.info tag="function"}

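The two equivalent ways of passing `config` that the table describes, sketched with an illustrative override:

```python
import spacy

# Nested dict ...
nlp = spacy.blank("en", config={"training": {"dropout": 0.2}})

# ... or a flat dict keyed in dot notation; both end up in nlp.config.
nlp = spacy.blank("en", config={"training.dropout": 0.2})
assert nlp.config["training"]["dropout"] == 0.2
```
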
@@ -745,10 +749,10 @@ and create a `Language` object. The model data will then be loaded in via
 > ```

 | Name | Description |
-| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `name` | Package name or path. ~~str~~ |
 | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~. |
-| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
+| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~ |
 | `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
 | **RETURNS** | `Language` class with the loaded pipeline. ~~Language~~ |

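For reference, a sketch combining these arguments (the package name is an example and has to be installed separately):

```python
import spacy

nlp = spacy.load(
    "en_core_web_sm",                # any installed pipeline package or path
    disable=["ner"],                 # loaded, but not run until re-enabled
    exclude=["parser"],              # not loaded at all
    config={"nlp.batch_size": 128},  # dot-notation config override
)
nlp.enable_pipe("ner")               # turn a disabled component back on
```
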