mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 09:57:26 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						aaf01689a1
					
				| 
						 | 
					@ -18,6 +18,7 @@ from .util import registry, logger  # noqa: F401
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .errors import Errors
 | 
					from .errors import Errors
 | 
				
			||||||
from .language import Language
 | 
					from .language import Language
 | 
				
			||||||
 | 
					from .vocab import Vocab
 | 
				
			||||||
from . import util
 | 
					from . import util
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -46,12 +47,22 @@ def load(
 | 
				
			||||||
    return util.load_model(name, disable=disable, exclude=exclude, config=config)
 | 
					    return util.load_model(name, disable=disable, exclude=exclude, config=config)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def blank(name: str, **overrides) -> Language:
 | 
					def blank(
 | 
				
			||||||
 | 
					    name: str,
 | 
				
			||||||
 | 
					    *,
 | 
				
			||||||
 | 
					    vocab: Union[Vocab, bool] = True,
 | 
				
			||||||
 | 
					    config: Union[Dict[str, Any], Config] = util.SimpleFrozenDict(),
 | 
				
			||||||
 | 
					    meta: Dict[str, Any] = util.SimpleFrozenDict()
 | 
				
			||||||
 | 
					) -> Language:
 | 
				
			||||||
    """Create a blank nlp object for a given language code.
 | 
					    """Create a blank nlp object for a given language code.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    name (str): The language code, e.g. "en".
 | 
					    name (str): The language code, e.g. "en".
 | 
				
			||||||
    **overrides: Keyword arguments passed to language subclass on init.
 | 
					    vocab (Vocab): A Vocab object. If True, a vocab is created.
 | 
				
			||||||
 | 
					    config (Dict[str, Any] / Config): Optional config overrides.
 | 
				
			||||||
 | 
					    meta (Dict[str, Any]): Overrides for nlp.meta.
 | 
				
			||||||
    RETURNS (Language): The nlp object.
 | 
					    RETURNS (Language): The nlp object.
 | 
				
			||||||
    """
 | 
					    """
 | 
				
			||||||
    LangClass = util.get_lang_class(name)
 | 
					    LangClass = util.get_lang_class(name)
 | 
				
			||||||
    return LangClass(**overrides)
 | 
					    # We should accept both dot notation and nested dict here for consistency
 | 
				
			||||||
 | 
					    config = util.dot_to_dict(config)
 | 
				
			||||||
 | 
					    return LangClass.from_config(config, meta=meta)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -480,6 +480,9 @@ class Errors:
 | 
				
			||||||
    E201 = ("Span index out of range.")
 | 
					    E201 = ("Span index out of range.")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    # TODO: fix numbering after merging develop into master
 | 
					    # TODO: fix numbering after merging develop into master
 | 
				
			||||||
 | 
					    E918 = ("Received invalid value for vocab: {vocab} ({vocab_type}). Valid "
 | 
				
			||||||
 | 
					            "values are an instance of spacy.vocab.Vocab or True to create one"
 | 
				
			||||||
 | 
					            " (default).")
 | 
				
			||||||
    E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
 | 
					    E919 = ("A textcat 'positive_label' '{pos_label}' was provided for training "
 | 
				
			||||||
            "data that does not appear to be a binary classification problem "
 | 
					            "data that does not appear to be a binary classification problem "
 | 
				
			||||||
            "with two labels. Labels found: {labels}")
 | 
					            "with two labels. Labels found: {labels}")
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -144,6 +144,8 @@ class Language:
 | 
				
			||||||
        self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
 | 
					        self._pipe_meta: Dict[str, "FactoryMeta"] = {}  # meta by component
 | 
				
			||||||
        self._pipe_configs: Dict[str, Config] = {}  # config by component
 | 
					        self._pipe_configs: Dict[str, Config] = {}  # config by component
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if not isinstance(vocab, Vocab) and vocab is not True:
 | 
				
			||||||
 | 
					            raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
 | 
				
			||||||
        if vocab is True:
 | 
					        if vocab is True:
 | 
				
			||||||
            vectors_name = meta.get("vectors", {}).get("name")
 | 
					            vectors_name = meta.get("vectors", {}).get("name")
 | 
				
			||||||
            vocab = create_vocab(
 | 
					            vocab = create_vocab(
 | 
				
			||||||
| 
						 | 
					@ -1460,6 +1462,7 @@ class Language:
 | 
				
			||||||
        vocab: Union[Vocab, bool] = True,
 | 
					        vocab: Union[Vocab, bool] = True,
 | 
				
			||||||
        disable: Iterable[str] = SimpleFrozenList(),
 | 
					        disable: Iterable[str] = SimpleFrozenList(),
 | 
				
			||||||
        exclude: Iterable[str] = SimpleFrozenList(),
 | 
					        exclude: Iterable[str] = SimpleFrozenList(),
 | 
				
			||||||
 | 
					        meta: Dict[str, Any] = SimpleFrozenDict(),
 | 
				
			||||||
        auto_fill: bool = True,
 | 
					        auto_fill: bool = True,
 | 
				
			||||||
        validate: bool = True,
 | 
					        validate: bool = True,
 | 
				
			||||||
    ) -> "Language":
 | 
					    ) -> "Language":
 | 
				
			||||||
| 
						 | 
					@ -1474,6 +1477,7 @@ class Language:
 | 
				
			||||||
            explicitly enable them by calling nlp.enable_pipe.
 | 
					            explicitly enable them by calling nlp.enable_pipe.
 | 
				
			||||||
        exclude (Iterable[str]): Names of pipeline components to exclude.
 | 
					        exclude (Iterable[str]): Names of pipeline components to exclude.
 | 
				
			||||||
            Excluded components won't be loaded.
 | 
					            Excluded components won't be loaded.
 | 
				
			||||||
 | 
					        meta (Dict[str, Any]): Meta overrides for nlp.meta.
 | 
				
			||||||
        auto_fill (bool): Automatically fill in missing values in config based
 | 
					        auto_fill (bool): Automatically fill in missing values in config based
 | 
				
			||||||
            on defaults and function argument annotations.
 | 
					            on defaults and function argument annotations.
 | 
				
			||||||
        validate (bool): Validate the component config and arguments against
 | 
					        validate (bool): Validate the component config and arguments against
 | 
				
			||||||
| 
						 | 
					@ -1527,7 +1531,7 @@ class Language:
 | 
				
			||||||
        # inside stuff like the spacy train function. If we loaded them here,
 | 
					        # inside stuff like the spacy train function. If we loaded them here,
 | 
				
			||||||
        # then we would load them twice at runtime: once when we make from config,
 | 
					        # then we would load them twice at runtime: once when we make from config,
 | 
				
			||||||
        # and then again when we load from disk.
 | 
					        # and then again when we load from disk.
 | 
				
			||||||
        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer)
 | 
					        nlp = lang_cls(vocab=vocab, create_tokenizer=create_tokenizer, meta=meta)
 | 
				
			||||||
        if after_creation is not None:
 | 
					        if after_creation is not None:
 | 
				
			||||||
            nlp = after_creation(nlp)
 | 
					            nlp = after_creation(nlp)
 | 
				
			||||||
            if not isinstance(nlp, cls):
 | 
					            if not isinstance(nlp, cls):
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -7,6 +7,7 @@ from spacy.training import Example
 | 
				
			||||||
from spacy.lang.en import English
 | 
					from spacy.lang.en import English
 | 
				
			||||||
from spacy.lang.de import German
 | 
					from spacy.lang.de import German
 | 
				
			||||||
from spacy.util import registry
 | 
					from spacy.util import registry
 | 
				
			||||||
 | 
					import spacy
 | 
				
			||||||
 | 
					
 | 
				
			||||||
from .util import add_vecs_to_vocab, assert_docs_equal
 | 
					from .util import add_vecs_to_vocab, assert_docs_equal
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -277,3 +278,24 @@ def test_language_from_config_invalid_lang():
 | 
				
			||||||
        Language.from_config(config)
 | 
					        Language.from_config(config)
 | 
				
			||||||
    with pytest.raises(ValueError):
 | 
					    with pytest.raises(ValueError):
 | 
				
			||||||
        German.from_config(config)
 | 
					        German.from_config(config)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_spacy_blank():
 | 
				
			||||||
 | 
					    nlp = spacy.blank("en")
 | 
				
			||||||
 | 
					    assert nlp.config["training"]["dropout"] == 0.1
 | 
				
			||||||
 | 
					    config = {"training": {"dropout": 0.2}}
 | 
				
			||||||
 | 
					    meta = {"name": "my_custom_model"}
 | 
				
			||||||
 | 
					    nlp = spacy.blank("en", config=config, meta=meta)
 | 
				
			||||||
 | 
					    assert nlp.config["training"]["dropout"] == 0.2
 | 
				
			||||||
 | 
					    assert nlp.meta["name"] == "my_custom_model"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					@pytest.mark.parametrize(
 | 
				
			||||||
 | 
					    "value",
 | 
				
			||||||
 | 
					    [False, None, ["x", "y"], Language, Vocab],
 | 
				
			||||||
 | 
					)
 | 
				
			||||||
 | 
					def test_language_init_invalid_vocab(value):
 | 
				
			||||||
 | 
					    err_fragment = "invalid value"
 | 
				
			||||||
 | 
					    with pytest.raises(ValueError) as e:
 | 
				
			||||||
 | 
					        Language(value)
 | 
				
			||||||
 | 
					    assert err_fragment in str(e)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -17,7 +17,10 @@ return it.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Language.\_\_init\_\_ {#init tag="method"}
 | 
					## Language.\_\_init\_\_ {#init tag="method"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Initialize a `Language` object.
 | 
					Initialize a `Language` object. Note that the `meta` is only used for meta
 | 
				
			||||||
 | 
					information in [`Language.meta`](/api/language#meta) and not to configure the
 | 
				
			||||||
 | 
					`nlp` object or to override the config. To initialize from a config, use
 | 
				
			||||||
 | 
					[`Language.from_config`](/api/language#from_config) instead.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					@ -37,7 +40,7 @@ Initialize a `Language` object.
 | 
				
			||||||
| `vocab`            | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~                      |
 | 
					| `vocab`            | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~                      |
 | 
				
			||||||
| _keyword-only_     |                                                                                                                          |
 | 
					| _keyword-only_     |                                                                                                                          |
 | 
				
			||||||
| `max_length`       | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~                                    |
 | 
					| `max_length`       | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~                                    |
 | 
				
			||||||
| `meta`             | Custom meta data for the `Language` class. Is written to by pipelines to add meta data. ~~dict~~                         |
 | 
					| `meta`             | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~                                                        |
 | 
				
			||||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
 | 
					| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Language.from_config {#from_config tag="classmethod" new="3"}
 | 
					## Language.from_config {#from_config tag="classmethod" new="3"}
 | 
				
			||||||
| 
						 | 
					@ -58,14 +61,17 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config).
 | 
				
			||||||
> nlp = Language.from_config(config)
 | 
					> nlp = Language.from_config(config)
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name           | Description                                                                                                                                      |
 | 
					| Name           | Description                                                                                                                                                                                                                                      |
 | 
				
			||||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
					| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `config`       | The loaded config. ~~Union[Dict[str, Any], Config]~~                                                                                             |
 | 
					| `config`       | The loaded config. ~~Union[Dict[str, Any], Config]~~                                                                                                                                                                                             |
 | 
				
			||||||
| _keyword-only_ |                                                                                                                                                  |
 | 
					| _keyword-only_ |                                                                                                                                                                                                                                                  |
 | 
				
			||||||
| `disable`      | List of pipeline component names to disable. ~~Iterable[str]~~                                                                                   |
 | 
					| `vocab`        | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~                                                                                                                                              |
 | 
				
			||||||
| `auto_fill`    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
 | 
					| `disable`      | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | 
				
			||||||
| `validate`     | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~                   |
 | 
					| `exclude`      | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                             |
 | 
				
			||||||
| **RETURNS**    | The initialized object. ~~Language~~                                                                                                             |
 | 
					| `meta`         | [Meta data](/api/data-formats#meta) overrides. ~~Dict[str, Any]~~                                                                                                                                                                                |
 | 
				
			||||||
 | 
					| `auto_fill`    | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~                                                                                                 |
 | 
				
			||||||
 | 
					| `validate`     | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~                                                                                                                   |
 | 
				
			||||||
 | 
					| **RETURNS**    | The initialized object. ~~Language~~                                                                                                                                                                                                             |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Language.component {#component tag="classmethod" new="3"}
 | 
					## Language.component {#component tag="classmethod" new="3"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -797,10 +803,19 @@ token.ent_iob, token.ent_type
 | 
				
			||||||
 | 
					
 | 
				
			||||||
## Language.meta {#meta tag="property"}
 | 
					## Language.meta {#meta tag="property"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
Custom meta data for the Language class. If a trained pipeline is loaded, this
 | 
					Meta data for the `Language` class, including name, version, data sources,
 | 
				
			||||||
 | 
					license, author information and more. If a trained pipeline is loaded, this
 | 
				
			||||||
contains meta data of the pipeline. The `Language.meta` is also what's
 | 
					contains meta data of the pipeline. The `Language.meta` is also what's
 | 
				
			||||||
serialized as the [`meta.json`](/api/data-formats#meta) when you save an `nlp`
 | 
					serialized as the `meta.json` when you save an `nlp` object to disk. See the
 | 
				
			||||||
object to disk.
 | 
					[meta data format](/api/data-formats#meta) for more details.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					<Infobox variant="warning" title="Changed in v3.0">
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					As of v3.0, the meta only contains **meta information** about the pipeline and
 | 
				
			||||||
 | 
					isn't used to construct the language class and pipeline components. This
 | 
				
			||||||
 | 
					information is expressed in the [`config.cfg`](/api/data-formats#config).
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					</Infobox>
 | 
				
			||||||
 | 
					
 | 
				
			||||||
> #### Example
 | 
					> #### Example
 | 
				
			||||||
>
 | 
					>
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -78,10 +78,14 @@ Create a blank pipeline of a given language class. This function is the twin of
 | 
				
			||||||
> nlp_de = spacy.blank("de")   # equivalent to German()
 | 
					> nlp_de = spacy.blank("de")   # equivalent to German()
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name        | Description                                                                                              |
 | 
					| Name                                | Description                                                                                                                                                        |
 | 
				
			||||||
| ----------- | -------------------------------------------------------------------------------------------------------- |
 | 
					| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `name`      | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
 | 
					| `name`                              | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~                                                           |
 | 
				
			||||||
| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~                                     |
 | 
					| _keyword-only_                      |                                                                                                                                                                    |
 | 
				
			||||||
 | 
					| `vocab` <Tag variant="new">3</Tag>  | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                              |
 | 
				
			||||||
 | 
					| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
 | 
				
			||||||
 | 
					| `meta` <Tag variant="new">3</Tag>   | Optional meta overrides for [`nlp.meta`](/api/language#meta). ~~Dict[str, Any]~~                                                                                   |
 | 
				
			||||||
 | 
					| **RETURNS**                         | An empty `Language` object of the appropriate subclass. ~~Language~~                                                                                               |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### spacy.info {#spacy.info tag="function"}
 | 
					### spacy.info {#spacy.info tag="function"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -744,14 +748,14 @@ and create a `Language` object. The model data will then be loaded in via
 | 
				
			||||||
> nlp = util.load_model("/path/to/data")
 | 
					> nlp = util.load_model("/path/to/data")
 | 
				
			||||||
> ```
 | 
					> ```
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| Name                                 | Description                                                                                                                                                                                                                                    |
 | 
					| Name                                 | Description                                                                                                                                                                                                                                      |
 | 
				
			||||||
| ------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | 
					| ------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
				
			||||||
| `name`                               | Package name or path. ~~str~~                                                                                                                                                                                                                  |
 | 
					| `name`                               | Package name or path. ~~str~~                                                                                                                                                                                                                    |
 | 
				
			||||||
| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~.                                                                                                         |
 | 
					| `vocab` <Tag variant="new">3</Tag>   | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~                                                                                                            |
 | 
				
			||||||
| `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [nlp.enable_pipe](/api/language#enable_pipe). ~~List[str]~~ |
 | 
					| `disable`                            | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). Disabled pipes will be loaded but they won't be run unless you explicitly enable them by calling [`nlp.enable_pipe`](/api/language#enable_pipe). ~~List[str]~~ |
 | 
				
			||||||
| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                           |
 | 
					| `exclude` <Tag variant="new">3</Tag> | Names of pipeline components to [exclude](/usage/processing-pipelines#disabling). Excluded components won't be loaded. ~~List[str]~~                                                                                                             |
 | 
				
			||||||
| `config` <Tag variant="new">3</Tag>  | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~                                                                                                 |
 | 
					| `config` <Tag variant="new">3</Tag>  | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~                                                                                                   |
 | 
				
			||||||
| **RETURNS**                          | `Language` class with the loaded pipeline. ~~Language~~                                                                                                                                                                                        |
 | 
					| **RETURNS**                          | `Language` class with the loaded pipeline. ~~Language~~                                                                                                                                                                                          |
 | 
				
			||||||
 | 
					
 | 
				
			||||||
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
 | 
					### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in New Issue
	
	Block a user