mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 10:16:27 +03:00
Merge branch 'develop' of https://github.com/explosion/spaCy into develop [ci skip]
This commit is contained in:
parent
fb51b55eb9
commit
6ad59d59fe
|
@ -6,7 +6,7 @@ requires = [
|
|||
"cymem>=2.0.2,<2.1.0",
|
||||
"preshed>=3.0.2,<3.1.0",
|
||||
"murmurhash>=0.28.0,<1.1.0",
|
||||
"thinc>=8.0.0a27,<8.0.0a30",
|
||||
"thinc>=8.0.0a28,<8.0.0a30",
|
||||
"blis>=0.4.0,<0.5.0",
|
||||
"pytokenizations",
|
||||
"smart_open>=2.0.0,<3.0.0"
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
# Our libraries
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a27,<8.0.0a30
|
||||
thinc>=8.0.0a28,<8.0.0a30
|
||||
blis>=0.4.0,<0.5.0
|
||||
ml_datasets>=0.1.1
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
|
|
|
@ -34,13 +34,13 @@ setup_requires =
|
|||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
thinc>=8.0.0a27,<8.0.0a30
|
||||
thinc>=8.0.0a28,<8.0.0a30
|
||||
install_requires =
|
||||
# Our libraries
|
||||
murmurhash>=0.28.0,<1.1.0
|
||||
cymem>=2.0.2,<2.1.0
|
||||
preshed>=3.0.2,<3.1.0
|
||||
thinc>=8.0.0a27,<8.0.0a30
|
||||
thinc>=8.0.0a28,<8.0.0a30
|
||||
blis>=0.4.0,<0.5.0
|
||||
wasabi>=0.7.1,<1.1.0
|
||||
srsly>=2.1.0,<3.0.0
|
||||
|
|
|
@ -105,7 +105,7 @@ factory = "tok2vec"
|
|||
|
||||
[components.tok2vec.model.embed]
|
||||
@architectures = "spacy.MultiHashEmbed.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
rows = {{ 2000 if optimize == "efficiency" else 7000 }}
|
||||
also_embed_subwords = {{ true if has_letters else false }}
|
||||
also_use_static_vectors = {{ true if optimize == "accuracy" else false }}
|
||||
|
@ -127,7 +127,7 @@ nO = null
|
|||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{%- endif %}
|
||||
|
||||
{% if "parser" in components -%}
|
||||
|
@ -144,7 +144,7 @@ nO = null
|
|||
|
||||
[components.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{%- endif %}
|
||||
|
||||
{% if "ner" in components %}
|
||||
|
@ -161,7 +161,7 @@ nO = null
|
|||
|
||||
[components.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model.encode:width}
|
||||
width = ${components.tok2vec.model.encode.width}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
|
||||
|
@ -194,12 +194,12 @@ initial_rate = 5e-5
|
|||
|
||||
[training.train_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:train}
|
||||
path = ${paths.train}
|
||||
max_length = {{ 500 if hardware == "gpu" else 0 }}
|
||||
|
||||
[training.dev_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:dev}
|
||||
path = ${paths.dev}
|
||||
max_length = 0
|
||||
|
||||
{% if use_transformer %}
|
||||
|
|
|
@ -23,12 +23,12 @@ after_pipeline_creation = null
|
|||
|
||||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
seed = ${system:seed}
|
||||
seed = ${system.seed}
|
||||
dropout = 0.1
|
||||
accumulate_gradient = 1
|
||||
# Extra resources for transfer-learning or pseudo-rehearsal
|
||||
init_tok2vec = ${paths:init_tok2vec}
|
||||
raw_text = ${paths:raw}
|
||||
init_tok2vec = ${paths.init_tok2vec}
|
||||
raw_text = ${paths.raw}
|
||||
vectors = null
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
|
@ -42,7 +42,7 @@ frozen_components = []
|
|||
|
||||
[training.train_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:train}
|
||||
path = ${paths.train}
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
|
@ -54,7 +54,7 @@ limit = 0
|
|||
|
||||
[training.dev_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:dev}
|
||||
path = ${paths.dev}
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
|
@ -98,8 +98,8 @@ max_length = 500
|
|||
dropout = 0.2
|
||||
n_save_every = null
|
||||
batch_size = 3000
|
||||
seed = ${system:seed}
|
||||
use_pytorch_for_gpu_memory = ${system:use_pytorch_for_gpu_memory}
|
||||
seed = ${system.seed}
|
||||
use_pytorch_for_gpu_memory = ${system.use_pytorch_for_gpu_memory}
|
||||
tok2vec_model = "components.tok2vec.model"
|
||||
|
||||
[pretraining.objective]
|
||||
|
|
|
@ -20,11 +20,11 @@ dev = ""
|
|||
|
||||
[training.train_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:train}
|
||||
path = ${paths.train}
|
||||
|
||||
[training.dev_corpus]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths:dev}
|
||||
path = ${paths.dev}
|
||||
|
||||
[training.batcher]
|
||||
@batchers = "batch_by_words.v1"
|
||||
|
@ -57,7 +57,7 @@ factory = "tagger"
|
|||
|
||||
[components.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecListener.v1"
|
||||
width = ${components.tok2vec.model:width}
|
||||
width = ${components.tok2vec.model.width}
|
||||
"""
|
||||
|
||||
|
||||
|
@ -284,13 +284,13 @@ def test_config_overrides():
|
|||
|
||||
def test_config_interpolation():
|
||||
config = Config().from_str(nlp_config_string, interpolate=False)
|
||||
assert config["training"]["train_corpus"]["path"] == "${paths:train}"
|
||||
assert config["training"]["train_corpus"]["path"] == "${paths.train}"
|
||||
interpolated = config.interpolate()
|
||||
assert interpolated["training"]["train_corpus"]["path"] == ""
|
||||
nlp = English.from_config(config)
|
||||
assert nlp.config["training"]["train_corpus"]["path"] == "${paths:train}"
|
||||
assert nlp.config["training"]["train_corpus"]["path"] == "${paths.train}"
|
||||
# Ensure that variables are preserved in nlp config
|
||||
width = "${components.tok2vec.model:width}"
|
||||
width = "${components.tok2vec.model.width}"
|
||||
assert config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||
assert nlp.config["components"]["tagger"]["model"]["tok2vec"]["width"] == width
|
||||
interpolated2 = nlp.config.interpolate()
|
||||
|
|
|
@ -94,7 +94,7 @@ blog post for background.
|
|||
>
|
||||
> [components.tagger.model.tok2vec]
|
||||
> @architectures = "spacy.Tok2VecListener.v1"
|
||||
> width = ${components.tok2vec.model:width}
|
||||
> width = ${components.tok2vec.model.width}
|
||||
> ```
|
||||
|
||||
A listener is used as a sublayer within a component such as a
|
||||
|
|
|
@ -28,7 +28,7 @@ streaming.
|
|||
>
|
||||
> [training.train_corpus]
|
||||
> @readers = "spacy.Corpus.v1"
|
||||
> path = ${paths:train}
|
||||
> path = ${paths.train}
|
||||
> gold_preproc = false
|
||||
> max_length = 0
|
||||
> limit = 0
|
||||
|
|
|
@ -111,7 +111,7 @@ model to copy components from). See the docs on
|
|||
### paths, system {#config-variables tag="variables"}
|
||||
|
||||
These sections define values that can be referenced across the other sections
|
||||
as variables. For example `${paths:train}` uses the value of `train` defined in
|
||||
as variables. For example `${paths.train}` uses the value of `train` defined in
|
||||
the block `[paths]`. If your config includes custom registered functions that
|
||||
need paths, you can define them here. All config values can also be
|
||||
[overwritten](/usage/training#config-overrides) on the CLI when you run
|
||||
|
@ -131,11 +131,11 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
|||
|
||||
| Name | Description |
|
||||
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
|
||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
|
||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
|
||||
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths:init_tok2vec}`. ~~Optional[str]~~ |
|
||||
| `raw_text` | TODO: ... Defaults to variable `${paths:raw}`. ~~Optional[str]~~ |
|
||||
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~ |
|
||||
| `raw_text` | TODO: ... Defaults to variable `${paths.raw}`. ~~Optional[str]~~ |
|
||||
| `vectors` | Model name or path to model containing pretrained word vectors to use, e.g. created with [`init model`](/api/cli#init-model). Defaults to `null`. ~~Optional[str]~~ |
|
||||
| `patience` | How many steps to continue without improvement in evaluation score. Defaults to `1600`. ~~int~~ |
|
||||
| `max_epochs` | Maximum number of epochs to train for. Defaults to `0`. ~~int~~ |
|
||||
|
@ -162,8 +162,8 @@ run [`spacy pretrain`](/api/cli#pretrain).
|
|||
| `dropout` | The dropout rate. Defaults to `0.2`. ~~float~~ |
|
||||
| `n_save_every` | Saving frequency. Defaults to `null`. ~~Optional[int]~~ |
|
||||
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). Defaults to `3000`. ~~Union[int, Sequence[int]]~~ |
|
||||
| `seed` | The random seed. Defaults to variable `${system:seed}`. ~~int~~ |
|
||||
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system:use_pytorch_for_gpu_memory}`. ~~bool~~ |
|
||||
| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ |
|
||||
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. Defaults to variable `${system.use_pytorch_for_gpu_memory}`. ~~bool~~ |
|
||||
| `tok2vec_model` | The model section of the embedding component in the config. Defaults to `"components.tok2vec.model"`. ~~str~~ |
|
||||
| `objective` | The pretraining objective. Defaults to `{"type": "characters", "n_characters": 4}`. ~~Dict[str, Any]~~ |
|
||||
| `optimizer` | The optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ |
|
||||
|
|
|
@ -612,7 +612,7 @@ components are created, as well as all training settings and hyperparameters.
|
|||
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | Path to the model's `config.cfg`. ~~Union[str, Path]~~ |
|
||||
| `overrides` | Optional config overrides to replace in loaded config. Can be provided as nested dict, or as flat dict with keys in dot notation, e.g. `"nlp.pipeline"`. ~~Dict[str, Any]~~ |
|
||||
| `interpolate` | Whether to interpolate the config and replace variables like `${paths:train}` with their values. Defaults to `False`. ~~bool~~ |
|
||||
| `interpolate` | Whether to interpolate the config and replace variables like `${paths.train}` with their values. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | The model's config. ~~Config~~ |
|
||||
|
||||
### util.load_meta {#util.load_meta tag="function" new="3"}
|
||||
|
|
|
@ -157,8 +157,8 @@ sections of a config file are:
|
|||
| ------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | Definition of the `nlp` object, its tokenizer and [processing pipeline](/usage/processing-pipelines) component names. |
|
||||
| `components` | Definitions of the [pipeline components](/usage/processing-pipelines) and their models. |
|
||||
| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths:train}`, and can be [overwritten](#config-overrides) on the CLI. |
|
||||
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system:seed}`, and can be [overwritten](#config-overrides) on the CLI. |
|
||||
| `paths` | Paths to data and other assets. Re-used across the config as variables, e.g. `${paths.train}`, and can be [overwritten](#config-overrides) on the CLI. |
|
||||
| `system` | Settings related to system and hardware. Re-used across the config as variables, e.g. `${system.seed}`, and can be [overwritten](#config-overrides) on the CLI. |
|
||||
| `training` | Settings and controls for the training and evaluation process. |
|
||||
| `pretraining` | Optional settings and controls for the [language model pretraining](#pretraining). |
|
||||
|
||||
|
@ -325,19 +325,9 @@ compound = 1.001
|
|||
Another very useful feature of the config system is that it supports variable
|
||||
interpolation for both **values and sections**. This means that you only need to
|
||||
define a setting once and can reference it across your config using the
|
||||
`${section:value}` or `${section.block}` syntax. In this example, the value of
|
||||
`seed` is reused within the `[training]` block, and the whole block of
|
||||
`[training.optimizer]` is reused in `[pretraining]` and will become
|
||||
`pretraining.optimizer`.
|
||||
|
||||
> #### Note on syntax
|
||||
>
|
||||
> There are two different ways to format your variables, depending on whether
|
||||
> you want to reference a single value or a block. Values are specified after a
|
||||
> `:`, while blocks are specified with a `.`:
|
||||
>
|
||||
> 1. `${section:value}`, `${section.subsection:value}`
|
||||
> 2. `${section.block}`, `${section.subsection.block}`
|
||||
`${section.value}` syntax. In this example, the value of `seed` is reused within
|
||||
the `[training]` block, and the whole block of `[training.optimizer]` is reused
|
||||
in `[pretraining]` and will become `pretraining.optimizer`.
|
||||
|
||||
```ini
|
||||
### config.cfg (excerpt) {highlight="5,18"}
|
||||
|
@ -345,7 +335,7 @@ define a setting once and can reference it across your config using the
|
|||
seed = 0
|
||||
|
||||
[training]
|
||||
seed = ${system:seed}
|
||||
seed = ${system.seed}
|
||||
|
||||
[training.optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
|
@ -369,7 +359,7 @@ to a string.
|
|||
[paths]
|
||||
version = 5
|
||||
root = "/Users/you/data"
|
||||
train = "${paths:root}/train_${paths:version}.spacy"
|
||||
train = "${paths.root}/train_${paths.version}.spacy"
|
||||
# Result: /Users/you/data/train_5.spacy
|
||||
```
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user