mirror of https://github.com/explosion/spaCy.git (synced 2024-12-26 18:06:29 +03:00)
Merge pull request #9563 from adrianeboyd/chore/update-develop-from-master-v3.2-3
Update develop from master for v3.2
commit 5e9db156c2

.github/azure-steps.yml:
@@ -27,6 +27,7 @@ steps:

   - script: python -m mypy spacy
     displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')

   - task: DeleteFiles@1
     inputs:
@@ -42,7 +42,7 @@ jobs:
        imageName: "ubuntu-18.04"
        python.version: "3.6"
      # Python36Windows:
-     #   imageName: "vs2017-win2016"
+     #   imageName: "windows-2019"
      #   python.version: "3.6"
      # Python36Mac:
      #   imageName: "macos-10.14"
@@ -51,7 +51,7 @@ jobs:
      #   imageName: "ubuntu-18.04"
      #   python.version: "3.7"
      Python37Windows:
-       imageName: "vs2017-win2016"
+       imageName: "windows-2019"
        python.version: "3.7"
      # Python37Mac:
      #   imageName: "macos-10.14"
@@ -60,7 +60,7 @@ jobs:
      #   imageName: "ubuntu-18.04"
      #   python.version: "3.8"
      # Python38Windows:
-     #   imageName: "vs2017-win2016"
+     #   imageName: "windows-2019"
      #   python.version: "3.8"
      Python38Mac:
        imageName: "macos-10.14"
@@ -68,12 +68,21 @@ jobs:
      Python39Linux:
        imageName: "ubuntu-18.04"
        python.version: "3.9"
-     Python39Windows:
-       imageName: "vs2017-win2016"
-       python.version: "3.9"
-     Python39Mac:
-       imageName: "macos-10.14"
-       python.version: "3.9"
+     # Python39Windows:
+     #   imageName: "windows-2019"
+     #   python.version: "3.9"
+     # Python39Mac:
+     #   imageName: "macos-10.14"
+     #   python.version: "3.9"
+     Python310Linux:
+       imageName: "ubuntu-20.04"
+       python.version: "3.10"
+     Python310Windows:
+       imageName: "windows-2019"
+       python.version: "3.10"
+     Python310Mac:
+       imageName: "macos-10.15"
+       python.version: "3.10"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
@@ -2,4 +2,5 @@
 numpy==1.15.0; python_version<='3.7'
 numpy==1.17.3; python_version=='3.8'
 numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
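These build constraints pin the oldest numpy that each Python version supports, so binary wheels compiled against it stay ABI-compatible with newer numpy releases at runtime; the unpinned `python_version>='3.11'` entry is the catch-all for versions without an established floor yet.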
@@ -5,7 +5,7 @@ requires = [
     "cymem>=2.0.2,<2.1.0",
     "preshed>=3.0.2,<3.1.0",
     "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.11,<8.1.0",
+    "thinc>=8.0.12,<8.1.0",
     "blis>=0.4.0,<0.8.0",
     "pathy",
     "numpy>=1.15.0",
@@ -3,7 +3,7 @@ spacy-legacy>=3.0.8,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.11,<8.1.0
+thinc>=8.0.12,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
setup.cfg:
@@ -21,6 +21,7 @@ classifiers =
     Programming Language :: Python :: 3.7
     Programming Language :: Python :: 3.8
     Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
     Topic :: Scientific/Engineering
 project_urls =
     Release notes = https://github.com/explosion/spaCy/releases
@@ -37,7 +38,7 @@ setup_requires =
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
     murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
 install_requires =
     # Our libraries
     spacy-legacy>=3.0.8,<3.1.0
@@ -45,7 +46,7 @@ install_requires =
     murmurhash>=0.28.0,<1.1.0
     cymem>=2.0.2,<2.1.0
     preshed>=3.0.2,<3.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
     blis>=0.4.0,<0.8.0
     wasabi>=0.8.1,<1.1.0
     srsly>=2.4.1,<3.0.0
@@ -97,6 +98,12 @@ cuda111 =
     cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
     cupy-cuda112>=5.0.0b4,<10.0.0
+cuda113 =
+    cupy-cuda113>=5.0.0b4,<10.0.0
+cuda114 =
+    cupy-cuda114>=5.0.0b4,<10.0.0
+apple =
+    thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
     sudachipy>=0.4.9
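Each of these extras composes with pip's standard extras syntax; as illustrative commands, `pip install 'spacy[cuda113]'` pulls in `cupy-cuda113` for CUDA 11.3, and `pip install 'spacy[apple]'` pulls in `thinc-apple-ops` on Apple Silicon.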
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Union
 from pathlib import Path
 from wasabi import msg
 import typer
@@ -46,12 +46,14 @@ def train_cli(


 def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
     *,
     use_gpu: int = -1,
     overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
     # Make sure all files and paths exists if they are needed
     if not config_path or (str(config_path) != "-" and not config_path.exists()):
         msg.fail("Config file not found", config_path, exits=1)
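With `config_path` and `output_path` widened to `Union[str, Path]` and normalized via `util.ensure_path`, training can be kicked off from Python without building `Path` objects first. A minimal sketch, with illustrative file paths:

```python
from spacy.cli.train import train

# Both plain strings and pathlib.Path objects work here, since
# util.ensure_path() converts strings to Path internally.
train(
    "./config.cfg",
    output_path="./output",
    overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"},
)
```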
@@ -893,6 +893,7 @@ class Errors:
              "filename. Specify an epoch to resume from.")
     E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
              "Non-UD tags should use the `tag` property.")
+    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")


     # Deprecated model shortcuts, only used in errors and warnings
@@ -25,6 +25,7 @@ def test_build_dependencies():
         "sudachipy",
         "sudachidict_core",
         "spacy-pkuseg",
+        "thinc-apple-ops",
     ]

     # check requirements.txt
@@ -1,6 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
 from spacy import Language
 from spacy.util import load_model_from_config, registry, resolve_dot_names
 from spacy.schemas import ConfigSchemaTraining
@@ -64,8 +64,8 @@ def test_readers():
 @pytest.mark.parametrize(
     "reader,additional_config",
     [
-        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
-        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
         ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
     ],
 )
@@ -93,6 +93,7 @@ def test_cat_readers(reader, additional_config):
     factory = "textcat_multilabel"
     """
     config = Config().from_str(nlp_config_string)
+    fix_random_seed(config["training"]["seed"])
     config["corpora"]["@readers"] = reader
     config["corpora"].update(additional_config)
     nlp = load_model_from_config(config, auto_fill=True)
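The `fix_random_seed` call pins Thinc's random sources to the seed from the training config, so any shuffling or sampling the corpus readers perform is repeatable across the parametrized runs. A standalone sketch of the property the test relies on:

```python
import random

from thinc.api import fix_random_seed

# fix_random_seed seeds Python's random module and numpy (plus CuPy when
# available), so re-seeding replays the exact same sequence of draws.
fix_random_seed(0)
first = random.random()
fix_random_seed(0)
assert random.random() == first
```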
@@ -194,11 +194,12 @@ cdef class Doc:

     vocab (Vocab): A vocabulary object, which must match any models you
         want to use (e.g. tokenizer, parser, entity recognizer).
-    words (Optional[List[str]]): A list of unicode strings to add to the document
-        as words. If `None`, defaults to empty list.
-    spaces (Optional[List[bool]]): A list of boolean values, of the same length as
-        words. True means that the word is followed by a space, False means
-        it is not. If `None`, defaults to `[True]*len(words)`
+    words (Optional[List[Union[str, int]]]): A list of unicode strings or
+        hash values to add to the document as words. If `None`, defaults to
+        empty list.
+    spaces (Optional[List[bool]]): A list of boolean values, of the same
+        length as `words`. `True` means that the word is followed by a space,
+        `False` means it is not. If `None`, defaults to `[True]*len(words)`
     user_data (dict or None): Optional extra data to attach to the Doc.
     tags (Optional[List[str]]): A list of unicode strings, of the same
         length as words, to assign as token.tag. Defaults to None.
@@ -266,7 +267,10 @@ cdef class Doc:
             elif isinstance(word, bytes):
                 raise ValueError(Errors.E028.format(value=word))
             else:
-                lexeme = self.vocab.get_by_orth(self.mem, word)
+                try:
+                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                except TypeError:
+                    raise TypeError(Errors.E1022.format(wtype=type(word)))
             self.push_back(lexeme, has_space)

         if heads is not None:
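Together with the new `E1022` error above, this lets `Doc` accept integer hash values in `words` and fail fast on anything else. A minimal sketch of the new behavior (not part of the diff):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")

# StringStore.add() returns the string's hash, so the same tokens can be
# passed either as strings or as their hash values.
hashes = [nlp.vocab.strings.add(w) for w in ["hello", "world"]]
doc_from_strings = Doc(nlp.vocab, words=["hello", "world"])
doc_from_hashes = Doc(nlp.vocab, words=hashes)
assert doc_from_strings.text == doc_from_hashes.text

# Anything that is neither str nor int is now rejected with E1022 instead
# of an opaque failure inside Vocab.get_by_orth().
try:
    Doc(nlp.vocab, words=[3.14])
except TypeError as err:
    print(err)  # E1022: Words must be of type str or int ...
```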
@@ -820,6 +820,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | overrides   | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
 | **CREATES** | The final trained pipeline and the best trained pipeline.                                                                                                                                  |

+### Calling the training function from Python {#train-function new="3.2"}
+
+The training CLI exposes a `train` helper function that lets you run the
+training just like `spacy train`. Usually it's easier to use the command line
+directly, but if you need to kick off training from code this is how to do it.
+
+> #### Example
+>
+> ```python
+> from spacy.cli.train import train
+>
+> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+> ```
+
+| Name           | Description                                                                                                                    |
+| -------------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| `config_path`  | Path to the config to use for training. ~~Union[str, Path]~~                                                                   |
+| `output_path`  | Optional name of directory to save output model in. If not provided a model will not be saved. ~~Optional[Union[str, Path]]~~ |
+| _keyword-only_ |                                                                                                                                |
+| `use_gpu`      | Which GPU to use. Defaults to -1 for no GPU. ~~int~~                                                                           |
+| `overrides`    | Values to override config settings. ~~Dict[str, Any]~~                                                                         |
+
 ## pretrain {#pretrain new="2.1" tag="command,experimental"}

 Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
@@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | Name           | Description                                                                                                                                                                                  |
 | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`        | A storage container for lexical types. ~~Vocab~~                                                                                                                                             |
-| `words`        | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                           |
+| `words`        | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str,int]]]~~                                                                                 |
 | `spaces`       | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
 | _keyword-only_ |                                                                                                                                                                                              |
 | `user\_data`   | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                           |
@@ -773,17 +773,17 @@ from the specified model. Intended for use in `[initialize.before_init]`.
 > after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
 > ```

-Recursively wrap the models in each pipe using [NVTX](https://nvidia.github.io/NVTX/)
-range markers. These markers aid in GPU profiling by attributing specific operations
-to a ~~Model~~'s forward or backprop passes.
+Recursively wrap the models in each pipe using
+[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
+profiling by attributing specific operations to a ~~Model~~'s forward or
+backprop passes.

 | Name             | Description                                                                                                                    |
-|------------------|------------------------------------------------------------------------------------------------------------------------------|
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
 | `forward_color`  | Color identifier for forward passes. Defaults to `-1`. ~~int~~                                                                 |
 | `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~                                                         |
 | **CREATES**      | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~   |

 ## Training data and alignment {#gold source="spacy/training"}

 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
@@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 > $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
 > ```

 | Name             | Description                                                                                                                                                                                                                                                     |
-| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models.   |
 | `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                      |
 | `ray`            | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                     |
 | `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                   |
-| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                    |
+| `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                                 |
+| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                          |

 ### conda {#conda}
@@ -301,8 +301,6 @@ fly without having to save to and load from disk.
 $ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
 ```

-<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
-
 ### Using variable interpolation {#config-interpolation}

 Another very useful feature of the config system is that it supports variable
@@ -1647,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.

 ## Internal training API {#api}

-<Infobox variant="warning">
+<Infobox variant="danger">

 spaCy gives you full control over the training loop. However, for most use
 cases, it's recommended to train your pipelines via the
@@ -1659,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with

 </Infobox>

+### Training from a Python script {#api-train new="3.2"}
+
+If you want to run the training from a Python script instead of using the
+[`spacy train`](/api/cli#train) CLI command, you can call into the
+[`train`](/api/cli#train-function) helper function directly. It takes the path
+to the config file, an optional output directory and an optional dictionary of
+[config overrides](#config-overrides).
+
+```python
+from spacy.cli.train import train
+
+train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+```
+
+### Internal training loop API {#api-loop}
+
+<Infobox variant="warning">
+
+This section documents how the training loop and updates to the `nlp` object
+work internally. You typically shouldn't have to implement this in Python unless
+you're writing your own trainable components. To train a pipeline, use
+[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
+function instead.
+
+</Infobox>
+
 The [`Example`](/api/example) object contains annotated training data, also
 called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
@@ -1138,7 +1138,7 @@
     {
         "id": "deplacy",
         "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis",
-        "discreption": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
+        "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
         "github": "KoichiYasuoka/deplacy",
         "image": "https://i.imgur.com/6uOI4Op.png",
         "code_example": [
@@ -1270,7 +1270,7 @@
         "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
         "github": "chartbeat-labs/textacy",
         "pip": "textacy",
-        "url": "https://chartbeat-labs.github.io/textacy/",
+        "url": "https://github.com/chartbeat-labs/textacy",
         "author": "Burton DeWilde",
         "author_links": {
             "github": "bdewilde",
@@ -4,10 +4,12 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo, DEFAULT_BRANCH } from '../components/util'

+const DEFAULT_OS = 'mac'
+const DEFAULT_PLATFORM = 'x86'
 const DEFAULT_MODELS = ['en']
 const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda102'
+const DEFAULT_CUDA = 'cuda113'
 const CUDA = {
     '8.0': 'cuda80',
     '9.0': 'cuda90',
@@ -19,11 +21,15 @@ const CUDA = {
     '11.0': 'cuda110',
     '11.1': 'cuda111',
     '11.2': 'cuda112',
+    '11.3': 'cuda113',
+    '11.4': 'cuda114',
 }
 const LANG_EXTRAS = ['ja'] // only for languages with models

 const QuickstartInstall = ({ id, title }) => {
     const [train, setTrain] = useState(false)
+    const [platform, setPlatform] = useState(DEFAULT_PLATFORM)
+    const [os, setOs] = useState(DEFAULT_OS)
     const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
     const [cuda, setCuda] = useState(DEFAULT_CUDA)
     const [selectedModels, setModels] = useState(DEFAULT_MODELS)
@@ -33,15 +39,19 @@ const QuickstartInstall = ({ id, title }) => {
         config: v => setTrain(v.includes('train')),
         models: setModels,
         optimize: v => setEfficiency(v.includes('efficiency')),
+        platform: v => setPlatform(v[0]),
+        os: v => setOs(v[0]),
     }
     const showDropdown = {
         hardware: () => hardware === 'gpu',
     }
     const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const apple = os === 'mac' && platform === 'arm'
     const pipExtras = [
         hardware === 'gpu' && cuda,
         train && 'transformers',
         train && 'lookups',
+        apple && 'apple',
         ...modelExtras,
     ]
         .filter(e => e)
@@ -62,6 +72,16 @@ const QuickstartInstall = ({ id, title }) => {
                 { id: 'windows', title: 'Windows' },
                 { id: 'linux', title: 'Linux' },
             ],
+            defaultValue: DEFAULT_OS,
+        },
+        {
+            id: 'platform',
+            title: 'Platform',
+            options: [
+                { id: 'x86', title: 'x86', checked: true },
+                { id: 'arm', title: 'ARM / M1' },
+            ],
+            defaultValue: DEFAULT_PLATFORM,
         },
         {
             id: 'package',