Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master-v3.2-3

This commit is contained in:
Adriane Boyd 2021-10-29 12:18:15 +02:00
commit 2d430958e1
18 changed files with 140 additions and 45 deletions

View File

@ -27,6 +27,7 @@ steps:
- script: python -m mypy spacy
displayName: 'Run mypy'
condition: ne(variables['python_version'], '3.10')
- task: DeleteFiles@1
inputs:

View File

@ -42,7 +42,7 @@ jobs:
imageName: "ubuntu-18.04"
python.version: "3.6"
# Python36Windows:
# imageName: "vs2017-win2016"
# imageName: "windows-2019"
# python.version: "3.6"
# Python36Mac:
# imageName: "macos-10.14"
@ -51,7 +51,7 @@ jobs:
# imageName: "ubuntu-18.04"
# python.version: "3.7"
Python37Windows:
imageName: "vs2017-win2016"
imageName: "windows-2019"
python.version: "3.7"
# Python37Mac:
# imageName: "macos-10.14"
@ -60,7 +60,7 @@ jobs:
# imageName: "ubuntu-18.04"
# python.version: "3.8"
# Python38Windows:
# imageName: "vs2017-win2016"
# imageName: "windows-2019"
# python.version: "3.8"
Python38Mac:
imageName: "macos-10.14"
@ -68,12 +68,21 @@ jobs:
Python39Linux:
imageName: "ubuntu-18.04"
python.version: "3.9"
Python39Windows:
imageName: "vs2017-win2016"
python.version: "3.9"
Python39Mac:
imageName: "macos-10.14"
python.version: "3.9"
# Python39Windows:
# imageName: "windows-2019"
# python.version: "3.9"
# Python39Mac:
# imageName: "macos-10.14"
# python.version: "3.9"
Python310Linux:
imageName: "ubuntu-20.04"
python.version: "3.10"
Python310Windows:
imageName: "windows-2019"
python.version: "3.10"
Python310Mac:
imageName: "macos-10.15"
python.version: "3.10"
maxParallel: 4
pool:
vmImage: $(imageName)

View File

@ -2,4 +2,5 @@
numpy==1.15.0; python_version<='3.7'
numpy==1.17.3; python_version=='3.8'
numpy==1.19.3; python_version=='3.9'
numpy; python_version>='3.10'
numpy==1.21.3; python_version=='3.10'
numpy; python_version>='3.11'

View File

@ -5,7 +5,7 @@ requires = [
"cymem>=2.0.2,<2.1.0",
"preshed>=3.0.2,<3.1.0",
"murmurhash>=0.28.0,<1.1.0",
"thinc>=8.0.11,<8.1.0",
"thinc>=8.0.12,<8.1.0",
"blis>=0.4.0,<0.8.0",
"pathy",
"numpy>=1.15.0",

View File

@ -3,7 +3,7 @@ spacy-legacy>=3.0.8,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.11,<8.1.0
thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0

View File

@ -21,6 +21,7 @@ classifiers =
Programming Language :: Python :: 3.7
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
@ -37,7 +38,7 @@ setup_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
murmurhash>=0.28.0,<1.1.0
thinc>=8.0.11,<8.1.0
thinc>=8.0.12,<8.1.0
install_requires =
# Our libraries
spacy-legacy>=3.0.8,<3.1.0
@ -45,7 +46,7 @@ install_requires =
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.0.11,<8.1.0
thinc>=8.0.12,<8.1.0
blis>=0.4.0,<0.8.0
wasabi>=0.8.1,<1.1.0
srsly>=2.4.1,<3.0.0
@ -97,6 +98,12 @@ cuda111 =
cupy-cuda111>=5.0.0b4,<10.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<10.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<10.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<10.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies
ja =
sudachipy>=0.4.9

View File

@ -1,4 +1,4 @@
from typing import Optional, Dict, Any
from typing import Optional, Dict, Any, Union
from pathlib import Path
from wasabi import msg
import typer
@ -46,12 +46,14 @@ def train_cli(
def train(
config_path: Path,
output_path: Optional[Path] = None,
config_path: Union[str, Path],
output_path: Optional[Union[str, Path]] = None,
*,
use_gpu: int = -1,
overrides: Dict[str, Any] = util.SimpleFrozenDict(),
):
config_path = util.ensure_path(config_path)
output_path = util.ensure_path(output_path)
# Make sure all files and paths exists if they are needed
if not config_path or (str(config_path) != "-" and not config_path.exists()):
msg.fail("Config file not found", config_path, exits=1)

View File

@ -893,6 +893,7 @@ class Errors:
"filename. Specify an epoch to resume from.")
E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
"Non-UD tags should use the `tag` property.")
E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")
# Deprecated model shortcuts, only used in errors and warnings

View File

@ -25,6 +25,7 @@ def test_build_dependencies():
"sudachipy",
"sudachidict_core",
"spacy-pkuseg",
"thinc-apple-ops",
]
# check requirements.txt

View File

@ -1,6 +1,6 @@
from typing import Dict, Iterable, Callable
import pytest
from thinc.api import Config
from thinc.api import Config, fix_random_seed
from spacy import Language
from spacy.util import load_model_from_config, registry, resolve_dot_names
from spacy.schemas import ConfigSchemaTraining
@ -64,8 +64,8 @@ def test_readers():
@pytest.mark.parametrize(
"reader,additional_config",
[
("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
],
)
@ -93,6 +93,7 @@ def test_cat_readers(reader, additional_config):
factory = "textcat_multilabel"
"""
config = Config().from_str(nlp_config_string)
fix_random_seed(config["training"]["seed"])
config["corpora"]["@readers"] = reader
config["corpora"].update(additional_config)
nlp = load_model_from_config(config, auto_fill=True)

View File

@ -194,11 +194,12 @@ cdef class Doc:
vocab (Vocab): A vocabulary object, which must match any models you
want to use (e.g. tokenizer, parser, entity recognizer).
words (Optional[List[str]]): A list of unicode strings to add to the document
as words. If `None`, defaults to empty list.
spaces (Optional[List[bool]]): A list of boolean values, of the same length as
words. True means that the word is followed by a space, False means
it is not. If `None`, defaults to `[True]*len(words)`
words (Optional[List[Union[str, int]]]): A list of unicode strings or
hash values to add to the document as words. If `None`, defaults to
empty list.
spaces (Optional[List[bool]]): A list of boolean values, of the same
length as `words`. `True` means that the word is followed by a space,
`False` means it is not. If `None`, defaults to `[True]*len(words)`.
user_data (dict or None): Optional extra data to attach to the Doc.
tags (Optional[List[str]]): A list of unicode strings, of the same
length as words, to assign as token.tag. Defaults to None.
@ -266,7 +267,10 @@ cdef class Doc:
elif isinstance(word, bytes):
raise ValueError(Errors.E028.format(value=word))
else:
lexeme = self.vocab.get_by_orth(self.mem, word)
try:
lexeme = self.vocab.get_by_orth(self.mem, word)
except TypeError:
raise TypeError(Errors.E1022.format(wtype=type(word)))
self.push_back(lexeme, has_space)
if heads is not None:

View File

@ -820,6 +820,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
| overrides | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~ |
| **CREATES** | The final trained pipeline and the best trained pipeline. |
### Calling the training function from Python {#train-function new="3.2"}
The training CLI exposes a `train` helper function that lets you run the
training just like `spacy train`. Usually it's easier to use the command line
directly, but if you need to kick off training from code, this is how to do it.
> #### Example
>
> ```python
> from spacy.cli.train import train
>
> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
>
> ```
| Name | Description |
| -------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `config_path` | Path to the config to use for training. ~~Union[str, Path]~~ |
| `output_path` | Optional name of directory to save output model in. If not provided, a model will not be saved. ~~Optional[Union[str, Path]]~~ |
| _keyword-only_ | |
| `use_gpu` | Which GPU to use. Defaults to -1 for no GPU. ~~int~~ |
| `overrides` | Values to override config settings. ~~Dict[str, Any]~~ |
## pretrain {#pretrain new="2.1" tag="command,experimental"}
Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline

View File

@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
| Name | Description |
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
| `words` | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str, int]]]~~ |
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
| _keyword-only_ | |
| `user\_data` | Optional extra data to attach to the Doc. ~~Dict~~ |

View File

@ -773,17 +773,17 @@ from the specified model. Intended for use in `[initialize.before_init]`.
> after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
> ```
Recursively wrap the models in each pipe using [NVTX](https://nvidia.github.io/NVTX/)
range markers. These markers aid in GPU profiling by attributing specific operations
to a ~~Model~~'s forward or backprop passes.
Recursively wrap the models in each pipe using
[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
profiling by attributing specific operations to a ~~Model~~'s forward or
backprop passes.
| Name | Description |
|------------------|------------------------------------------------------------------------------------------------------------------------------|
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `forward_color` | Color identifier for forward passes. Defaults to `-1`. ~~int~~ |
| `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~ |
| **CREATES** | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~ |
## Training data and alignment {#gold source="spacy/training"}
### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}

View File

@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
> $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
> ```
| Name | Description |
| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). |
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
| Name | Description |
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lookups` | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
| `transformers` | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline. |
| `ray` | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training). |
| `cuda`, ... | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options. |
| `apple` | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1. |
| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages). |
### conda {#conda}

View File

@ -301,8 +301,6 @@ fly without having to save to and load from disk.
$ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
```
<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
### Using variable interpolation {#config-interpolation}
Another very useful feature of the config system is that it supports variable
@ -1647,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.
## Internal training API {#api}
<Infobox variant="warning">
<Infobox variant="danger">
spaCy gives you full control over the training loop. However, for most use
cases, it's recommended to train your pipelines via the
@ -1659,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with
</Infobox>
### Training from a Python script {#api-train new="3.2"}
If you want to run the training from a Python script instead of using the
[`spacy train`](/api/cli#train) CLI command, you can call into the
[`train`](/api/cli#train-function) helper function directly. It takes the path
to the config file, an optional output directory and an optional dictionary of
[config overrides](#config-overrides).
```python
from spacy.cli.train import train
train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
```
### Internal training loop API {#api-loop}
<Infobox variant="warning">
This section documents how the training loop and updates to the `nlp` object
work internally. You typically shouldn't have to implement this in Python unless
you're writing your own trainable components. To train a pipeline, use
[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
function instead.
</Infobox>
The [`Example`](/api/example) object contains annotated training data, also
called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
that will hold the predictions, and another `Doc` object that holds the

View File

@ -1138,7 +1138,7 @@
{
"id": "deplacy",
"slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis",
"discreption": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
"description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
"github": "KoichiYasuoka/deplacy",
"image": "https://i.imgur.com/6uOI4Op.png",
"code_example": [
@ -1270,7 +1270,7 @@
"description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals tokenization, part-of-speech tagging, dependency parsing, etc. delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
"github": "chartbeat-labs/textacy",
"pip": "textacy",
"url": "https://chartbeat-labs.github.io/textacy/",
"url": "https://github.com/chartbeat-labs/textacy",
"author": "Burton DeWilde",
"author_links": {
"github": "bdewilde",

View File

@ -4,10 +4,12 @@ import { StaticQuery, graphql } from 'gatsby'
import { Quickstart, QS } from '../components/quickstart'
import { repo, DEFAULT_BRANCH } from '../components/util'
const DEFAULT_OS = 'mac'
const DEFAULT_PLATFORM = 'x86'
const DEFAULT_MODELS = ['en']
const DEFAULT_OPT = 'efficiency'
const DEFAULT_HARDWARE = 'cpu'
const DEFAULT_CUDA = 'cuda102'
const DEFAULT_CUDA = 'cuda113'
const CUDA = {
'8.0': 'cuda80',
'9.0': 'cuda90',
@ -19,11 +21,15 @@ const CUDA = {
'11.0': 'cuda110',
'11.1': 'cuda111',
'11.2': 'cuda112',
'11.3': 'cuda113',
'11.4': 'cuda114',
}
const LANG_EXTRAS = ['ja'] // only for languages with models
const QuickstartInstall = ({ id, title }) => {
const [train, setTrain] = useState(false)
const [platform, setPlatform] = useState(DEFAULT_PLATFORM)
const [os, setOs] = useState(DEFAULT_OS)
const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
const [cuda, setCuda] = useState(DEFAULT_CUDA)
const [selectedModels, setModels] = useState(DEFAULT_MODELS)
@ -33,15 +39,19 @@ const QuickstartInstall = ({ id, title }) => {
config: v => setTrain(v.includes('train')),
models: setModels,
optimize: v => setEfficiency(v.includes('efficiency')),
platform: v => setPlatform(v[0]),
os: v => setOs(v[0]),
}
const showDropdown = {
hardware: () => hardware === 'gpu',
}
const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
const apple = os === 'mac' && platform === 'arm'
const pipExtras = [
hardware === 'gpu' && cuda,
train && 'transformers',
train && 'lookups',
apple && 'apple',
...modelExtras,
]
.filter(e => e)
@ -62,6 +72,16 @@ const QuickstartInstall = ({ id, title }) => {
{ id: 'windows', title: 'Windows' },
{ id: 'linux', title: 'Linux' },
],
defaultValue: DEFAULT_OS,
},
{
id: 'platform',
title: 'Platform',
options: [
{ id: 'x86', title: 'x86', checked: true },
{ id: 'arm', title: 'ARM / M1' },
],
defaultValue: DEFAULT_PLATFORM,
},
{
id: 'package',