Merge remote-tracking branch 'upstream/master' into chore/update-develop-from-master-v3.2-3

2025-07-30 18:10:14 +03:00 · 2021-10-29 12:18:15 +02:00 · 2021-10-29 12:18:15 +02:00 · 2d430958e1
commit 2d430958e1
parent 12974bf4d9 006df1ae1f
18 changed files with 140 additions and 45 deletions
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@ -27,6 +27,7 @@ steps:

  - script: python -m mypy spacy
    displayName: 'Run mypy'
+    condition: ne(variables['python_version'], '3.10')

  - task: DeleteFiles@1
    inputs:
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -42,7 +42,7 @@ jobs:
          imageName: "ubuntu-18.04"
          python.version: "3.6"
        #        Python36Windows:
-        #          imageName: "vs2017-win2016"
+        #          imageName: "windows-2019"
        #          python.version: "3.6"
        #        Python36Mac:
        #          imageName: "macos-10.14"
@ -51,7 +51,7 @@ jobs:
        #          imageName: "ubuntu-18.04"
        #          python.version: "3.7"
        Python37Windows:
-          imageName: "vs2017-win2016"
+          imageName: "windows-2019"
          python.version: "3.7"
        #        Python37Mac:
        #          imageName: "macos-10.14"
@ -60,7 +60,7 @@ jobs:
        #          imageName: "ubuntu-18.04"
        #          python.version: "3.8"
        #        Python38Windows:
-        #          imageName: "vs2017-win2016"
+        #          imageName: "windows-2019"
        #          python.version: "3.8"
        Python38Mac:
          imageName: "macos-10.14"
@ -68,12 +68,21 @@ jobs:
        Python39Linux:
          imageName: "ubuntu-18.04"
          python.version: "3.9"
-        Python39Windows:
-          imageName: "vs2017-win2016"
-          python.version: "3.9"
-        Python39Mac:
-          imageName: "macos-10.14"
-          python.version: "3.9"
+        #        Python39Windows:
+        #          imageName: "windows-2019"
+        #          python.version: "3.9"
+        #        Python39Mac:
+        #          imageName: "macos-10.14"
+        #          python.version: "3.9"
+        Python310Linux:
+          imageName: "ubuntu-20.04"
+          python.version: "3.10"
+        Python310Windows:
+          imageName: "windows-2019"
+          python.version: "3.10"
+        Python310Mac:
+          imageName: "macos-10.15"
+          python.version: "3.10"
      maxParallel: 4
    pool:
      vmImage: $(imageName)
--- a/build-constraints.txt
+++ b/build-constraints.txt
@ -2,4 +2,5 @@
 numpy==1.15.0; python_version<='3.7'
 numpy==1.17.3; python_version=='3.8'
 numpy==1.19.3; python_version=='3.9'
-numpy; python_version>='3.10'
+numpy==1.21.3; python_version=='3.10'
+numpy; python_version>='3.11'
--- a/pyproject.toml
+++ b/pyproject.toml
@ -5,7 +5,7 @@ requires = [
    "cymem>=2.0.2,<2.1.0",
    "preshed>=3.0.2,<3.1.0",
    "murmurhash>=0.28.0,<1.1.0",
-    "thinc>=8.0.11,<8.1.0",
+    "thinc>=8.0.12,<8.1.0",
    "blis>=0.4.0,<0.8.0",
    "pathy",
    "numpy>=1.15.0",
--- a/requirements.txt
+++ b/requirements.txt
@ -3,7 +3,7 @@ spacy-legacy>=3.0.8,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
-thinc>=8.0.11,<8.1.0
+thinc>=8.0.12,<8.1.0
 blis>=0.4.0,<0.8.0
 ml_datasets>=0.2.0,<0.3.0
 murmurhash>=0.28.0,<1.1.0
--- a/setup.cfg
+++ b/setup.cfg
@ -21,6 +21,7 @@ classifiers =
    Programming Language :: Python :: 3.7
    Programming Language :: Python :: 3.8
    Programming Language :: Python :: 3.9
+    Programming Language :: Python :: 3.10
    Topic :: Scientific/Engineering
 project_urls =
    Release notes = https://github.com/explosion/spaCy/releases
@ -37,7 +38,7 @@ setup_requires =
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
    murmurhash>=0.28.0,<1.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
 install_requires =
    # Our libraries
    spacy-legacy>=3.0.8,<3.1.0
@ -45,7 +46,7 @@ install_requires =
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
    preshed>=3.0.2,<3.1.0
-    thinc>=8.0.11,<8.1.0
+    thinc>=8.0.12,<8.1.0
    blis>=0.4.0,<0.8.0
    wasabi>=0.8.1,<1.1.0
    srsly>=2.4.1,<3.0.0
@ -97,6 +98,12 @@ cuda111 =
    cupy-cuda111>=5.0.0b4,<10.0.0
 cuda112 =
    cupy-cuda112>=5.0.0b4,<10.0.0
+cuda113 =
+    cupy-cuda113>=5.0.0b4,<10.0.0
+cuda114 =
+    cupy-cuda114>=5.0.0b4,<10.0.0
+apple =
+    thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
 ja =
    sudachipy>=0.4.9
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, Union
 from pathlib import Path
 from wasabi import msg
 import typer
@ -46,12 +46,14 @@ def train_cli(


 def train(
-    config_path: Path,
-    output_path: Optional[Path] = None,
+    config_path: Union[str, Path],
+    output_path: Optional[Union[str, Path]] = None,
    *,
    use_gpu: int = -1,
    overrides: Dict[str, Any] = util.SimpleFrozenDict(),
 ):
+    config_path = util.ensure_path(config_path)
+    output_path = util.ensure_path(output_path)
    # Make sure all files and paths exists if they are needed
    if not config_path or (str(config_path) != "-" and not config_path.exists()):
        msg.fail("Config file not found", config_path, exits=1)
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -893,6 +893,7 @@ class Errors:
             "filename. Specify an epoch to resume from.")
    E1021 = ("`pos` value \"{pp}\" is not a valid Universal Dependencies tag. "
             "Non-UD tags should use the `tag` property.")
+    E1022 = ("Words must be of type str or int, but input is of type '{wtype}'")


 # Deprecated model shortcuts, only used in errors and warnings
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@ -25,6 +25,7 @@ def test_build_dependencies():
        "sudachipy",
        "sudachidict_core",
        "spacy-pkuseg",
+        "thinc-apple-ops",
    ]

    # check requirements.txt
--- a/spacy/tests/training/test_readers.py
+++ b/spacy/tests/training/test_readers.py
@ -1,6 +1,6 @@
 from typing import Dict, Iterable, Callable
 import pytest
-from thinc.api import Config
+from thinc.api import Config, fix_random_seed
 from spacy import Language
 from spacy.util import load_model_from_config, registry, resolve_dot_names
 from spacy.schemas import ConfigSchemaTraining
@ -64,8 +64,8 @@ def test_readers():
@pytest.mark.parametrize(
    "reader,additional_config",
    [
-        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 2}),
-        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 2}),
+        ("ml_datasets.imdb_sentiment.v1", {"train_limit": 10, "dev_limit": 10}),
+        ("ml_datasets.dbpedia.v1", {"train_limit": 10, "dev_limit": 10}),
        ("ml_datasets.cmu_movies.v1", {"limit": 10, "freq_cutoff": 200, "split": 0.8}),
    ],
 )
@ -93,6 +93,7 @@ def test_cat_readers(reader, additional_config):
    factory = "textcat_multilabel"
    """
    config = Config().from_str(nlp_config_string)
+    fix_random_seed(config["training"]["seed"])
    config["corpora"]["@readers"] = reader
    config["corpora"].update(additional_config)
    nlp = load_model_from_config(config, auto_fill=True)
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -194,11 +194,12 @@ cdef class Doc:

        vocab (Vocab): A vocabulary object, which must match any models you
            want to use (e.g. tokenizer, parser, entity recognizer).
-        words (Optional[List[str]]): A list of unicode strings to add to the document
-            as words. If `None`, defaults to empty list.
-        spaces (Optional[List[bool]]): A list of boolean values, of the same length as
-            words. True means that the word is followed by a space, False means
-            it is not. If `None`, defaults to `[True]*len(words)`
+        words (Optional[List[Union[str, int]]]): A list of unicode strings or
+            hash values to add to the document as words. If `None`, defaults to
+            empty list.
+        spaces (Optional[List[bool]]): A list of boolean values, of the same
+            length as `words`. `True` means that the word is followed by a space,
+            `False` means it is not. If `None`, defaults to `[True]*len(words)`
        user_data (dict or None): Optional extra data to attach to the Doc.
        tags (Optional[List[str]]): A list of unicode strings, of the same
            length as words, to assign as token.tag. Defaults to None.
@ -266,7 +267,10 @@ cdef class Doc:
            elif isinstance(word, bytes):
                raise ValueError(Errors.E028.format(value=word))
            else:
-                lexeme = self.vocab.get_by_orth(self.mem, word)
+                try:
+                    lexeme = self.vocab.get_by_orth(self.mem, word)
+                except TypeError:
+                    raise TypeError(Errors.E1022.format(wtype=type(word)))
            self.push_back(lexeme, has_space)

        if heads is not None:
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -820,6 +820,29 @@ $ python -m spacy train [config_path] [--output] [--code] [--verbose] [--gpu-id]
 | overrides         | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--paths.train ./train.spacy`. ~~Any (option/flag)~~                         |
 | **CREATES**       | The final trained pipeline and the best trained pipeline.                                                                                                                                                          |

+### Calling the training function from Python {#train-function new="3.2"}
+
+The training CLI exposes a `train` helper function that lets you run the
+training just like `spacy train`. Usually it's easier to use the command line
+directly, but if you need to kick off training from code, this is how to do it.
+
+> #### Example
+>
+> ```python
+> from spacy.cli.train import train
+>
+> train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+>
+> ```
+
+| Name           | Description                                                                                                                   |
+| -------------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| `config_path`  | Path to the config to use for training. ~~Union[str, Path]~~                                                                  |
+| `output_path`  | Optional name of directory to save output model in. If not provided, no model will be saved. ~~Optional[Union[str, Path]]~~   |
+| _keyword-only_ |                                                                                                                               |
+| `use_gpu`      | Which GPU to use. Defaults to -1 for no GPU. ~~int~~                                                                          |
+| `overrides`    | Values to override config settings. ~~Dict[str, Any]~~                                                                        |
+
 ## pretrain {#pretrain new="2.1" tag="command,experimental"}

 Pretrain the "token to vector" ([`Tok2vec`](/api/tok2vec)) layer of pipeline
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -34,7 +34,7 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
 | Name                                     | Description                                                                                                                                                                                        |
 | ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`                                  | A storage container for lexical types. ~~Vocab~~                                                                                                                                                   |
-| `words`                                  | A list of strings to add to the container. ~~Optional[List[str]]~~                                                                                                                                 |
+| `words`                                  | A list of strings or integer hash values to add to the document as words. ~~Optional[List[Union[str, int]]]~~                                                                                                |
 | `spaces`                                 | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~       |
 | _keyword-only_                           |                                                                                                                                                                                                    |
 | `user\_data`                             | Optional extra data to attach to the Doc. ~~Dict~~                                                                                                                                                 |
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -773,17 +773,17 @@ from the specified model. Intended for use in `[initialize.before_init]`.
 > after_pipeline_creation = {"@callbacks":"spacy.models_with_nvtx_range.v1"}
 > ```

-Recursively wrap the models in each pipe using [NVTX](https://nvidia.github.io/NVTX/)
-range markers. These markers aid in GPU profiling by attributing specific operations
-to a ~~Model~~'s forward or backprop passes.
+Recursively wrap the models in each pipe using
+[NVTX](https://nvidia.github.io/NVTX/) range markers. These markers aid in GPU
+profiling by attributing specific operations to a ~~Model~~'s forward or
+backprop passes.

 | Name             | Description                                                                                                                  |
-|------------------|------------------------------------------------------------------------------------------------------------------------------|
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
 | `forward_color`  | Color identifier for forward passes. Defaults to `-1`. ~~int~~                                                               |
 | `backprop_color` | Color identifier for backpropagation passes. Defaults to `-1`. ~~int~~                                                       |
 | **CREATES**      | A function that takes the current `nlp` and wraps forward/backprop passes in NVTX ranges. ~~Callable[[Language], Language]~~ |

-
 ## Training data and alignment {#gold source="spacy/training"}

 ### training.offsets_to_biluo_tags {#offsets_to_biluo_tags tag="function"}
--- a/website/docs/usage/index.md
+++ b/website/docs/usage/index.md
@ -71,13 +71,14 @@ spaCy's [`setup.cfg`](%%GITHUB_SPACY/setup.cfg) for details on what's included.
 > $ pip install %%SPACY_PKG_NAME[lookups,transformers]%%SPACY_PKG_FLAGS
 > ```

-| Name                   | Description                                                                                                                                                                                                                                                    |
-| ---------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lookups`              | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
-| `transformers`         | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
-| `ray`                  | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                  |
-| `cuda`, ...            | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
-| `ja`, `ko`, `th`, `zh` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |
+| Name             | Description                                                                                                                                                                                                                                                    |
+| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lookups`        | Install [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) for data tables for lemmatization and lexeme normalization. The data is serialized with trained pipelines, so you only need this package if you want to train your own models. |
+| `transformers`   | Install [`spacy-transformers`](https://github.com/explosion/spacy-transformers). The package will be installed automatically when you install a transformer-based pipeline.                                                                                    |
+| `ray`            | Install [`spacy-ray`](https://github.com/explosion/spacy-ray) to add CLI commands for [parallel training](/usage/training#parallel-training).                                                                                                                  |
+| `cuda`, ...      | Install spaCy with GPU support provided by [CuPy](https://cupy.chainer.org) for your given CUDA version. See the GPU [installation instructions](#gpu) for details and options.                                                                                |
+| `apple`          | Install [`thinc-apple-ops`](https://github.com/explosion/thinc-apple-ops) to improve performance on an Apple M1.                                                                                                                                               |
+| `ja`, `ko`, `th` | Install additional dependencies required for tokenization for the [languages](/usage/models#languages).                                                                                                                                                        |

 ### conda {#conda}

--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -301,8 +301,6 @@ fly without having to save to and load from disk.
 $ python -m spacy init config - --lang en --pipeline ner,textcat --optimize accuracy | python -m spacy train - --paths.train ./corpus/train.spacy --paths.dev ./corpus/dev.spacy
 ```

-<!-- TODO: add reference to Prodigy's commands once Prodigy nightly is available -->
-
 ### Using variable interpolation {#config-interpolation}

 Another very useful feature of the config system is that it supports variable
@ -1647,7 +1645,7 @@ workers are stuck waiting for it to complete before they can continue.

 ## Internal training API {#api}

-<Infobox variant="warning">
+<Infobox variant="danger">

 spaCy gives you full control over the training loop. However, for most use
 cases, it's recommended to train your pipelines via the
@ -1659,6 +1657,32 @@ typically give you everything you need to train fully custom pipelines with

 </Infobox>

+### Training from a Python script {#api-train new="3.2"}
+
+If you want to run the training from a Python script instead of using the
+[`spacy train`](/api/cli#train) CLI command, you can call into the
+[`train`](/api/cli#train-function) helper function directly. It takes the path
+to the config file, an optional output directory and an optional dictionary of
+[config overrides](#config-overrides).
+
+```python
+from spacy.cli.train import train
+
+train("./config.cfg", overrides={"paths.train": "./train.spacy", "paths.dev": "./dev.spacy"})
+```
+
+### Internal training loop API {#api-loop}
+
+<Infobox variant="warning">
+
+This section documents how the training loop and updates to the `nlp` object
+work internally. You typically shouldn't have to implement this in Python unless
+you're writing your own trainable components. To train a pipeline, use
+[`spacy train`](/api/cli#train) or the [`train`](/api/cli#train-function) helper
+function instead.
+
+</Infobox>
+
 The [`Example`](/api/example) object contains annotated training data, also
 called the **gold standard**. It's initialized with a [`Doc`](/api/doc) object
 that will hold the predictions, and another `Doc` object that holds the
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -1138,7 +1138,7 @@
        {
            "id": "deplacy",
            "slogan": "CUI-based Tree Visualizer for Universal Dependencies and Immediate Catena Analysis",
-            "discreption": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
+            "description": "Simple dependency visualizer for [spaCy](https://spacy.io/), [UniDic2UD](https://pypi.org/project/unidic2ud), [Stanza](https://stanfordnlp.github.io/stanza/), [NLP-Cube](https://github.com/Adobe/NLP-Cube), [Trankit](https://github.com/nlp-uoregon/trankit), etc.",
            "github": "KoichiYasuoka/deplacy",
            "image": "https://i.imgur.com/6uOI4Op.png",
            "code_example": [
@ -1270,7 +1270,7 @@
            "description": "`textacy` is a Python library for performing a variety of natural language processing (NLP) tasks, built on the high-performance `spacy` library. With the fundamentals – tokenization, part-of-speech tagging, dependency parsing, etc. – delegated to another library, `textacy` focuses on the tasks that come before and follow after.",
            "github": "chartbeat-labs/textacy",
            "pip": "textacy",
-            "url": "https://chartbeat-labs.github.io/textacy/",
+            "url": "https://github.com/chartbeat-labs/textacy",
            "author": "Burton DeWilde",
            "author_links": {
                "github": "bdewilde",
--- a/website/src/widgets/quickstart-install.js
+++ b/website/src/widgets/quickstart-install.js
@ -4,10 +4,12 @@ import { StaticQuery, graphql } from 'gatsby'
 import { Quickstart, QS } from '../components/quickstart'
 import { repo, DEFAULT_BRANCH } from '../components/util'

+const DEFAULT_OS = 'mac'
+const DEFAULT_PLATFORM = 'x86'
 const DEFAULT_MODELS = ['en']
 const DEFAULT_OPT = 'efficiency'
 const DEFAULT_HARDWARE = 'cpu'
-const DEFAULT_CUDA = 'cuda102'
+const DEFAULT_CUDA = 'cuda113'
 const CUDA = {
    '8.0': 'cuda80',
    '9.0': 'cuda90',
@ -19,11 +21,15 @@ const CUDA = {
    '11.0': 'cuda110',
    '11.1': 'cuda111',
    '11.2': 'cuda112',
+    '11.3': 'cuda113',
+    '11.4': 'cuda114',
 }
 const LANG_EXTRAS = ['ja'] // only for languages with models

 const QuickstartInstall = ({ id, title }) => {
    const [train, setTrain] = useState(false)
+    const [platform, setPlatform] = useState(DEFAULT_PLATFORM)
+    const [os, setOs] = useState(DEFAULT_OS)
    const [hardware, setHardware] = useState(DEFAULT_HARDWARE)
    const [cuda, setCuda] = useState(DEFAULT_CUDA)
    const [selectedModels, setModels] = useState(DEFAULT_MODELS)
@ -33,15 +39,19 @@ const QuickstartInstall = ({ id, title }) => {
        config: v => setTrain(v.includes('train')),
        models: setModels,
        optimize: v => setEfficiency(v.includes('efficiency')),
+        platform: v => setPlatform(v[0]),
+        os: v => setOs(v[0]),
    }
    const showDropdown = {
        hardware: () => hardware === 'gpu',
    }
    const modelExtras = train ? selectedModels.filter(m => LANG_EXTRAS.includes(m)) : []
+    const apple = os === 'mac' && platform === 'arm'
    const pipExtras = [
        hardware === 'gpu' && cuda,
        train && 'transformers',
        train && 'lookups',
+        apple && 'apple',
        ...modelExtras,
    ]
        .filter(e => e)
@ -62,6 +72,16 @@ const QuickstartInstall = ({ id, title }) => {
                            { id: 'windows', title: 'Windows' },
                            { id: 'linux', title: 'Linux' },
                        ],
+                        defaultValue: DEFAULT_OS,
+                    },
+                    {
+                        id: 'platform',
+                        title: 'Platform',
+                        options: [
+                            { id: 'x86', title: 'x86', checked: true },
+                            { id: 'arm', title: 'ARM / M1' },
+                        ],
+                        defaultValue: DEFAULT_PLATFORM,
                    },
                    {
                        id: 'package',