Include custom code via spacy package command (#6531)

This commit is contained in:
Ines Montani 2020-12-10 23:36:46 +11:00 committed by GitHub
parent 2a6043fabb
commit 513c4e332a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 117 additions and 42 deletions

View File

@ -1,4 +1,4 @@
from typing import Optional, Union, Any, Dict from typing import Optional, Union, Any, Dict, List
import shutil import shutil
from pathlib import Path from pathlib import Path
from wasabi import Printer, get_raw_input from wasabi import Printer, get_raw_input
@ -16,6 +16,7 @@ def package_cli(
# fmt: off # fmt: off
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False), input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False), output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
code_paths: Optional[str] = Opt(None, "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False), meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"), create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"), name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
@ -33,12 +34,22 @@ def package_cli(
After packaging, "python setup.py sdist" is run in the package directory, After packaging, "python setup.py sdist" is run in the package directory,
which will create a .tar.gz archive that can be installed via "pip install". which will create a .tar.gz archive that can be installed via "pip install".
If additional code files are provided (e.g. Python files containing custom
registered functions like pipeline components), they are copied into the
package and imported in the __init__.py.
DOCS: https://nightly.spacy.io/api/cli#package DOCS: https://nightly.spacy.io/api/cli#package
""" """
code_paths = (
[Path(p.strip()) for p in code_paths.split(",")]
if code_paths is not None
else []
)
package( package(
input_dir, input_dir,
output_dir, output_dir,
meta_path=meta_path, meta_path=meta_path,
code_paths=code_paths,
name=name, name=name,
version=version, version=version,
create_meta=create_meta, create_meta=create_meta,
@ -52,6 +63,7 @@ def package(
input_dir: Path, input_dir: Path,
output_dir: Path, output_dir: Path,
meta_path: Optional[Path] = None, meta_path: Optional[Path] = None,
code_paths: List[Path] = [],
name: Optional[str] = None, name: Optional[str] = None,
version: Optional[str] = None, version: Optional[str] = None,
create_meta: bool = False, create_meta: bool = False,
@ -67,6 +79,14 @@ def package(
msg.fail("Can't locate pipeline data", input_path, exits=1) msg.fail("Can't locate pipeline data", input_path, exits=1)
if not output_path or not output_path.exists(): if not output_path or not output_path.exists():
msg.fail("Output directory not found", output_path, exits=1) msg.fail("Output directory not found", output_path, exits=1)
for code_path in code_paths:
if not code_path.exists():
msg.fail("Can't find code file", code_path, exits=1)
# Import the code here so it's available when model is loaded (via
# get_meta helper). Also verifies that everything works
util.import_file(code_path.stem, code_path)
if code_paths:
msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
if meta_path and not meta_path.exists(): if meta_path and not meta_path.exists():
msg.fail("Can't find pipeline meta.json", meta_path, exits=1) msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
meta_path = meta_path or input_dir / "meta.json" meta_path = meta_path or input_dir / "meta.json"
@ -106,10 +126,17 @@ def package(
license_path = package_path / model_name_v / "LICENSE" license_path = package_path / model_name_v / "LICENSE"
if license_path.exists(): if license_path.exists():
shutil.move(str(license_path), str(main_path)) shutil.move(str(license_path), str(main_path))
imports = []
for code_path in code_paths:
imports.append(code_path.stem)
shutil.copy(str(code_path), str(package_path))
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2)) create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
create_file(main_path / "setup.py", TEMPLATE_SETUP) create_file(main_path / "setup.py", TEMPLATE_SETUP)
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST) create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
create_file(package_path / "__init__.py", TEMPLATE_INIT) init_py = TEMPLATE_INIT.format(
imports="\n".join(f"from . import {m}" for m in imports)
)
create_file(package_path / "__init__.py", init_py)
msg.good(f"Successfully created package '{model_name_v}'", main_path) msg.good(f"Successfully created package '{model_name_v}'", main_path)
if create_sdist: if create_sdist:
with util.working_dir(main_path): with util.working_dir(main_path):
@ -249,6 +276,7 @@ TEMPLATE_INIT = """
from pathlib import Path from pathlib import Path
from spacy.util import load_model_from_init_py, get_model_meta from spacy.util import load_model_from_init_py, get_model_meta
{imports}
__version__ = get_model_meta(Path(__file__).parent)['version'] __version__ = get_model_meta(Path(__file__).parent)['version']

View File

@ -273,6 +273,7 @@ class ModelMetaSchema(BaseModel):
version: StrictStr = Field(..., title="Model version") version: StrictStr = Field(..., title="Model version")
spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier") spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly") parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
requirements: List[StrictStr] = Field([], title="Additional Python package dependencies, used for the Python package setup")
pipeline: List[StrictStr] = Field([], title="Names of pipeline components") pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
description: StrictStr = Field("", title="Model description") description: StrictStr = Field("", title="Model description")
license: StrictStr = Field("", title="Model license") license: StrictStr = Field("", title="Model license")

View File

@ -872,11 +872,15 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
## package {#package tag="command"} ## package {#package tag="command"}
Generate an installable [Python package](/usage/training#models-generating) from Generate an installable [Python package](/usage/training#models-generating) from
an existing pipeline data directory. All data files are copied over. If the path an existing pipeline data directory. All data files are copied over. If
to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is additional code files are provided (e.g. Python files containing custom
found in the input directory, this file is used. Otherwise, the data can be registered functions like
entered directly from the command line. spaCy will then create a `.tar.gz` [pipeline components](/usage/processing-pipelines#custom-components)), they are
archive file that you can distribute and install with `pip install`. copied into the package and imported in the `__init__.py`. If the path to a
[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
the input directory, this file is used. Otherwise, the data can be entered
directly from the command line. spaCy will then create a `.tar.gz` archive file
that you can distribute and install with `pip install`.
<Infobox title="New in v3.0" variant="warning"> <Infobox title="New in v3.0" variant="warning">
@ -887,7 +891,7 @@ this, you can set the `--no-sdist` flag.
</Infobox> </Infobox>
```cli ```cli
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force] $ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force]
``` ```
> #### Example > #### Example
@ -898,18 +902,19 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
> $ pip install dist/en_pipeline-0.0.0.tar.gz > $ pip install dist/en_pipeline-0.0.0.tar.gz
> ``` > ```
| Name | Description | | Name | Description |
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ | | `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ | | `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ | | `--code`, `-c` <Tag variant="new">3</Tag> | Comma-separated paths to Python files to be included in the package and imported in its `__init__.py`. This allows including [registered functions](/usage/training#custom-functions) and [custom components](/usage/processing-pipelines#custom-components). ~~Optional[str] \(option)~~ |
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ | | `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
| `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ | | `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
| `--name`, `-n` <Tag variant="new">3</Tag> | Package name to override in meta. ~~Optional[str] \(option)~~ | | `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
| `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ | | `--name`, `-n` <Tag variant="new">3</Tag> | Package name to override in meta. ~~Optional[str] \(option)~~ |
| `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ | | `--version`, `-v` <Tag variant="new">3</Tag> | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. ~~Optional[str] \(option)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--force`, `-f` | Force overwriting of existing folder in output directory. ~~bool (flag)~~ |
| **CREATES** | A Python package containing the spaCy pipeline. | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A Python package containing the spaCy pipeline. |
## project {#project new="3"} ## project {#project new="3"}

View File

@ -543,6 +543,7 @@ source of truth** used for loading a pipeline.
> "version": "1.0.0", > "version": "1.0.0",
> "spacy_version": ">=3.0.0,<3.1.0", > "spacy_version": ">=3.0.0,<3.1.0",
> "parent_package": "spacy", > "parent_package": "spacy",
> "requirements": ["spacy-transformers>=1.0.0,<1.1.0"],
> "description": "Example pipeline for spaCy", > "description": "Example pipeline for spaCy",
> "author": "You", > "author": "You",
> "email": "you@example.com", > "email": "you@example.com",
@ -573,6 +574,7 @@ source of truth** used for loading a pipeline.
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ | | `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ | | `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy-nightly"`. Defaults to `"spacy"`. ~~str~~ | | `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy-nightly"`. Defaults to `"spacy"`. ~~str~~ |
| `requirements` | Python package requirements that the pipeline depends on. Will be used for the Python package setup in [`spacy package`](/api/cli#package). Should be a list of package names with optional version specifiers, just like you'd define them in a `setup.cfg` or `requirements.txt`. Defaults to `[]`. ~~List[str]~~ |
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ | | `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ | | `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ | | `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |

View File

@ -463,12 +463,12 @@ entry_points={
} }
``` ```
The factory can also implement other pipeline component methods like `to_disk` and The factory can also implement other pipeline component methods like `to_disk`
`from_disk` for serialization, or even `update` to make the component trainable. and `from_disk` for serialization, or even `update` to make the component
If a component exposes a `from_disk` method and is included in a pipeline, spaCy trainable. If a component exposes a `from_disk` method and is included in a
will call it on load. This lets you ship custom data with your pipeline package. pipeline, spaCy will call it on load. This lets you ship custom data with your
When you save out a pipeline using `nlp.to_disk` and the component exposes a pipeline package. When you save out a pipeline using `nlp.to_disk` and the
`to_disk` method, it will be called with the disk path. component exposes a `to_disk` method, it will be called with the disk path.
```python ```python
def to_disk(self, path, exclude=tuple()): def to_disk(self, path, exclude=tuple()):
@ -633,10 +633,10 @@ can be a convenient way to share them with your team.
</Infobox> </Infobox>
spaCy comes with a handy CLI command that will create all required files, and spaCy comes with a handy CLI command that will create all required files, and
walk you through generating the meta data. You can also create the `meta.json` walk you through generating the meta data. You can also create the
manually and place it in the data directory, or supply a path to it using the [`meta.json`](/api/data-formats#meta) manually and place it in the data
`--meta` flag. For more info on this, see the [`package`](/api/cli#package) directory, or supply a path to it using the `--meta` flag. For more info on
docs. this, see the [`package`](/api/cli#package) docs.
> #### meta.json (example) > #### meta.json (example)
> >
@ -654,7 +654,7 @@ docs.
> ``` > ```
```cli ```cli
$ python -m spacy package ./en_example_pipeline ./my_pipelines $ python -m spacy package ./en_example_pipeline ./packages
``` ```
This command will create a pipeline package directory and will run This command will create a pipeline package directory and will run
@ -683,15 +683,44 @@ If you're creating the package manually, keep in mind that the directories need
to be named according to the naming conventions of `lang_name` and to be named according to the naming conventions of `lang_name` and
`lang_name-version`. `lang_name-version`.
### Customizing the package setup {#models-custom} ### Including custom functions and components {#models-custom}
The `load()` method that comes with our pipeline package templates will take If your pipeline includes
care of putting all this together and returning a `Language` object with the [custom components](/usage/processing-pipelines#custom-components), model
loaded pipeline and data. If your pipeline requires architectures or other [code](/usage/training#custom-code), those functions need
[custom components](/usage/processing-pipelines#custom-components) or a custom to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
language class, you can also **ship the code with your package** and include it how to create the objects referenced in the config. The
in the `__init__.py` for example, to register a component before the `nlp` [`spacy package`](/api/cli#package) command lets you provide one or more paths
object is created. to Python files containing custom registered functions using the `--code`
argument.
> #### \_\_init\_\_.py (excerpt)
>
> ```python
> from . import functions
>
> def load(**overrides):
> ...
> ```
```cli
$ python -m spacy package ./en_example_pipeline ./packages --code functions.py
```
The Python files will be copied over into the root of the package, and the
package's `__init__.py` will import them as modules. This ensures that functions
are registered when the pipeline is imported, e.g. when you call `spacy.load`. A
simple import is all that's needed to make registered functions available.
Make sure to include **all Python files** that are referenced in your custom
code, including modules imported by others. If your custom code depends on
**external packages**, make sure they're listed in the list of `"requirements"`
in your [`meta.json`](/api/data-formats#meta). For the majority of use cases,
registered functions should provide you with all customizations you need, from
custom components to custom model architectures and lifecycle hooks. However, if
you do want to customize the setup in more detail, you can edit the package's
`__init__.py` and the package's `load` function that's called by
[`spacy.load`](/api/top-level#spacy.load).
<Infobox variant="warning" title="Important note on making manual edits"> <Infobox variant="warning" title="Important note on making manual edits">

View File

@ -581,10 +581,14 @@ value for it.
### Training with custom code {#custom-code} ### Training with custom code {#custom-code}
> #### Example > ```cli
> ### Training
> $ python -m spacy train config.cfg --code functions.py
> ```
> >
> ```cli > ```cli
> $ python -m spacy train config.cfg --code functions.py > ### Packaging
> $ python -m spacy package ./model-best ./packages --code functions.py
> ``` > ```
The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
@ -592,7 +596,13 @@ The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
allows you to add custom functions and architectures to the function registry allows you to add custom functions and architectures to the function registry
that can then be referenced from your `config.cfg`. This lets you train spaCy that can then be referenced from your `config.cfg`. This lets you train spaCy
pipelines with custom components, without having to re-implement the whole pipelines with custom components, without having to re-implement the whole
training workflow. training workflow. When you package your trained pipeline later using
[`spacy package`](/api/cli#package), you can provide one or more Python files to
be included in the package and imported in its `__init__.py`. This means that
any custom architectures, functions or
[components](/usage/processing-pipelines#custom-components) will be shipped with
your pipeline and registered when it's loaded. See the documentation on
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
#### Example: Modifying the nlp object {#custom-code-nlp-callbacks} #### Example: Modifying the nlp object {#custom-code-nlp-callbacks}