mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 09:56:28 +03:00
Include custom code via spacy package command (#6531)
This commit is contained in:
parent
2a6043fabb
commit
513c4e332a
|
@ -1,4 +1,4 @@
|
||||||
from typing import Optional, Union, Any, Dict
|
from typing import Optional, Union, Any, Dict, List
|
||||||
import shutil
|
import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from wasabi import Printer, get_raw_input
|
from wasabi import Printer, get_raw_input
|
||||||
|
@ -16,6 +16,7 @@ def package_cli(
|
||||||
# fmt: off
|
# fmt: off
|
||||||
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
||||||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||||
|
code_paths: Optional[str] = Opt(None, "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
|
||||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||||
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
|
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
|
||||||
|
@ -33,12 +34,22 @@ def package_cli(
|
||||||
After packaging, "python setup.py sdist" is run in the package directory,
|
After packaging, "python setup.py sdist" is run in the package directory,
|
||||||
which will create a .tar.gz archive that can be installed via "pip install".
|
which will create a .tar.gz archive that can be installed via "pip install".
|
||||||
|
|
||||||
|
If additional code files are provided (e.g. Python files containing custom
|
||||||
|
registered functions like pipeline components), they are copied into the
|
||||||
|
package and imported in the __init__.py.
|
||||||
|
|
||||||
DOCS: https://nightly.spacy.io/api/cli#package
|
DOCS: https://nightly.spacy.io/api/cli#package
|
||||||
"""
|
"""
|
||||||
|
code_paths = (
|
||||||
|
[Path(p.strip()) for p in code_paths.split(",")]
|
||||||
|
if code_paths is not None
|
||||||
|
else []
|
||||||
|
)
|
||||||
package(
|
package(
|
||||||
input_dir,
|
input_dir,
|
||||||
output_dir,
|
output_dir,
|
||||||
meta_path=meta_path,
|
meta_path=meta_path,
|
||||||
|
code_paths=code_paths,
|
||||||
name=name,
|
name=name,
|
||||||
version=version,
|
version=version,
|
||||||
create_meta=create_meta,
|
create_meta=create_meta,
|
||||||
|
@ -52,6 +63,7 @@ def package(
|
||||||
input_dir: Path,
|
input_dir: Path,
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
meta_path: Optional[Path] = None,
|
meta_path: Optional[Path] = None,
|
||||||
|
code_paths: List[Path] = [],
|
||||||
name: Optional[str] = None,
|
name: Optional[str] = None,
|
||||||
version: Optional[str] = None,
|
version: Optional[str] = None,
|
||||||
create_meta: bool = False,
|
create_meta: bool = False,
|
||||||
|
@ -67,6 +79,14 @@ def package(
|
||||||
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
||||||
if not output_path or not output_path.exists():
|
if not output_path or not output_path.exists():
|
||||||
msg.fail("Output directory not found", output_path, exits=1)
|
msg.fail("Output directory not found", output_path, exits=1)
|
||||||
|
for code_path in code_paths:
|
||||||
|
if not code_path.exists():
|
||||||
|
msg.fail("Can't find code file", code_path, exits=1)
|
||||||
|
# Import the code here so it's available when model is loaded (via
|
||||||
|
# get_meta helper). Also verifies that everything works
|
||||||
|
util.import_file(code_path.stem, code_path)
|
||||||
|
if code_paths:
|
||||||
|
msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
|
||||||
if meta_path and not meta_path.exists():
|
if meta_path and not meta_path.exists():
|
||||||
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||||
meta_path = meta_path or input_dir / "meta.json"
|
meta_path = meta_path or input_dir / "meta.json"
|
||||||
|
@ -106,10 +126,17 @@ def package(
|
||||||
license_path = package_path / model_name_v / "LICENSE"
|
license_path = package_path / model_name_v / "LICENSE"
|
||||||
if license_path.exists():
|
if license_path.exists():
|
||||||
shutil.move(str(license_path), str(main_path))
|
shutil.move(str(license_path), str(main_path))
|
||||||
|
imports = []
|
||||||
|
for code_path in code_paths:
|
||||||
|
imports.append(code_path.stem)
|
||||||
|
shutil.copy(str(code_path), str(package_path))
|
||||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
init_py = TEMPLATE_INIT.format(
|
||||||
|
imports="\n".join(f"from . import {m}" for m in imports)
|
||||||
|
)
|
||||||
|
create_file(package_path / "__init__.py", init_py)
|
||||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||||
if create_sdist:
|
if create_sdist:
|
||||||
with util.working_dir(main_path):
|
with util.working_dir(main_path):
|
||||||
|
@ -249,6 +276,7 @@ TEMPLATE_INIT = """
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from spacy.util import load_model_from_init_py, get_model_meta
|
from spacy.util import load_model_from_init_py, get_model_meta
|
||||||
|
|
||||||
|
{imports}
|
||||||
|
|
||||||
__version__ = get_model_meta(Path(__file__).parent)['version']
|
__version__ = get_model_meta(Path(__file__).parent)['version']
|
||||||
|
|
||||||
|
|
|
@ -273,6 +273,7 @@ class ModelMetaSchema(BaseModel):
|
||||||
version: StrictStr = Field(..., title="Model version")
|
version: StrictStr = Field(..., title="Model version")
|
||||||
spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
|
spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
|
||||||
parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
|
parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
|
||||||
|
requirements: List[StrictStr] = Field([], title="Additional Python package dependencies, used for the Python package setup")
|
||||||
pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
|
pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
|
||||||
description: StrictStr = Field("", title="Model description")
|
description: StrictStr = Field("", title="Model description")
|
||||||
license: StrictStr = Field("", title="Model license")
|
license: StrictStr = Field("", title="Model license")
|
||||||
|
|
|
@ -872,11 +872,15 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
|
||||||
## package {#package tag="command"}
|
## package {#package tag="command"}
|
||||||
|
|
||||||
Generate an installable [Python package](/usage/training#models-generating) from
|
Generate an installable [Python package](/usage/training#models-generating) from
|
||||||
an existing pipeline data directory. All data files are copied over. If the path
|
an existing pipeline data directory. All data files are copied over. If
|
||||||
to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is
|
additional code files are provided (e.g. Python files containing custom
|
||||||
found in the input directory, this file is used. Otherwise, the data can be
|
registered functions like
|
||||||
entered directly from the command line. spaCy will then create a `.tar.gz`
|
[pipeline components](/usage/processing-pipelines#custom-components)), they are
|
||||||
archive file that you can distribute and install with `pip install`.
|
copied into the package and imported in the `__init__.py`. If the path to a
|
||||||
|
[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
|
||||||
|
the input directory, this file is used. Otherwise, the data can be entered
|
||||||
|
directly from the command line. spaCy will then create a `.tar.gz` archive file
|
||||||
|
that you can distribute and install with `pip install`.
|
||||||
|
|
||||||
<Infobox title="New in v3.0" variant="warning">
|
<Infobox title="New in v3.0" variant="warning">
|
||||||
|
|
||||||
|
@ -887,7 +891,7 @@ this, you can set the `--no-sdist` flag.
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force]
|
$ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force]
|
||||||
```
|
```
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -899,9 +903,10 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
|
| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
|
||||||
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
||||||
|
| `--code`, `-c` <Tag variant="new">3</Tag> | Comma-separated paths to Python files to be included in the package and imported in its `__init__.py`. This allows including [registered functions](/usage/training#custom-functions) and [custom components](/usage/processing-pipelines#custom-components). ~~Optional[str] \(option)~~ |
|
||||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
|
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
|
||||||
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
||||||
| `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
|
| `--no-sdist`, `-NS`, | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
|
||||||
|
|
|
@ -543,6 +543,7 @@ source of truth** used for loading a pipeline.
|
||||||
> "version": "1.0.0",
|
> "version": "1.0.0",
|
||||||
> "spacy_version": ">=3.0.0,<3.1.0",
|
> "spacy_version": ">=3.0.0,<3.1.0",
|
||||||
> "parent_package": "spacy",
|
> "parent_package": "spacy",
|
||||||
|
> "requirements": ["spacy-transformers>=1.0.0,<1.1.0"],
|
||||||
> "description": "Example pipeline for spaCy",
|
> "description": "Example pipeline for spaCy",
|
||||||
> "author": "You",
|
> "author": "You",
|
||||||
> "email": "you@example.com",
|
> "email": "you@example.com",
|
||||||
|
@ -573,6 +574,7 @@ source of truth** used for loading a pipeline.
|
||||||
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
||||||
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
||||||
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy-nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy-nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
||||||
|
| `requirements` | Python package requirements that the pipeline depends on. Will be used for the Python package setup in [`spacy package`](/api/cli#package). Should be a list of package names with optional version specifiers, just like you'd define them in a `setup.cfg` or `requirements.txt`. Defaults to `[]`. ~~List[str]~~ |
|
||||||
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||||
|
|
|
@ -463,12 +463,12 @@ entry_points={
|
||||||
}
|
}
|
||||||
```
|
```
|
||||||
|
|
||||||
The factory can also implement other pipeline component methods like `to_disk` and
|
The factory can also implement other pipeline component methods like `to_disk`
|
||||||
`from_disk` for serialization, or even `update` to make the component trainable.
|
and `from_disk` for serialization, or even `update` to make the component
|
||||||
If a component exposes a `from_disk` method and is included in a pipeline, spaCy
|
trainable. If a component exposes a `from_disk` method and is included in a
|
||||||
will call it on load. This lets you ship custom data with your pipeline package.
|
pipeline, spaCy will call it on load. This lets you ship custom data with your
|
||||||
When you save out a pipeline using `nlp.to_disk` and the component exposes a
|
pipeline package. When you save out a pipeline using `nlp.to_disk` and the
|
||||||
`to_disk` method, it will be called with the disk path.
|
component exposes a `to_disk` method, it will be called with the disk path.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def to_disk(self, path, exclude=tuple()):
|
def to_disk(self, path, exclude=tuple()):
|
||||||
|
@ -633,10 +633,10 @@ can be a convenient way to share them with your team.
|
||||||
</Infobox>
|
</Infobox>
|
||||||
|
|
||||||
spaCy comes with a handy CLI command that will create all required files, and
|
spaCy comes with a handy CLI command that will create all required files, and
|
||||||
walk you through generating the meta data. You can also create the `meta.json`
|
walk you through generating the meta data. You can also create the
|
||||||
manually and place it in the data directory, or supply a path to it using the
|
[`meta.json`](/api/data-formats#meta) manually and place it in the data
|
||||||
`--meta` flag. For more info on this, see the [`package`](/api/cli#package)
|
directory, or supply a path to it using the `--meta` flag. For more info on
|
||||||
docs.
|
this, see the [`package`](/api/cli#package) docs.
|
||||||
|
|
||||||
> #### meta.json (example)
|
> #### meta.json (example)
|
||||||
>
|
>
|
||||||
|
@ -654,7 +654,7 @@ docs.
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
```cli
|
```cli
|
||||||
$ python -m spacy package ./en_example_pipeline ./my_pipelines
|
$ python -m spacy package ./en_example_pipeline ./packages
|
||||||
```
|
```
|
||||||
|
|
||||||
This command will create a pipeline package directory and will run
|
This command will create a pipeline package directory and will run
|
||||||
|
@ -683,15 +683,44 @@ If you're creating the package manually, keep in mind that the directories need
|
||||||
to be named according to the naming conventions of `lang_name` and
|
to be named according to the naming conventions of `lang_name` and
|
||||||
`lang_name-version`.
|
`lang_name-version`.
|
||||||
|
|
||||||
### Customizing the package setup {#models-custom}
|
### Including custom functions and components {#models-custom}
|
||||||
|
|
||||||
The `load()` method that comes with our pipeline package templates will take
|
If your pipeline includes
|
||||||
care of putting all this together and returning a `Language` object with the
|
[custom components](/usage/processing-pipelines#custom-components), model
|
||||||
loaded pipeline and data. If your pipeline requires
|
architectures or other [code](/usage/training#custom-code), those functions need
|
||||||
[custom components](/usage/processing-pipelines#custom-components) or a custom
|
to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
|
||||||
language class, you can also **ship the code with your package** and include it
|
how to create the objects referenced in the config. The
|
||||||
in the `__init__.py` – for example, to register a component before the `nlp`
|
[`spacy package`](/api/cli#package) command lets you provide one or more paths
|
||||||
object is created.
|
to Python files containing custom registered functions using the `--code`
|
||||||
|
argument.
|
||||||
|
|
||||||
|
> #### \_\_init\_\_.py (excerpt)
|
||||||
|
>
|
||||||
|
> ```python
|
||||||
|
> from . import functions
|
||||||
|
>
|
||||||
|
> def load(**overrides):
|
||||||
|
> ...
|
||||||
|
> ```
|
||||||
|
|
||||||
|
```cli
|
||||||
|
$ python -m spacy package ./en_example_pipeline ./packages --code functions.py
|
||||||
|
```
|
||||||
|
|
||||||
|
The Python files will be copied over into the root of the package, and the
|
||||||
|
package's `__init__.py` will import them as modules. This ensures that functions
|
||||||
|
are registered when the pipeline is imported, e.g. when you call `spacy.load`. A
|
||||||
|
simple import is all that's needed to make registered functions available.
|
||||||
|
|
||||||
|
Make sure to include **all Python files** that are referenced in your custom
|
||||||
|
code, including modules imported by others. If your custom code depends on
|
||||||
|
**external packages**, make sure they're specified under `"requirements"`
|
||||||
|
in your [`meta.json`](/api/data-formats#meta). For the majority of use cases,
|
||||||
|
registered functions should provide you with all customizations you need, from
|
||||||
|
custom components to custom model architectures and lifecycle hooks. However, if
|
||||||
|
you do want to customize the setup in more detail, you can edit the package's
|
||||||
|
`__init__.py` and the package's `load` function that's called by
|
||||||
|
[`spacy.load`](/api/top-level#spacy.load).
|
||||||
|
|
||||||
<Infobox variant="warning" title="Important note on making manual edits">
|
<Infobox variant="warning" title="Important note on making manual edits">
|
||||||
|
|
||||||
|
|
|
@ -581,10 +581,14 @@ value for it.
|
||||||
|
|
||||||
### Training with custom code {#custom-code}
|
### Training with custom code {#custom-code}
|
||||||
|
|
||||||
> #### Example
|
> ```cli
|
||||||
|
> ### Training
|
||||||
|
> $ python -m spacy train config.cfg --code functions.py
|
||||||
|
> ```
|
||||||
>
|
>
|
||||||
> ```cli
|
> ```cli
|
||||||
> $ python -m spacy train config.cfg --code functions.py
|
> ### Packaging
|
||||||
|
> $ python -m spacy package ./model-best ./packages --code functions.py
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
|
The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
|
||||||
|
@ -592,7 +596,13 @@ The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
|
||||||
allows you to add custom functions and architectures to the function registry
|
allows you to add custom functions and architectures to the function registry
|
||||||
that can then be referenced from your `config.cfg`. This lets you train spaCy
|
that can then be referenced from your `config.cfg`. This lets you train spaCy
|
||||||
pipelines with custom components, without having to re-implement the whole
|
pipelines with custom components, without having to re-implement the whole
|
||||||
training workflow.
|
training workflow. When you package your trained pipeline later using
|
||||||
|
[`spacy package`](/api/cli#package), you can provide one or more Python files to
|
||||||
|
be included in the package and imported in its `__init__.py`. This means that
|
||||||
|
any custom architectures, functions or
|
||||||
|
[components](/usage/processing-pipelines#custom-components) will be shipped with
|
||||||
|
your pipeline and registered when it's loaded. See the documentation on
|
||||||
|
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
|
||||||
|
|
||||||
#### Example: Modifying the nlp object {#custom-code-nlp-callbacks}
|
#### Example: Modifying the nlp object {#custom-code-nlp-callbacks}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user