mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 18:26:30 +03:00
Include custom code via spacy package command (#6531)
This commit is contained in:
parent
2a6043fabb
commit
513c4e332a
|
@ -1,4 +1,4 @@
|
|||
from typing import Optional, Union, Any, Dict
|
||||
from typing import Optional, Union, Any, Dict, List
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from wasabi import Printer, get_raw_input
|
||||
|
@ -16,6 +16,7 @@ def package_cli(
|
|||
# fmt: off
|
||||
input_dir: Path = Arg(..., help="Directory with pipeline data", exists=True, file_okay=False),
|
||||
output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
|
||||
code_paths: Optional[str] = Opt(None, "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
|
||||
meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
|
||||
create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
|
||||
name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
|
||||
|
@ -33,12 +34,22 @@ def package_cli(
|
|||
After packaging, "python setup.py sdist" is run in the package directory,
|
||||
which will create a .tar.gz archive that can be installed via "pip install".
|
||||
|
||||
If additional code files are provided (e.g. Python files containing custom
|
||||
registered functions like pipeline components), they are copied into the
|
||||
package and imported in the __init__.py.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/cli#package
|
||||
"""
|
||||
code_paths = (
|
||||
[Path(p.strip()) for p in code_paths.split(",")]
|
||||
if code_paths is not None
|
||||
else []
|
||||
)
|
||||
package(
|
||||
input_dir,
|
||||
output_dir,
|
||||
meta_path=meta_path,
|
||||
code_paths=code_paths,
|
||||
name=name,
|
||||
version=version,
|
||||
create_meta=create_meta,
|
||||
|
@ -52,6 +63,7 @@ def package(
|
|||
input_dir: Path,
|
||||
output_dir: Path,
|
||||
meta_path: Optional[Path] = None,
|
||||
code_paths: List[Path] = [],
|
||||
name: Optional[str] = None,
|
||||
version: Optional[str] = None,
|
||||
create_meta: bool = False,
|
||||
|
@ -67,6 +79,14 @@ def package(
|
|||
msg.fail("Can't locate pipeline data", input_path, exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
msg.fail("Output directory not found", output_path, exits=1)
|
||||
for code_path in code_paths:
|
||||
if not code_path.exists():
|
||||
msg.fail("Can't find code file", code_path, exits=1)
|
||||
# Import the code here so it's available when model is loaded (via
|
||||
# get_meta helper). Also verifies that everything works
|
||||
util.import_file(code_path.stem, code_path)
|
||||
if code_paths:
|
||||
msg.good(f"Including {len(code_paths)} Python module(s) with custom code")
|
||||
if meta_path and not meta_path.exists():
|
||||
msg.fail("Can't find pipeline meta.json", meta_path, exits=1)
|
||||
meta_path = meta_path or input_dir / "meta.json"
|
||||
|
@ -106,10 +126,17 @@ def package(
|
|||
license_path = package_path / model_name_v / "LICENSE"
|
||||
if license_path.exists():
|
||||
shutil.move(str(license_path), str(main_path))
|
||||
imports = []
|
||||
for code_path in code_paths:
|
||||
imports.append(code_path.stem)
|
||||
shutil.copy(str(code_path), str(package_path))
|
||||
create_file(main_path / "meta.json", srsly.json_dumps(meta, indent=2))
|
||||
create_file(main_path / "setup.py", TEMPLATE_SETUP)
|
||||
create_file(main_path / "MANIFEST.in", TEMPLATE_MANIFEST)
|
||||
create_file(package_path / "__init__.py", TEMPLATE_INIT)
|
||||
init_py = TEMPLATE_INIT.format(
|
||||
imports="\n".join(f"from . import {m}" for m in imports)
|
||||
)
|
||||
create_file(package_path / "__init__.py", init_py)
|
||||
msg.good(f"Successfully created package '{model_name_v}'", main_path)
|
||||
if create_sdist:
|
||||
with util.working_dir(main_path):
|
||||
|
@ -249,6 +276,7 @@ TEMPLATE_INIT = """
|
|||
from pathlib import Path
|
||||
from spacy.util import load_model_from_init_py, get_model_meta
|
||||
|
||||
{imports}
|
||||
|
||||
__version__ = get_model_meta(Path(__file__).parent)['version']
|
||||
|
||||
|
|
|
@ -273,6 +273,7 @@ class ModelMetaSchema(BaseModel):
|
|||
version: StrictStr = Field(..., title="Model version")
|
||||
spacy_version: StrictStr = Field("", title="Compatible spaCy version identifier")
|
||||
parent_package: StrictStr = Field("spacy", title="Name of parent spaCy package, e.g. spacy or spacy-nightly")
|
||||
requirements: List[StrictStr] = Field([], title="Additional Python package dependencies, used for the Python package setup")
|
||||
pipeline: List[StrictStr] = Field([], title="Names of pipeline components")
|
||||
description: StrictStr = Field("", title="Model description")
|
||||
license: StrictStr = Field("", title="Model license")
|
||||
|
|
|
@ -872,11 +872,15 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--gold-preproc] [--gp
|
|||
## package {#package tag="command"}
|
||||
|
||||
Generate an installable [Python package](/usage/training#models-generating) from
|
||||
an existing pipeline data directory. All data files are copied over. If the path
|
||||
to a [`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is
|
||||
found in the input directory, this file is used. Otherwise, the data can be
|
||||
entered directly from the command line. spaCy will then create a `.tar.gz`
|
||||
archive file that you can distribute and install with `pip install`.
|
||||
an existing pipeline data directory. All data files are copied over. If
|
||||
additional code files are provided (e.g. Python files containing custom
|
||||
registered functions like
|
||||
[pipeline components](/usage/processing-pipelines#custom-components)), they are
|
||||
copied into the package and imported in the `__init__.py`. If the path to a
|
||||
[`meta.json`](/api/data-formats#meta) is supplied, or a `meta.json` is found in
|
||||
the input directory, this file is used. Otherwise, the data can be entered
|
||||
directly from the command line. spaCy will then create a `.tar.gz` archive file
|
||||
that you can distribute and install with `pip install`.
|
||||
|
||||
<Infobox title="New in v3.0" variant="warning">
|
||||
|
||||
|
@ -887,7 +891,7 @@ this, you can set the `--no-sdist` flag.
|
|||
</Infobox>
|
||||
|
||||
```cli
|
||||
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force]
|
||||
$ python -m spacy package [input_dir] [output_dir] [--code] [--meta-path] [--create-meta] [--no-sdist] [--name] [--version] [--force]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -899,9 +903,10 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta]
|
|||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| ------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `input_dir` | Path to directory containing pipeline data. ~~Path (positional)~~ |
|
||||
| `output_dir` | Directory to create package folder in. ~~Path (positional)~~ |
|
||||
| `--code`, `-c` <Tag variant="new">3</Tag> | Comma-separated paths to Python files to be included in the package and imported in its `__init__.py`. This allows including [registered functions](/usage/training#custom-functions) and [custom components](/usage/processing-pipelines#custom-components). ~~Optional[str] \(option)~~ |
|
||||
| `--meta-path`, `-m` <Tag variant="new">2</Tag> | Path to [`meta.json`](/api/data-formats#meta) file (optional). ~~Optional[Path] \(option)~~ |
|
||||
| `--create-meta`, `-C` <Tag variant="new">2</Tag> | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. ~~bool (flag)~~ |
|
||||
| `--no-sdist`, `-NS` | Don't build the `.tar.gz` sdist automatically. Can be set if you want to run this step manually. ~~bool (flag)~~ |
|
||||
|
|
|
@ -543,6 +543,7 @@ source of truth** used for loading a pipeline.
|
|||
> "version": "1.0.0",
|
||||
> "spacy_version": ">=3.0.0,<3.1.0",
|
||||
> "parent_package": "spacy",
|
||||
> "requirements": ["spacy-transformers>=1.0.0,<1.1.0"],
|
||||
> "description": "Example pipeline for spaCy",
|
||||
> "author": "You",
|
||||
> "email": "you@example.com",
|
||||
|
@ -573,6 +574,7 @@ source of truth** used for loading a pipeline.
|
|||
| `version` | Pipeline version. Will be used to version a Python package created with [`spacy package`](/api/cli#package). Defaults to `"0.0.0"`. ~~str~~ |
|
||||
| `spacy_version` | spaCy version range the package is compatible with. Defaults to the spaCy version used to create the pipeline, up to next minor version, which is the default compatibility for the available [trained pipelines](/models). For instance, a pipeline trained with v3.0.0 will have the version range `">=3.0.0,<3.1.0"`. ~~str~~ |
|
||||
| `parent_package` | Name of the spaCy package. Typically `"spacy"` or `"spacy-nightly"`. Defaults to `"spacy"`. ~~str~~ |
|
||||
| `requirements` | Python package requirements that the pipeline depends on. Will be used for the Python package setup in [`spacy package`](/api/cli#package). Should be a list of package names with optional version specifiers, just like you'd define them in a `setup.cfg` or `requirements.txt`. Defaults to `[]`. ~~List[str]~~ |
|
||||
| `description` | Pipeline description. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `author` | Pipeline author name. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
| `email` | Pipeline author email. Also used for Python package. Defaults to `""`. ~~str~~ |
|
||||
|
|
|
@ -463,12 +463,12 @@ entry_points={
|
|||
}
|
||||
```
|
||||
|
||||
The factory can also implement other pipeline component methods like `to_disk` and
|
||||
`from_disk` for serialization, or even `update` to make the component trainable.
|
||||
If a component exposes a `from_disk` method and is included in a pipeline, spaCy
|
||||
will call it on load. This lets you ship custom data with your pipeline package.
|
||||
When you save out a pipeline using `nlp.to_disk` and the component exposes a
|
||||
`to_disk` method, it will be called with the disk path.
|
||||
The factory can also implement other pipeline component methods like `to_disk`
|
||||
and `from_disk` for serialization, or even `update` to make the component
|
||||
trainable. If a component exposes a `from_disk` method and is included in a
|
||||
pipeline, spaCy will call it on load. This lets you ship custom data with your
|
||||
pipeline package. When you save out a pipeline using `nlp.to_disk` and the
|
||||
component exposes a `to_disk` method, it will be called with the disk path.
|
||||
|
||||
```python
|
||||
def to_disk(self, path, exclude=tuple()):
|
||||
|
@ -633,10 +633,10 @@ can be a convenient way to share them with your team.
|
|||
</Infobox>
|
||||
|
||||
spaCy comes with a handy CLI command that will create all required files, and
|
||||
walk you through generating the meta data. You can also create the `meta.json`
|
||||
manually and place it in the data directory, or supply a path to it using the
|
||||
`--meta` flag. For more info on this, see the [`package`](/api/cli#package)
|
||||
docs.
|
||||
walk you through generating the meta data. You can also create the
|
||||
[`meta.json`](/api/data-formats#meta) manually and place it in the data
|
||||
directory, or supply a path to it using the `--meta` flag. For more info on
|
||||
this, see the [`package`](/api/cli#package) docs.
|
||||
|
||||
> #### meta.json (example)
|
||||
>
|
||||
|
@ -654,7 +654,7 @@ docs.
|
|||
> ```
|
||||
|
||||
```cli
|
||||
$ python -m spacy package ./en_example_pipeline ./my_pipelines
|
||||
$ python -m spacy package ./en_example_pipeline ./packages
|
||||
```
|
||||
|
||||
This command will create a pipeline package directory and will run
|
||||
|
@ -683,15 +683,44 @@ If you're creating the package manually, keep in mind that the directories need
|
|||
to be named according to the naming conventions of `lang_name` and
|
||||
`lang_name-version`.
|
||||
|
||||
### Customizing the package setup {#models-custom}
|
||||
### Including custom functions and components {#models-custom}
|
||||
|
||||
The `load()` method that comes with our pipeline package templates will take
|
||||
care of putting all this together and returning a `Language` object with the
|
||||
loaded pipeline and data. If your pipeline requires
|
||||
[custom components](/usage/processing-pipelines#custom-components) or a custom
|
||||
language class, you can also **ship the code with your package** and include it
|
||||
in the `__init__.py` – for example, to register a component before the `nlp`
|
||||
object is created.
|
||||
If your pipeline includes
|
||||
[custom components](/usage/processing-pipelines#custom-components), model
|
||||
architectures or other [code](/usage/training#custom-code), those functions need
|
||||
to be registered **before** your pipeline is loaded. Otherwise, spaCy won't know
|
||||
how to create the objects referenced in the config. The
|
||||
[`spacy package`](/api/cli#package) command lets you provide one or more paths
|
||||
to Python files containing custom registered functions using the `--code`
|
||||
argument.
|
||||
|
||||
> #### \_\_init\_\_.py (excerpt)
|
||||
>
|
||||
> ```python
|
||||
> from . import functions
|
||||
>
|
||||
> def load(**overrides):
|
||||
> ...
|
||||
> ```
|
||||
|
||||
```cli
|
||||
$ python -m spacy package ./en_example_pipeline ./packages --code functions.py
|
||||
```
|
||||
|
||||
The Python files will be copied over into the root of the package, and the
|
||||
package's `__init__.py` will import them as modules. This ensures that functions
|
||||
are registered when the pipeline is imported, e.g. when you call `spacy.load`. A
|
||||
simple import is all that's needed to make registered functions available.
|
||||
|
||||
Make sure to include **all Python files** that are referenced in your custom
|
||||
code, including modules imported by others. If your custom code depends on
|
||||
**external packages**, make sure they're listed in the list of `"requirements"`
|
||||
in your [`meta.json`](/api/data-formats#meta). For the majority of use cases,
|
||||
registered functions should provide you with all customizations you need, from
|
||||
custom components to custom model architectures and lifecycle hooks. However, if
|
||||
you do want to customize the setup in more detail, you can edit the package's
|
||||
`__init__.py` and the package's `load` function that's called by
|
||||
[`spacy.load`](/api/top-level#spacy.load).
|
||||
|
||||
<Infobox variant="warning" title="Important note on making manual edits">
|
||||
|
||||
|
|
|
@ -581,10 +581,14 @@ value for it.
|
|||
|
||||
### Training with custom code {#custom-code}
|
||||
|
||||
> #### Example
|
||||
> ```cli
|
||||
> ### Training
|
||||
> $ python -m spacy train config.cfg --code functions.py
|
||||
> ```
|
||||
>
|
||||
> ```cli
|
||||
> $ python -m spacy train config.cfg --code functions.py
|
||||
> ### Packaging
|
||||
> $ python -m spacy package ./model-best ./packages --code functions.py
|
||||
> ```
|
||||
|
||||
The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
|
||||
|
@ -592,7 +596,13 @@ The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
|
|||
allows you to add custom functions and architectures to the function registry
|
||||
that can then be referenced from your `config.cfg`. This lets you train spaCy
|
||||
pipelines with custom components, without having to re-implement the whole
|
||||
training workflow.
|
||||
training workflow. When you package your trained pipeline later using
|
||||
[`spacy package`](/api/cli#package), you can provide one or more Python files to
|
||||
be included in the package and imported in its `__init__.py`. This means that
|
||||
any custom architectures, functions or
|
||||
[components](/usage/processing-pipelines#custom-components) will be shipped with
|
||||
your pipeline and registered when it's loaded. See the documentation on
|
||||
[saving and loading pipelines](/usage/saving-loading#models-custom) for details.
|
||||
|
||||
#### Example: Modifying the nlp object {#custom-code-nlp-callbacks}
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user