diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 455e31cc1..e4980c089 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -297,60 +297,41 @@ will not be available. ## Train {#train} - - Train a model. Expects data in spaCy's -[JSON format](/api/data-formats#json-input). On each epoch, a model will be -saved out to the directory. Accuracy scores and model details will be added to a -[`meta.json`](/usage/training#models-generating) to allow packaging the model -using the [`package`](/api/cli#package) command. +[binary format](/api/data-formats#training) and a +[config file](/api/data-formats#config) with all settings and hyperparameters. +Will save out the best model from all epochs, as well as the final model. The +`--code` argument can be used to provide a Python file that's imported before +the training process starts. This lets you register +[custom functions](/usage/training#custom-models) and architectures and refer to +them in your config, all while still using spaCy's built-in `train` workflow. If +you need to manage complex multi-step training workflows, check out the new +[spaCy projects](/usage/projects). + + + +As of spaCy v3.0, the `train` command doesn't take a long list of command-line +arguments anymore and instead expects a single +[`config.cfg` file](/usage/training#config) containing all settings for the +pipeline, training process and hyperparameters. 
+ + ```bash -$ python -m spacy train [lang] [output_path] [train_path] [dev_path] -[--base-model] [--pipeline] [--vectors] [--n-iter] [--n-early-stopping] -[--n-examples] [--use-gpu] [--version] [--meta-path] [--init-tok2vec] -[--parser-multitasks] [--entity-multitasks] [--gold-preproc] [--noise-level] -[--orth-variant-level] [--learn-tokens] [--textcat-arch] [--textcat-multilabel] -[--textcat-positive-label] [--verbose] +$ python -m spacy train [train_path] [dev_path] [config_path] [--output] +[--code] [--verbose] ``` -| Argument | Type | Description | -| --------------------------------------------------------------- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `output_path` | positional | Directory to store model in. Will be created if it doesn't exist. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--base-model`, `-b` 2.1 | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` 2.1 | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--replace-components`, `-R` | flag | Replace components from the base model. | -| `--vectors`, `-v` | option | Model to load vectors from. | -| `--n-iter`, `-n` | option | Number of iterations (default: `30`). | -| `--n-early-stopping`, `-ne` | option | Maximum number of training epochs without dev accuracy improvement. | -| `--n-examples`, `-ns` | option | Number of examples to use (defaults to `0` for all examples). | -| `--use-gpu`, `-g` | option | GPU ID or `-1` for CPU only (default: `-1`). | -| `--version`, `-V` | option | Model version. 
Will be written out to the model's `meta.json` after training. | -| `--meta-path`, `-m` 2 | option | Optional path to model [`meta.json`](/usage/training#models-generating). All relevant properties like `lang`, `pipeline` and `spacy_version` will be overwritten. | -| `--init-tok2vec`, `-t2v` 2.1 | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. | -| `--parser-multitasks`, `-pt` | option | Side objectives for parser CNN, e.g. `'dep'` or `'dep,tag'` | -| `--entity-multitasks`, `-et` | option | Side objectives for NER CNN, e.g. `'dep'` or `'dep,tag'` | -| `--width`, `-cw` 2.2.4 | option | Width of CNN layers of `Tok2Vec` component. | -| `--conv-depth`, `-cd` 2.2.4 | option | Depth of CNN layers of `Tok2Vec` component. | -| `--cnn-window`, `-cW` 2.2.4 | option | Window size for CNN layers of `Tok2Vec` component. | -| `--cnn-pieces`, `-cP` 2.2.4 | option | Maxout size for CNN layers of `Tok2Vec` component. | -| `--use-chars`, `-chr` 2.2.4 | flag | Whether to use character-based embedding of `Tok2Vec` component. | -| `--bilstm-depth`, `-lstm` 2.2.4 | option | Depth of BiLSTM layers of `Tok2Vec` component (requires PyTorch). | -| `--embed-rows`, `-er` 2.2.4 | option | Number of embedding rows of `Tok2Vec` component. | -| `--noise-level`, `-nl` | option | Float indicating the amount of corruption for data augmentation. | -| `--orth-variant-level`, `-ovl` 2.2 | option | Float indicating the orthography variation for data augmentation (e.g. `0.3` for making 30% of occurrences of some tokens subject to replacement). | -| `--gold-preproc`, `-G` | flag | Use gold preprocessing. | -| `--learn-tokens`, `-T` | flag | Make parser learn gold-standard tokenization by merging ] subtokens. Typically used for languages like Chinese. | -| `--textcat-multilabel`, `-TML` 2.2 | flag | Text classification classes aren't mutually exclusive (multilabel). 
| -| `--textcat-arch`, `-ta` 2.2 | option | Text classification model architecture. Defaults to `"bow"`. | -| `--textcat-positive-label`, `-tpl` 2.2 | option | Text classification positive label for binary classes with two labels. | -| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | -| `--verbose`, `-VV` 2.0.13 | flag | Show more detailed messages during training. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| **CREATES** | model, pickle | A spaCy model on each epoch. | +| Argument | Type | Description | +| ----------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- | +| `train_path` | positional | Location of training data in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. | +| `dev_path` | positional | Location of development data for evaluation in spaCy's [binary format](/api/data-formats#training). Can be a file or a directory of files. | +| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `--output`, `-o` | positional | Directory to store model in. Will be created if it doesn't exist. | +| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | +| `--verbose`, `-V` | flag | Show more detailed messages during training. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | model | The final model and the best model. | ## Pretrain {#pretrain new="2.1" tag="experimental"} @@ -507,12 +488,13 @@ so you don't have to run `python setup.py sdist` separately anymore. 
$ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] ``` -```bash -### Example -python -m spacy package /input /output -cd /output/en_model-0.0.0 -pip install dist/en_model-0.0.0.tar.gz -``` +> #### Example +> +> ```bash +> python -m spacy package /input /output +> cd /output/en_model-0.0.0 +> pip install dist/en_model-0.0.0.tar.gz +> ``` | Argument | Type | Description | | ------------------------------------------------ | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | @@ -525,14 +507,143 @@ pip install dist/en_model-0.0.0.tar.gz | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | directory | A Python package containing the spaCy model. | -## Project {#project} +## Project {#project new="3"} +The `spacy project` CLI includes subcommands for working with +[spaCy projects](/usage/projects), end-to-end workflows for building and +deploying custom spaCy models. + ### project clone {#project-clone} +Clone a project template from a Git repository. Calls into `git` under the hood +and uses the sparse checkout feature, so you're only downloading what you need. +By default, spaCy's +[project templates repo](https://github.com/explosion/projects) is used, but you +can provide any other repo (public or private) that you have access to using the +`--repo` option. 
+ + + +```bash +$ python -m spacy project clone [name] [dest] [--repo] +``` + +> #### Example +> +> ```bash +> $ python -m spacy project clone some_example +> ``` +> +> Clone from custom repo: +> +> ```bash +> $ python -m spacy project clone template --repo https://github.com/your_org/your_repo +> ``` + +| Argument | Type | Description | +| -------------- | ---------- | ---------------------------------------------------------------------------------------------------------------------------- | +| `name` | positional | The name of the template to clone, relative to the repo. Can be a top-level directory or a subdirectory like `dir/template`. | +| `dest` | positional | Where to clone the project. Defaults to current working directory. | +| `--repo`, `-r` | option | The repository to clone from. Can be any public or private Git repo you have access to. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | directory | The cloned [project directory](/usage/projects#project-files). | + ### project assets {#project-assets} +Fetch project assets like datasets and pretrained weights. Assets are defined in +the `assets` section of the [`project.yml`](/usage/projects#project-yml). If a +`checksum` is provided, the file is only downloaded if no local file with the +same checksum exists and spaCy will show an error if the checksum of the +downloaded file doesn't match. If assets don't specify a `url` they're +considered "private" and you have to take care of putting them into the +destination directory yourself. If a local path is provided, the asset is copied +into the current project. + + + +```bash +$ python -m spacy project assets [project_dir] +``` + +> #### Example +> +> ```bash +> $ python -m spacy project assets +> ``` + +| Argument | Type | Description | +| -------------- | ---------- | ----------------------------------------------------------------- | +| `project_dir` | positional | Path to project directory. 
Defaults to current working directory. | +| `--help`, `-h` | flag | Show help message and available arguments. | +| **CREATES** | files | Downloaded or copied assets defined in the `project.yml`. | + ### project run {#project-run} +Run a named command or workflow defined in the +[`project.yml`](/usage/projects#project-yml). If a workflow name is specified, +all commands in the workflow are run, in order. If commands define +[dependencies or outputs](/usage/projects#deps-outputs), they will only be +re-run if state has changed. For example, if the input dataset changes, a +preprocessing command that depends on those files will be re-run. + + + +```bash +$ python -m spacy project run [subcommand] [project_dir] [--force] [--dry] +``` + +> #### Example +> +> ```bash +> $ python -m spacy project run train +> ``` + +| Argument | Type | Description | +| --------------- | ---------- | ----------------------------------------------------------------- | +| `subcommand` | positional | Name of the command or workflow to run. | +| `project_dir` | positional | Path to project directory. Defaults to current working directory. | +| `--force`, `-F` | flag | Force re-running steps, even if nothing changed. | +| `--dry`, `-D` | flag | Β Perform a dry run and don't execute scripts. | +| `--help`, `-h` | flag | Show help message and available arguments. | + ### project dvc {#project-dvc} + +Auto-generate [Data Version Control](https://dvc.org) (DVC) config file. Calls +[`dvc run`](https://dvc.org/doc/command-reference/run) with `--no-exec` under +the hood to generate the `dvc.yaml`. A DVC project can only define one pipeline, +so you need to specify one workflow defined in the +[`project.yml`](/usage/projects#project-yml). If no workflow is specified, the +first defined workflow is used. The DVC config will only be updated if the +`project.yml` changed. For details, see the +[DVC integration](/usage/projects#dvc) docs. 
+ + + +This command requires DVC to be installed and initialized in the project +directory, e.g. via [`dvc init`](https://dvc.org/doc/command-reference/init). +You'll also need to add the assets you want to track with +[`dvc add`](https://dvc.org/doc/command-reference/add). + + + +```bash +$ python -m spacy project dvc [project_dir] [workflow] [--force] [--verbose] +``` + +> #### Example +> +> ```bash +> git init +> dvc init +> python -m spacy project dvc all +> ``` + +| Argument | Type | Description | +| ----------------- | ---------- | --------------------------------------------------------------------------------- | +| `project_dir` | positional | Path to project directory. Defaults to current working directory. | +| `workflow` | positional | Name of workflow defined in `project.yml`. Defaults to first workflow if not set. | +| `--force`, `-F` | flag | Force-updating config file. | +| `--verbose`, `-V` | flag | Β Print more output generated by DVC. | +| `--help`, `-h` | flag | Show help message and available arguments. | diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md index 5c2c84d79..c5335dc2e 100644 --- a/website/docs/usage/projects.md +++ b/website/docs/usage/projects.md @@ -5,25 +5,29 @@ menu: - ['Intro & Workflow', 'intro'] - ['Directory & Assets', 'directory'] - ['Custom Projects', 'custom'] + - ['Integrations', 'integrations'] --- -> #### Project templates +> #### πŸͺ Project templates > > Our [`projects`](https://github.com/explosion/projects) repo includes various -> project templates for different tasks and models that you can clone and run. - - +> project templates for different NLP tasks, models, workflows and integrations +> that you can clone and run. The easiest way to get started is to pick a +> template, clone it and start modifying it! spaCy projects let you manage and share **end-to-end spaCy workflows** for -training, packaging and serving your custom models. 
You can start off by cloning -a pre-defined project template, adjust it to fit your needs, load in your data, -train a model, export it as a Python package and share the project templates -with your team. Under the hood, project use -[Data Version Control](https://dvc.org) (DVC) to track and version inputs and -outputs, and make sure you're only re-running what's needed. spaCy projects can -be used via the new [`spacy project`](/api/cli#project) command. For an overview -of the available project templates, check out the -[`projects`](https://github.com/explosion/projects) repo. +different **use cases and domains**, and orchestrate training, packaging and +serving your custom models. You can start off by cloning a pre-defined project +template, adjust it to fit your needs, load in your data, train a model, export +it as a Python package and share the project templates with your team. spaCy +projects can be used via the new [`spacy project`](/api/cli#project) command. +For an overview of the available project templates, check out the +[`projects`](https://github.com/explosion/projects) repo. spaCy projects also +[integrate](#integrations) with many other cool machine learning and data +science tools to track and manage your data and experiments, iterate on demos +and prototypes and ship your models into production. + + ## Introduction and workflow {#intro} @@ -37,18 +41,32 @@ mattis pretium. +spaCy projects make it easy to integrate with many other **awesome tools** in +the data science and machine learning ecosystem to track and manage your data +and experiments, iterate on demos and prototypes and ship your models into +production. + + +Manage and version your data +Create labelled training data +Visualize and demo your models +Serve your models and host APIs +Distributed and parallel training +Track your experiments and results + + ### 1. 
Clone a project template {#clone} +> #### Cloning under the hoodimport { ReactComponent as WandBLogo } from '../images/logos/wandb.svg' +> +> To clone a project, spaCy calls into `git` and uses the "sparse checkout" +> feature to only clone the relevant directory or directories. + The [`spacy project clone`](/api/cli#project-clone) command clones an existing project template and copies the files to a local directory. You can then run the project, e.g. to train a model and edit the commands and scripts to build fully custom workflows. -> #### Cloning under the hood -> -> To clone a project, spaCy calls into `git` and uses the "sparse checkout" -> feature to only clone the relevant directory or directories. - ```bash $ python -m spacy clone some_example_project ``` @@ -59,46 +77,169 @@ can specify an optional second argument to define the output directory. The to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You can also use any private repo you have access to with Git. -If you plan on making the project a Git repo, you can set the `--git` flag to -set it up automatically _before_ initializing DVC, so DVC can integrate with -Git. This means that it will automatically add asset files to a `.gitignore` (so -you never check assets into the repo, only the asset meta files). - ### 2. Fetch the project assets {#assets} +> #### project.yml +> +> ```yaml +> assets: +> - dest: 'assets/training.spacy' +> url: 'https://example.com/data.spacy' +> checksum: '63373dd656daa1fd3043ce166a59474c' +> ``` + Assets are data files your project needs – for example, the training and evaluation data or pretrained vectors and embeddings to initialize your model -with. +with. Each project template comes with a `project.yml` that defines the assets +to download and where to put them. The +[`spacy project assets`](/api/cli#project-assets) will fetch the project assets +for you: ```bash cd some_example_project python -m spacy project assets ``` -### 3. 
Run the steps {#run-all} +### 3. Run a command {#run} + +> #### project.yml +> +> ```yaml +> commands: +> - name: preprocess +> help: "Convert the input data to spaCy's format" +> script: +> - 'python -m spacy convert assets/train.conllu corpus/' +> - 'python -m spacy convert assets/eval.conllu corpus/' +> deps: +> - 'assets/train.conllu' +> - 'assets/eval.conllu' +> outputs: +> - 'corpus/train.spacy' +> - 'corpus/eval.spacy' +> ``` + +Commands consist of one or more steps and can be run with +[`spacy project run`](/api/cli#project-run). The following will run the command +`preprocess` defined in the `project.yml`: ```bash -$ python -m spacy project run-all +$ python -m spacy project run preprocess ``` -### 4. Run single commands {#run} +Commands can define their expected [dependencies and outputs](#deps-outputs) +using the `deps` (files the commands require) and `outputs` (files the commands +create) keys. This allows your project to track changes and determine whether a +command needs to be re-run. For instance, if your input data changes, you want +to re-run the `preprocess` command. But if nothing changed, this step can be +skipped. You can also set `--force` to force re-running a command, or `--dry` to +perform a "dry run" and see what would happen (without actually running the +script). + +### 4. Run a workflow {#run-workfow} + +> #### project.yml +> +> ```yaml +> workflows: +> all: +> - preprocess +> - train +> - package +> ``` + +Workflows are series of commands that are run in order and often depend on each +other. For instance, to generate a packaged model, you might start by converting +your data, then run [`spacy train`](/api/cli#train) to train your model on the +converted data and if that's successful, run [`spacy package`](/api/cli#package) +to turn the best model artifact into an installable Python package. 
The +following command runs the workflow named `all` defined in the `project.yml`, and +executes the commands it specifies, in order:
- + + +```yaml +https://github.com/explosion/spacy-boilerplates/blob/master/ner_fashion/project.yml +``` + +| Section | Description | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| `variables` | A dictionary of variables that can be referenced in paths, URLs and scripts. For example, `{NAME}` will use the value of the variable `NAME`. | +| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. | +| `workflows` | A dictionary of workflow names, mapped to a list of command names, to execute in order. Workflows can be run with the [`project run`](/api/cli#project-run) command. | +| `commands` | A list of named commands. A command can define an optional help message (shown in the CLI when the user adds `--help`) and the `script`, a list of commands to run. The `deps` and `outputs` let you define the created file the command depends on and produces, respectively. This lets spaCy determine whether a command needs to be re-run because its dependencies or outputs changed. Commands can be run as part of a workflow, or separately with the [`project run`](/api/cli#project-run) command. | + +### Dependencies and outputs {#deps-outputs} + +Each command defined in the `project.yml` can optionally define a list of +dependencies and outputs. 
These are the files the command requires and creates.
If you're planning on integrating your spaCy project with DVC, you +can also use `outputs_no_cache` instead of `outputs` to define outputs that +won't be cached or tracked. ### Files and directory structure {#project-files} @@ -109,10 +250,9 @@ scripts). ```yaml ### Project directory -β”œβ”€β”€ project.yml # the project configuration -β”œβ”€β”€ dvc.yaml # auto-generated Data Version Control config -β”œβ”€β”€ dvc.lock # auto-generated Data Version control lock file -β”œβ”€β”€ assets/ # downloaded data assets and DVC meta files +β”œβ”€β”€ project.yml # the project settings +β”œβ”€β”€ project.lock # lockfile that tracks inputs/outputs +β”œβ”€β”€ assets/ # downloaded data assets β”œβ”€β”€ metrics/ # output directory for evaluation metrics β”œβ”€β”€ training/ # output directory for trained models β”œβ”€β”€ corpus/ # output directory for training corpus @@ -125,13 +265,89 @@ scripts). └── ... # any other files, like a requirements.txt etc. ``` -When the project is initialized, spaCy will auto-generate a `dvc.yaml` based on -the project config. The file is updated whenever the project config has changed -and includes all commands defined in the `run` section of the project config. -This allows DVC to track the inputs and outputs and know which steps need to be -re-run. +--- -#### Why Data Version Control (DVC)? +## Custom scripts and projects {#custom} + +The `project.yml` lets you define any custom commands and run them as part of +your training, evaluation or deployment workflows. The `script` section defines +a list of commands that are called in a subprocess, in order. This lets you +execute other Python scripts or command-line tools. Let's say you've written a +few integration tests that load the best model produced by the training command +and check that it works correctly. 
You can now define a `test` command that +calls into [`pytest`](https://docs.pytest.org/en/latest/) and runs your tests: + +> #### Calling into Python +> +> If any of your command scripts call into `python`, spaCy will take care of +> replacing that with your `sys.executable`, to make sure you're executing +> everything with the same Python (not some other Python installed on your +> system). It also normalizes references to `python3`, `pip3` and `pip`. + +```yaml +### project.yml +commands: + - name: test + help: 'Test the trained model' + script: + - 'python -m pytest ./scripts/tests' + deps: + - 'training/model-best' +``` + +Adding `training/model-best` to the command's `deps` lets you ensure that the +file is available. If not, spaCy will show an error and the command won't run. + + + +### Cloning from your own repo {#custom-repo} + +The [`spacy project clone`](/api/cli#project-clone) command lets you customize +the repo to clone from using the `--repo` option. It calls into `git`, so you'll +be able to clone from any repo that you have access to, including private repos. + +```bash +$ python -m spacy project your_project --repo https://github.com/you/repo +``` + +At a minimum, a valid project template needs to contain a +[`project.yml`](#project-yml). It can also include +[other files](/usage/projects#project-files), like custom scripts, a +`requirements.txt` listing additional dependencies, +[training configs](/usage/training#config) and model meta templates, or Jupyter +notebooks with usage examples. + + + +It's typically not a good idea to check large data assets, trained models or +other artifacts into a Git repo and you should exclude them from your project +template. If you want to version your data and models, check out +[Data Version Control](#dvc) (DVC), which integrates with spaCy projects. 
+ + + +### Working with private assets {#private-assets} + +For many projects, the datasets and weights you're working with might be +company-internal and not available via a public URL. In that case, you can +specify the destination paths and a checksum, and leave out the URL. When your +teammates clone and run your project, they can place the files in the respective +directory themselves. The [`spacy project assets`](/api/cli#project-assets) +command will alert about missing files and mismatched checksums, so you can +ensure that others are running your project with the same data. + +```yaml +### project.yml +assets: + - dest: 'assets/private_training_data.json' + checksum: '63373dd656daa1fd3043ce166a59474c' + - dest: 'assets/private_vectors.bin' + checksum: '5113dc04e03f079525edd8df3f4f39e3' +``` + +## Integrations {#integrations} + +### Data Version Control (DVC) {#dvc} Data assets like training corpora or pretrained weights are at the core of any NLP project, but they're often difficult to manage: you can't just check them @@ -140,19 +356,187 @@ steps that depend on each other, like a preprocessing step that generates your training data, you need to make sure the data is always up-to-date, and re-run all steps of your process every time, just to be safe. -[Data Version Control (DVC)](https://dvc.org) is a standalone open-source tool +[Data Version Control](https://dvc.org) (DVC) is a standalone open-source tool that integrates into your workflow like Git, builds a dependency graph for your data pipelines and tracks and caches your data files. If you're downloading data from an external source, like a storage bucket, DVC can tell whether the resource has changed. It can also determine whether to re-run a step, depending on whether its input have changed or not. All metadata can be checked into a Git -repo, so you'll always be able to reproduce your experiments. 
`spacy project` -uses DVC under the hood and you typically don't have to think about it if you -don't want to. But if you do want to integrate with DVC more deeply, you can. -Each spaCy project is also a regular DVC project. +repo, so you'll always be able to reproduce your experiments. -#### Checking projects into Git +To set up DVC, install the package and initialize your spaCy project as a Git +and DVC repo. You can also +[customize your DVC installation](https://dvc.org/doc/install/macos#install-with-pip) +to include support for remote storage like Google Cloud Storage, S3, Azure, SSH +and more. + +```bash +pip install dvc # Install DVC +git init # Initialize a Git repo +dvc init # Initialize a DVC project +``` + +The [`spacy project dvc`](/api/cli#project-dvc) command creates a `dvc.yaml` +config file based on a workflow defined in your `project.yml`. Whenever you +update your project, you can re-run the command to update your DVC config. You +can then manage your spaCy project like any other DVC project, run +[`dvc add`](https://dvc.org/doc/command-reference/add) to add and track assets +and [`dvc repro`](https://dvc.org/doc/command-reference/repro) to reproduce the +workflow or individual commands. + +```bash +$ python -m spacy project dvc [workflow name] +``` + + + +DVC currently expects a single workflow per project, so when creating the config +with [`spacy project dvc`](/api/cli#project-dvc), you need to specify the name +of a workflow defined in your `project.yml`. You can still use multiple +workflows, but only one can be tracked by DVC. + + + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. + + --- -## Custom projects and scripts {#custom} +### Prodigy {#prodigy} + +[Prodigy](https://prodi.gy) is a modern annotation tool for creating training +data for machine learning models, developed by us. 
It integrates with spaCy +out-of-the-box and provides many different +[annotation recipes](https://prodi.gy/docs/recipes) for a variety of NLP tasks, +with and without a model in the loop. If Prodigy is installed in your project, +you can + +The following example command starts the Prodigy app using the +[`ner.correct`](https://prodi.gy/docs/recipes#ner-correct) recipe and streams in +suggestions for the given entity labels produced by a pretrained model. You can +then correct the suggestions manually in the UI. After you save and exit the +server, the full dataset is exported in spaCy's format and split into a training +and evaluation set. + + +```yaml +### project.yml +variables: + PRODIGY_DATASET: 'ner_articles' + PRODIGY_LABELS: 'PERSON,ORG,PRODUCT' + PRODIGY_MODEL: 'en_core_web_md' + +commands: + - name: annotate + - script: + - 'python -m prodigy ner.correct {PRODIGY_DATASET} ./assets/raw_data.jsonl + {PRODIGY_MODEL} --labels {PRODIGY_LABELS}' + - 'python -m prodigy data-to-spacy ./corpus/train.spacy + ./corpus/eval.spacy --ner {PRODIGY_DATASET}' + - deps: + - 'assets/raw_data.jsonl' + - outputs: + - 'corpus/train.spacy' + - 'corpus/eval.spacy' +``` + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. + + + +--- + +### Streamlit {#streamlit} + + + +
+ +[Streamlit](https://streamlit.io) is a Python framework for building interactive +data apps. The [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit) +package helps you integrate spaCy visualizations into your Streamlit apps and +quickly spin up demos to explore your models interactively. It includes a full +embedded visualizer, as well as individual components. + +```bash +$ pip install spacy_streamlit +``` + +
+ +![](../images/spacy-streamlit.png) + +
+ +Using [`spacy-streamlit`](https://github.com/explosion/spacy-streamlit), your +projects can easily define their own scripts that spin up an interactive +visualizer, using the latest model you trained, or a selection of models so you +can compare their results. The following script starts an +[NER visualizer](/usage/visualizers#ent) and takes two positional command-line +argument you can pass in from your `config.yml`: a comma-separated list of model +paths and an example text to use as the default text. + +```python +### scripts/visualize.py +import spacy_streamlit +import sys + +DEFAULT_TEXT = sys.argv[2] if len(sys.argv) >= 3 else "" +MODELS = [name.strip() for name in sys.argv[1].split(",")] +spacy_streamlit.visualize(MODELS, DEFAULT_TEXT, visualizers=["ner"]) +``` + + +```yaml +### project.yml +commands: + - name: visualize + help: "Visualize the model's output interactively using Streamlit" + script: + - 'streamlit run ./scripts/visualize.py ./training/model-best "I like Adidas shoes."' + deps: + - 'training/model-best' +``` + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. + + + +--- + +### FastAPI {#fastapi} + + + + + +Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum +sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat +mattis pretium. 
+ + + +--- + +### Ray {#ray} + + + +--- + +### Weights & Biases {#wandb} + + diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index 6b533b739..5db741d52 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -4,7 +4,7 @@ teaser: Visualize dependencies and entities in your browser or in a notebook new: 2 menu: - ['Dependencies', 'dep'] - - ['Entities', 'ent'] + - ['Named Entities', 'ent'] - ['Jupyter Notebooks', 'jupyter'] - ['Rendering HTML', 'html'] - ['Web app usage', 'webapp'] @@ -356,6 +356,6 @@ Alternatively, if you're using [Streamlit](https://streamlit.io), check out the helps you integrate spaCy visualizations into your apps. It includes a full embedded visualizer, as well as individual components. -![](../images/spacy-streamlit.png)] +![](../images/spacy-streamlit.png) diff --git a/website/src/components/card.js b/website/src/components/card.js index ca4619b06..fee381c5e 100644 --- a/website/src/components/card.js +++ b/website/src/components/card.js @@ -1,29 +1,32 @@ import React from 'react' import PropTypes from 'prop-types' +import classNames from 'classnames' import Link from './link' import { H5 } from './typography' import classes from '../styles/card.module.sass' -const Card = ({ title, to, image, header, onClick, children }) => ( -
+const Card = ({ title, to, image, header, small, onClick, children }) => ( +
{header && ( {header} )} -
- {image && ( -
- -
- )} - {title && ( - - {title} - - )} -
+ {(title || image) && ( +
+ {image && ( +
+ +
+ )} + {title && ( + + {title} + + )} +
+ )} {children} @@ -31,10 +34,10 @@ const Card = ({ title, to, image, header, onClick, children }) => ( ) Card.propTypes = { - title: PropTypes.string, + title: PropTypes.node, + header: PropTypes.node, to: PropTypes.string, image: PropTypes.string, - card: PropTypes.node, onClick: PropTypes.func, children: PropTypes.node, } diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js index 496dd2fbe..06c5fbb95 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -37,7 +37,7 @@ Infobox.defaultProps = { } Infobox.propTypes = { - title: PropTypes.string, + title: PropTypes.node, id: PropTypes.string, variant: PropTypes.oneOf(['default', 'warning', 'danger']), className: PropTypes.string, diff --git a/website/src/components/table.js b/website/src/components/table.js index 4d49806ef..1a7d460d0 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -29,7 +29,7 @@ function getCellContent(children) { function isDividerRow(children) { if (children.length && children[0].props && children[0].props.name == 'td') { const tdChildren = children[0].props.children - if (!Array.isArray(tdChildren) && tdChildren.props) { + if (tdChildren && !Array.isArray(tdChildren) && tdChildren.props) { return tdChildren.props.name === 'em' } } diff --git a/website/src/images/logos/dvc.svg b/website/src/images/logos/dvc.svg new file mode 100644 index 000000000..258ab1374 --- /dev/null +++ b/website/src/images/logos/dvc.svg @@ -0,0 +1,5 @@ + + + + + diff --git a/website/src/images/logos/fastapi.svg b/website/src/images/logos/fastapi.svg new file mode 100644 index 000000000..bdd514a4b --- /dev/null +++ b/website/src/images/logos/fastapi.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/website/src/images/logos/prodigy.svg b/website/src/images/logos/prodigy.svg new file mode 100644 index 000000000..3f318b793 --- /dev/null +++ b/website/src/images/logos/prodigy.svg @@ -0,0 +1,3 @@ + + + diff --git 
a/website/src/images/logos/ray.svg b/website/src/images/logos/ray.svg new file mode 100644 index 000000000..3e7390dce --- /dev/null +++ b/website/src/images/logos/ray.svg @@ -0,0 +1,4 @@ + + + + diff --git a/website/src/images/logos/streamlit.svg b/website/src/images/logos/streamlit.svg new file mode 100644 index 000000000..3c55deb55 --- /dev/null +++ b/website/src/images/logos/streamlit.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/website/src/images/logos/wandb.svg b/website/src/images/logos/wandb.svg new file mode 100644 index 000000000..e3f8ea7fa --- /dev/null +++ b/website/src/images/logos/wandb.svg @@ -0,0 +1,28 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/website/src/styles/card.module.sass b/website/src/styles/card.module.sass index d9e0633cf..629607bd5 100644 --- a/website/src/styles/card.module.sass +++ b/website/src/styles/card.module.sass @@ -5,6 +5,15 @@ font: var(--font-size-md)/var(--line-height-md) var(--font-primary) margin-bottom: var(--spacing-sm) +.small + padding: 1.5rem + font-size: var(--font-size-sm) + line-height: var(--line-height-sm) + color: var(--color-dark) + + .title + margin-bottom: var(--spacing-xs) + .image $image-size: 35px width: $image-size diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 7f9314d9d..c97663317 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -33,6 +33,7 @@ import { YouTube, SoundCloud, Iframe, Image } from '../components/embed' import Alert from '../components/alert' import Search from '../components/search' import Project from '../widgets/project' +import { Integration, IntegrationLogo } from '../widgets/integration' const mdxComponents = { a: Link, @@ -75,6 +76,8 @@ const scopeComponents = { Grid, InlineCode, Project, + Integration, + IntegrationLogo, } const AlertSpace = ({ nightly }) => { diff --git a/website/src/widgets/integration.js b/website/src/widgets/integration.js new file mode 100644 index 
000000000..50a84f26c --- /dev/null +++ b/website/src/widgets/integration.js @@ -0,0 +1,46 @@ +import React from 'react' + +import Card from '../components/card' + +import { ReactComponent as DVCLogo } from '../images/logos/dvc.svg' +import { ReactComponent as ProdigyLogo } from '../images/logos/prodigy.svg' +import { ReactComponent as StreamlitLogo } from '../images/logos/streamlit.svg' +import { ReactComponent as FastAPILogo } from '../images/logos/fastapi.svg' +import { ReactComponent as WandBLogo } from '../images/logos/wandb.svg' +import { ReactComponent as RayLogo } from '../images/logos/ray.svg' + +const LOGOS = { + dvc: DVCLogo, + prodigy: ProdigyLogo, + streamlit: StreamlitLogo, + fastapi: FastAPILogo, + wandb: WandBLogo, + ray: RayLogo, +} + +export const IntegrationLogo = ({ name, title, width, height, maxWidth, align, ...props }) => { + const Logo = LOGOS[name] + if (!Logo) throw new Error(`Unknown logo: ${name}`) + const style = { maxWidth, float: align || 'none' } + return ( + + ) +} + +export const Integration = ({ height = 30, url, logo, title, children }) => { + const header = logo && ( + + ) + return ( + + {children} + + ) +}