diff --git a/website/docs/api/cython.md b/website/docs/api/cython.md index f91909747..d7c03cf41 100644 --- a/website/docs/api/cython.md +++ b/website/docs/api/cython.md @@ -122,7 +122,7 @@ where the rescuers keep passing out from low oxygen, causing another rescuer to follow — only to succumb themselves. In short, just say no to optimizing your Python. If it's not fast enough the first time, just switch to Cython. - + - [Official Cython documentation](http://docs.cython.org/en/latest/) (cython.org) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index d8abc4a10..10fef6ba6 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -85,7 +85,7 @@ hood. For details on how to use training configs, see the -The `@` notation lets you refer to function names registered in the +The `@` syntax lets you refer to function names registered in the [function registry](/api/top-level#registry). For example, `@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block @@ -96,6 +96,7 @@ API details. + ## Lexical data for vocabulary {#vocab-jsonl new="2"} diff --git a/website/docs/models/index.md b/website/docs/models/index.md index 10910b93b..8d8e0374e 100644 --- a/website/docs/models/index.md +++ b/website/docs/models/index.md @@ -27,7 +27,7 @@ import QuickstartModels from 'widgets/quickstart-models.js' - + For more details on how to use models with spaCy, see the [usage guide on models](/usage/models). diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 99612a6bb..9c028ce61 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -28,7 +28,7 @@ import PosDeps101 from 'usage/101/\_pos-deps.md' - + For a list of the fine-grained and coarse-grained part-of-speech tags assigned by spaCy's models across different languages, see the label schemes documented @@ -287,7 +287,7 @@ for token in doc: | their | `ADJ` | `poss` | requests | | requests | `NOUN` | `dobj` | submit | - + For a list of the syntactic dependency labels assigned by spaCy's models across different languages, see the label schemes documented in the @@ -615,7 +615,7 @@ tokens containing periods intact (abbreviations like "U.S."). ![Language data architecture](../images/language_data.svg) - + For more details on the language-specific data, see the usage guide on [adding languages](/usage/adding-languages). diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index 8157e2c07..4c8bc1664 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -338,7 +338,7 @@ nlp = spacy.load("/path/to/en_core_web_sm") # load package from a directory doc = nlp("This is a sentence.") ``` - + You can use the [`info`](/api/cli#info) command or [`spacy.info()`](/api/top-level#spacy.info) method to print a model's meta data diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md index 32d6bf7a2..fc335ac5d 100644 --- a/website/docs/usage/processing-pipelines.md +++ b/website/docs/usage/processing-pipelines.md @@ -34,7 +34,7 @@ texts = ["This is a text", "These are lots of texts", "..."] + docs = list(nlp.pipe(texts)) ``` - + - Process the texts **as a stream** using [`nlp.pipe`](/api/language#pipe) and buffer them in batches, instead of one-by-one. 
This is usually much more
@@ -912,7 +912,7 @@ new_heads = [head - i - 1 if head != 0 else 0 for i, head in enumerate(heads)]
-
+
 For more details on how to write and package custom components, make them
 available to spaCy via entry points and implement your own serialization
diff --git a/website/docs/usage/projects.md b/website/docs/usage/projects.md
index 2631f1438..5c2c84d79 100644
--- a/website/docs/usage/projects.md
+++ b/website/docs/usage/projects.md
@@ -1,5 +1,158 @@
 ---
 title: Projects
+new: 3
+menu:
+  - ['Intro & Workflow', 'intro']
+  - ['Directory & Assets', 'directory']
+  - ['Custom Projects', 'custom']
 ---
 
-TODO: write
+> #### Project templates
+>
+> Our [`projects`](https://github.com/explosion/projects) repo includes various
+> project templates for different tasks and models that you can clone and run.
+
+
+
+spaCy projects let you manage and share **end-to-end spaCy workflows** for
+training, packaging and serving your custom models. You can start off by cloning
+a pre-defined project template, adjust it to fit your needs, load in your data,
+train a model, export it as a Python package and share the project templates
+with your team. Under the hood, projects use
+[Data Version Control](https://dvc.org) (DVC) to track and version inputs and
+outputs, and make sure you're only re-running what's needed. spaCy projects can
+be used via the new [`spacy project`](/api/cli#project) command. For an overview
+of the available project templates, check out the
+[`projects`](https://github.com/explosion/projects) repo.
+
+## Introduction and workflow {#intro}
+
+
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
+### 1. Clone a project template {#clone}
+
+The [`spacy project clone`](/api/cli#project-clone) command clones an existing
+project template and copies the files to a local directory. You can then run the
+project, e.g. to train a model, and edit the commands and scripts to build fully
+custom workflows.
+
+> #### Cloning under the hood
+>
+> To clone a project, spaCy calls into `git` and uses the "sparse checkout"
+> feature to only clone the relevant directory or directories.
+
+```bash
+$ python -m spacy project clone some_example_project
+```
+
+By default, the project will be cloned into the current working directory. You
+can specify an optional second argument to define the output directory. The
+`--repo` option lets you define a custom repo to clone from, if you don't want
+to use the spaCy [`projects`](https://github.com/explosion/projects) repo. You
+can also use any private repo you have access to with Git.
+
+If you plan on making the project a Git repo, you can set the `--git` flag to
+set it up automatically _before_ initializing DVC, so DVC can integrate with
+Git. This means that it will automatically add asset files to a `.gitignore` (so
+you never check assets into the repo, only the asset meta files).
+
+### 2. Fetch the project assets {#assets}
+
+Assets are data files your project needs – for example, the training and
+evaluation data or pretrained vectors and embeddings to initialize your model
+with.
+
+```bash
+cd some_example_project
+python -m spacy project assets
+```
+
+### 3. Run the steps {#run-all}
+
+```bash
+$ python -m spacy project run-all
+```
+
+### 4. Run single commands {#run}
+
+```bash
+$ python -m spacy project run visualize
+```
+
+## Project directory and assets {#directory}
+
+### project.yml {#project-yml}
+
+The project config, `project.yml`, defines the assets a project depends on, like
+datasets and pretrained weights, as well as a series of commands that can be run
+separately or as a pipeline – for instance, to preprocess the data, convert it
+to spaCy's format, train a model, evaluate it and export metrics, package it and
+spin up a quick web demo. It looks pretty similar to a config file used to
+define CI pipelines.
+
+
+
+### Files and directory structure {#project-files}
+
+A project directory created by [`spacy project clone`](/api/cli#project-clone)
+includes the following files and directories. They can optionally be
+pre-populated by a project template (most commonly used for metas, configs or
+scripts).
+
+```yaml
+### Project directory
+├── project.yml    # the project configuration
+├── dvc.yaml       # auto-generated Data Version Control config
+├── dvc.lock       # auto-generated Data Version Control lock file
+├── assets/        # downloaded data assets and DVC meta files
+├── metrics/       # output directory for evaluation metrics
+├── training/      # output directory for trained models
+├── corpus/        # output directory for training corpus
+├── packages/      # output directory for model Python packages
+├── notebooks/     # directory for Jupyter notebooks
+├── scripts/       # directory for scripts, e.g. referenced in commands
+├── metas/         # model meta.json templates used for packaging
+├── configs/       # model config.cfg files used for training
+└── ...            # any other files, like a requirements.txt etc.
+```
+
+When the project is initialized, spaCy will auto-generate a `dvc.yaml` based on
+the project config. The file is updated whenever the project config has changed
+and includes all commands defined in the `run` section of the project config.
+This allows DVC to track the inputs and outputs and know which steps need to be
+re-run.
+
+#### Why Data Version Control (DVC)?
+
+Data assets like training corpora or pretrained weights are at the core of any
+NLP project, but they're often difficult to manage: you can't just check them
+into your Git repo to version and keep track of them. And if you have multiple
+steps that depend on each other, like a preprocessing step that generates your
+training data, you need to make sure the data is always up-to-date, and re-run
+all steps of your process every time, just to be safe.
+
+[Data Version Control (DVC)](https://dvc.org) is a standalone open-source tool
+that integrates into your workflow like Git, builds a dependency graph for your
+data pipelines and tracks and caches your data files. If you're downloading data
+from an external source, like a storage bucket, DVC can tell whether the
+resource has changed. It can also determine whether to re-run a step, depending
+on whether its inputs have changed or not. All metadata can be checked into a Git
+repo, so you'll always be able to reproduce your experiments. `spacy project`
+uses DVC under the hood and you typically don't have to think about it if you
+don't want to. But if you do want to integrate with DVC more deeply, you can.
+Each spaCy project is also a regular DVC project.
+ +#### Checking projects into Git + +--- + +## Custom projects and scripts {#custom} diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md index e89e41586..392bcf0c0 100644 --- a/website/docs/usage/rule-based-matching.md +++ b/website/docs/usage/rule-based-matching.md @@ -552,7 +552,7 @@ component with different patterns, depending on your application: html_merger = BadHTMLMerger(nlp, path="/path/to/patterns.json") ``` - + For more details and examples of how to **create custom pipeline components** and **extension attributes**, see the diff --git a/website/docs/usage/spacy-101.md b/website/docs/usage/spacy-101.md index 0cfe404f2..245d4ef42 100644 --- a/website/docs/usage/spacy-101.md +++ b/website/docs/usage/spacy-101.md @@ -198,7 +198,7 @@ import Tokenization101 from 'usage/101/\_tokenization.md' - + To learn more about how spaCy's tokenization rules work in detail, how to **customize and replace** the default tokenizer and how to **add @@ -214,7 +214,7 @@ import PosDeps101 from 'usage/101/\_pos-deps.md' - + To learn more about **part-of-speech tagging** and rule-based morphology, and how to **navigate and use the parse tree** effectively, see the usage guides on @@ -229,7 +229,7 @@ import NER101 from 'usage/101/\_named-entities.md' - + To learn more about entity recognition in spaCy, how to **add your own entities** to a document and how to **train and update** the entity predictions @@ -245,7 +245,7 @@ import Vectors101 from 'usage/101/\_vectors-similarity.md' - + To learn more about word vectors, how to **customize them** and how to load **your own vectors** into spaCy, see the usage guide on @@ -259,7 +259,7 @@ import Pipelines101 from 'usage/101/\_pipelines.md' - + To learn more about **how processing pipelines work** in detail, how to enable and disable their components, and how to **create your own**, see the usage @@ -458,7 +458,7 @@ import Serialization101 from 'usage/101/\_serialization.md' - + To learn more about how to **save and load your own models**, see the usage guide on [saving and loading](/usage/saving-loading#models). @@ -471,7 +471,7 @@ import Training101 from 'usage/101/\_training.md' - + To learn more about **training and updating** models, how to create training data and how to improve spaCy's named entity recognition models, see the usage @@ -485,14 +485,6 @@ import LanguageData101 from 'usage/101/\_language-data.md' - - -To learn more about the individual components of the language data and how to -**add a new language** to spaCy in preparation for training a language model, -see the usage guide on [adding languages](/usage/adding-languages). - - - ## Lightning tour {#lightning-tour} The following examples and code snippets give you an overview of spaCy's diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 73adf4885..fd755c58b 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -4,8 +4,8 @@ next: /usage/projects menu: - ['Introduction', 'basics'] - ['CLI & Config', 'cli-config'] - - ['Custom Models', 'custom-models'] - ['Transfer Learning', 'transfer-learning'] + - ['Custom Models', 'custom-models'] - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -195,7 +195,7 @@ dropout = null - + For a full overview of spaCy's config format and settings, see the [training format documentation](/api/data-formats#config). 
The settings
@@ -206,26 +206,47 @@ available for the different architectures are documented with the
 
+
+#### Using registered functions {#config-functions}
+
+The training configuration defined in the config file doesn't have to only
+consist of static values. Some settings can also be **functions**. For instance,
+the `batch_size` can be a number that doesn't change, or a schedule, like a
+sequence of compounding values, which has been shown to be an effective trick
+(see [Smith et al., 2017](https://arxiv.org/abs/1711.00489)).
+
+```ini
+### With static value
+[training]
+batch_size = 128
+```
+
+To refer to a function instead, you can make `[training.batch_size]` its own
+section and use the `@` syntax to specify the function and its arguments – in
+this case [`compounding.v1`](https://thinc.ai/docs/api-schedules#compounding)
+defined in the [function registry](/api/top-level#registry). All other values
+defined in the block are passed to the function as keyword arguments when it's
+initialized. You can also use this mechanism to register
+[custom implementations and architectures](#custom-models) and reference them
+from your configs.
+
+> #### TODO
+>
+> TODO: something about how the tree is built bottom-up?
+
+```ini
+### With registered function
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 100
+stop = 1000
+compound = 1.001
+```
+
 ### Model architectures {#model-architectures}
 
-## Custom model implementations and architectures {#custom-models}
-
-
-
-
-
-Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
-sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
-mattis pretium.
-
-
-
-### Training with custom code
-
-
-
+
 
 ## Transfer learning {#transfer-learning}
 
@@ -245,6 +266,101 @@ visualize your model.
 
 
 
+## Custom model implementations and architectures {#custom-models}
+
+
+
+### Training with custom code {#custom-code}
+
+The [`spacy train`](/api/cli#train) recipe lets you specify an optional argument
+`--code` that points to a Python file. The file is imported before training and
+allows you to add custom functions and architectures to the function registry
+that can then be referenced from your `config.cfg`. This lets you train spaCy
+models with custom components, without having to re-implement the whole training
+workflow.
+
+For example, let's say you've implemented your own batch size schedule to use
+during training. The `@spacy.registry.schedules` decorator lets you register
+that function in the `schedules` [registry](/api/top-level#registry) and assign
+it a string name:
+
+> #### Why the version in the name?
+>
+> A big benefit of the config system is that it makes your experiments
+> reproducible. We recommend versioning the functions you register, especially
+> if you expect them to change (like a new model architecture). This way, you
+> know that a config referencing `v1` means a different function than a config
+> referencing `v2`.
+
+```python
+### functions.py
+import spacy

+@spacy.registry.schedules("my_custom_schedule.v1")
+def my_custom_schedule(start: int = 1, factor: float = 1.001):
+    while True:
+        yield start
+        start = start * factor
+```
+
+In your config, you can now reference the schedule in the
+`[training.batch_size]` block via `@schedules`. If a block contains a key
+starting with an `@`, it's interpreted as a reference to a function. All other
+settings in the block will be passed to the function as keyword arguments. Keep
+in mind that the config shouldn't have any hidden defaults and all arguments on
+the functions need to be represented in the config.
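To make the keyword-argument behavior concrete, here's a rough sketch of what resolving the `[training.batch_size]` block below amounts to. This isn't spaCy API – it just assumes the `functions.py` module from the example above is importable:

```python
### Rough equivalent of resolving the block below
# The function registered as "my_custom_schedule.v1" is looked up by name and
# called with the block's remaining keys as keyword arguments.
from functions import my_custom_schedule  # the module shown above

batch_sizes = my_custom_schedule(start=2, factor=1.005)
print(next(batch_sizes))  # 2
print(next(batch_sizes))  # 2 * 1.005
```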
+
+
+
+```ini
+### config.cfg (excerpt)
+[training.batch_size]
+@schedules = "my_custom_schedule.v1"
+start = 2
+factor = 1.005
+```
+
+You can now run [`spacy train`](/api/cli#train) with the `config.cfg` and your
+custom `functions.py` as the argument `--code`. Before loading the config, spaCy
+will import the `functions.py` module and your custom functions will be
+registered.
+
+```bash
+### Training with custom code {wrap="true"}
+python -m spacy train train.spacy dev.spacy config.cfg --output ./output --code ./functions.py
+```
+
+
+
+spaCy's configs are powered by our machine learning library Thinc's
+[configuration system](https://thinc.ai/docs/usage-config), which supports
+[type hints](https://docs.python.org/3/library/typing.html) and even
+[advanced type annotations](https://thinc.ai/docs/usage-config#advanced-types)
+using [`pydantic`](https://github.com/samuelcolvin/pydantic). If your registered
+function provides type hints, the values that are passed in will be checked
+against the expected types. For example, `start: int` in the example above will
+ensure that the value received as the argument `start` is an integer. If the
+value can't be cast to an integer, spaCy will raise an error.
+`start: pydantic.StrictInt` will force the value to be an integer and raise an
+error if it's not – for instance, if your config defines a float.
+
+
+
+### Defining custom architectures {#custom-architectures}
+
+
+
+### Wrapping PyTorch and TensorFlow {#custom-frameworks}
+
+
+
+
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. Phasellus interdum
+sodales lectus, ut sodales orci ullamcorper id. Sed condimentum neque ut erat
+mattis pretium.
+
+
+
 ## Parallel Training with Ray {#parallel-training}
 
diff --git a/website/docs/usage/vectors-embeddings.md b/website/docs/usage/vectors-embeddings.md
index 49b651d9e..c3a73d4db 100644
--- a/website/docs/usage/vectors-embeddings.md
+++ b/website/docs/usage/vectors-embeddings.md
@@ -186,7 +186,7 @@ underlying [`Lexeme`](/api/lexeme), while [`Doc.vector`](/api/doc#vector) and
 tokens. You can customize these behaviors by modifying the `doc.user_hooks`,
 `doc.user_span_hooks` and `doc.user_token_hooks` dictionaries.
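As a quick illustration of those hooks, here's a minimal sketch that overrides `Doc.similarity` and `Span.similarity` with a custom function. It's an illustrative example rather than part of the original guide, and it assumes a package with word vectors such as `en_core_web_md` is installed:

```python
### Customizing similarity via user hooks
import spacy

def dot_product_similarity(obj1, obj2):
    # Toy replacement for the default similarity: a plain dot product
    # instead of the cosine similarity spaCy returns by default.
    return float(obj1.vector.dot(obj2.vector))

nlp = spacy.load("en_core_web_md")
doc = nlp("Apples and oranges are fruit.")

# Hooks are set per Doc, e.g. from within a custom pipeline component
doc.user_hooks["similarity"] = dot_product_similarity
doc.user_span_hooks["similarity"] = dot_product_similarity

print(doc.similarity(doc[0:1]))       # Doc.similarity now uses the custom hook
print(doc[0:1].similarity(doc[2:3]))  # and so does Span.similarity
```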
- + For more details on **adding hooks** and **overwriting** the built-in `Doc`, `Span` and `Token` methods, see the usage guide on diff --git a/website/src/components/infobox.js b/website/src/components/infobox.js index 6af24a6ca..496dd2fbe 100644 --- a/website/src/components/infobox.js +++ b/website/src/components/infobox.js @@ -5,7 +5,7 @@ import classNames from 'classnames' import Icon from './icon' import classes from '../styles/infobox.module.sass' -const Infobox = ({ title, id, variant, className, children }) => { +const Infobox = ({ title, emoji, id, variant, className, children }) => { const infoboxClassNames = classNames(classes.root, className, { [classes.warning]: variant === 'warning', [classes.danger]: variant === 'danger', @@ -17,7 +17,14 @@ const Infobox = ({ title, id, variant, className, children }) => { {variant !== 'default' && ( )} - {title} + + {emoji && ( + + )} + {title} + )} {children} diff --git a/website/src/components/table.js b/website/src/components/table.js index ee0f5b1b1..4d49806ef 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -27,9 +27,9 @@ function getCellContent(children) { } function isDividerRow(children) { - if (children.length && children[0].props.name == 'td') { + if (children.length && children[0].props && children[0].props.name == 'td') { const tdChildren = children[0].props.children - if (!Array.isArray(tdChildren)) { + if (!Array.isArray(tdChildren) && tdChildren.props) { return tdChildren.props.name === 'em' } } diff --git a/website/src/styles/infobox.module.sass b/website/src/styles/infobox.module.sass index 2be59f33b..baf9919c3 100644 --- a/website/src/styles/infobox.module.sass +++ b/website/src/styles/infobox.module.sass @@ -31,6 +31,9 @@ position: relative bottom: -2px +.emoji + margin-right: 0.65em + .warning --color-theme: var(--color-yellow-dark) --color-theme-dark: var(--color-yellow-dark) diff --git a/website/src/styles/layout.sass b/website/src/styles/layout.sass index 56f1a5aa6..4b63324b9 100644 --- a/website/src/styles/layout.sass +++ b/website/src/styles/layout.sass @@ -25,7 +25,7 @@ --line-height-sm: 1.375 --line-height-md: 1.5 --line-height-lg: 1.9 - --line-height-code: 1.8 + --line-height-code: 1.7 // Spacing --spacing-xs: 1rem @@ -271,7 +271,7 @@ body color: var(--color-front) p - margin-bottom: var(--spacing-md) + margin-bottom: var(--spacing-sm) font-family: var(--font-primary) font-size: var(--font-size-md) line-height: var(--line-height-md) diff --git a/website/src/widgets/project.js b/website/src/widgets/project.js index f1c18cf7a..d46472706 100644 --- a/website/src/widgets/project.js +++ b/website/src/widgets/project.js @@ -15,14 +15,14 @@ const Project = ({ id, repo, children }) => { const url = `${repo || DEFAULT_REPO}/${id}` const title = ( <> - 🪐 Get started with a project template:{' '} + Get started with a project template:{' '} {id} ) return ( - + {children}