From 44790c1c32f0ff4884b255b95004fa352d971ffd Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 6 Jul 2020 18:14:57 +0200 Subject: [PATCH] Update docs and add keyword-only tag --- website/docs/api/data-formats.md | 25 ++++++++++++++- website/docs/api/doc.md | 13 ++++---- website/docs/api/top-level.md | 47 +++++++++++++++++++++++++++- website/docs/usage/training.md | 43 +++++++++++++++++++------ website/src/components/table.js | 12 +++++++ website/src/styles/table.module.sass | 30 ++++++++++++++++++ 6 files changed, 153 insertions(+), 17 deletions(-) diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 5b122a2e2..d8abc4a10 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -2,7 +2,8 @@ title: Data formats teaser: Details on spaCy's input and output data formats menu: - - ['Training data', 'training'] + - ['Training Data', 'training'] + - ['Training Config', 'config'] - ['Vocabulary', 'vocab'] --- @@ -74,6 +75,28 @@ from the English Wall Street Journal portion of the Penn Treebank: https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json ``` +## Training config {#config new="3"} + +Config files define the training process and model pipeline and can be passed to +[`spacy train`](/api/cli#train). They use +[Thinc's configuration system](https://thinc.ai/docs/usage-config) under the +hood. For details on how to use training configs, see the +[usage documentation](/usage/training#config). + + + +The `@` notation lets you refer to function names registered in the +[function registry](/api/top-level#registry). For example, +`@architectures = "spacy.HashEmbedCNN.v1"` refers to a registered function of +the name `"spacy.HashEmbedCNN.v1"` and all other values defined in its block +will be passed into that function as arguments. Those arguments depend on the +registered function. See the [model architectures](/api/architectures) docs for +API details. 
+ + + + + ## Lexical data for vocabulary {#vocab-jsonl new="2"} To populate a model's vocabulary, you can use the diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index b5871f2ab..d0c758d7e 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -30,12 +30,13 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the > doc = Doc(nlp.vocab, words=words, spaces=spaces) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `vocab` | `Vocab` | A storage container for lexical types. | -| `words` | iterable | A list of strings to add to the container. | -| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | -| **RETURNS** | `Doc` | The newly constructed object. | +| Name | Type | Description | +| -------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `vocab` | `Vocab` | A storage container for lexical types. | +| _keyword-only_ | | | +| `words` | iterable | A list of strings to add to the container. | +| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | +| **RETURNS** | `Doc` | The newly constructed object. 
| ## Doc.\_\_getitem\_\_ {#getitem tag="method"} diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 9094b46d3..c8fea6a34 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -3,6 +3,7 @@ title: Top-level Functions menu: - ['spacy', 'spacy'] - ['displacy', 'displacy'] + - ['registry', 'registry'] - ['Data & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -259,6 +260,48 @@ package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +## registry {#registry source="spacy/util.py" new="3"} + +spaCy's function registry extends +[Thinc's `registry`](https://thinc.ai/docs/api-config#registry) and allows you +to map strings to functions. You can register functions to create architectures, +optimizers, schedules and more, and then refer to them and set their arguments +in your [config file](/usage/training#config). Python type hints are used to +validate the inputs. See the +[Thinc docs](https://thinc.ai/docs/api-config#registry) for details on the +`registry` methods and our helper library +[`catalogue`](https://github.com/explosion/catalogue) for some background on the +concept of function registries. spaCy also uses the function registry for +language subclasses, model architecture, lookups and pipeline component +factories. 
+ + + +> #### Example +> +> ```python +> import spacy +> from thinc.api import Model +> +> @spacy.registry.architectures("CustomNER.v1") +> def custom_ner(nO: int) -> Model: +> return Model("custom", forward, dims={"nO": nO}) +> ``` + +| Registry name | Description | +| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `architectures` | Registry for functions that create [model architectures](/api/architectures). Can be used to register custom model architectures and reference them in the `config.cfg`. | +| `factories` | Registry for functions that create [pipeline components](/usage/processing-pipelines#custom-components). Added automatically when you use the `@spacy.component` decorator and also reads from [entry points](/usage/saving-loading#entry-points). | +| `languages` | Registry for language-specific `Language` subclasses. Automatically reads from [entry points](/usage/saving-loading#entry-points). | +| `lookups` | Registry for large lookup tables available via `vocab.lookups`. | +| `displacy_colors` | Registry for custom color scheme for the [`displacy` NER visualizer](/usage/visualizers). Automatically reads from [entry points](/usage/saving-loading#entry-points). | +| `assets` | | +| `optimizers` | Registry for functions that create [optimizers](https://thinc.ai/docs/api-optimizers). | +| `schedules` | Registry for functions that create [schedules](https://thinc.ai/docs/api-schedules). | +| `layers` | Registry for functions that create [layers](https://thinc.ai/docs/api-layers). | +| `losses` | Registry for functions that create [losses](https://thinc.ai/docs/api-loss). | +| `initializers` | Registry for functions that create [initializers](https://thinc.ai/docs/api-initializers). 
| + ## Training data and alignment {#gold source="spacy/gold"} ### gold.docs_to_json {#docs_to_json tag="function"} @@ -421,6 +464,8 @@ page should be safe to use and we'll try to ensure backwards compatibility. However, we recommend having additional tests in place if your application depends on any of spaCy's utilities. + + ### util.get_lang_class {#util.get_lang_class tag="function"} Import and load a `Language` class. Allows lazy-loading @@ -705,7 +750,7 @@ of one entity) or when merging spans with | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. | -## util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} +### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 2bbf5dddd..73adf4885 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -103,26 +103,38 @@ still look good. > #### Migration from spaCy v2.x > -> TODO: ... +> TODO: once we have an answer for how to update the training command +> (`spacy migrate`?), add details here Training config files include all **settings and hyperparameters** for training your model. Instead of providing lots of arguments on the command line, you only -need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). +need to pass your `config.cfg` file to [`spacy train`](/api/cli#train). Under +the hood, the training config uses the +[configuration system](https://thinc.ai/docs/usage-config) provided by our +machine learning library [Thinc](https://thinc.ai). This also makes it easy to +integrate custom models and architectures, written in your framework of choice. +Some of the main advantages and features of spaCy's training config are: -To read more about how the config system works under the hood, check out the -[Thinc documentation](https://thinc.ai/docs/usage-config). 
- -- **Structured sections.** +- **Structured sections.** The config is grouped into sections, and nested + sections are defined using the `.` notation. For example, `[nlp.pipeline.ner]` + defines the settings for the pipeline's named entity recognizer. The config + can be loaded as a Python dict. - **References to registered functions.** Sections can refer to registered functions like [model architectures](/api/architectures), [optimizers](https://thinc.ai/docs/api-optimizers) or [schedules](https://thinc.ai/docs/api-schedules) and define arguments that are passed into them. You can also register your own functions to define - [custom architectures](#custom-models), reference them in your config, + [custom architectures](#custom-models), reference them in your config and + tweak their parameters. - **Interpolation.** If you have hyperparameters used by multiple components, define them once and reference them as variables. - - +- **Reproducibility with no hidden defaults.** The config file is the "single + source of truth" and includes all settings. +- **Automated checks and validation.** When you load a config, spaCy checks if + the settings are complete and if all values have the correct types. This lets + you catch potential mistakes early. In your custom architectures, you can use + Python [type hints](https://docs.python.org/3/library/typing.html) to tell the + config which types of data to expect. @@ -181,6 +193,19 @@ pretrained_vectors = null dropout = null ``` + + + + +For a full overview of spaCy's config format and settings, see the +[training format documentation](/api/data-formats#config). The settings +available for the different architectures are documented with the +[model architectures API](/api/architectures). See the Thinc documentation for +[optimizers](https://thinc.ai/docs/api-optimizers) and +[schedules](https://thinc.ai/docs/api-schedules). 
+ + + ### Model architectures {#model-architectures} diff --git a/website/src/components/table.js b/website/src/components/table.js index 85b8e2144..ee0f5b1b1 100644 --- a/website/src/components/table.js +++ b/website/src/components/table.js @@ -26,6 +26,16 @@ function getCellContent(children) { return children } +function isDividerRow(children) { + if (children.length && children[0].props.name == 'td') { + const tdChildren = children[0].props.children + if (!Array.isArray(tdChildren)) { + return tdChildren.props.name === 'em' + } + } + return false +} + function isFootRow(children) { const rowRegex = /^(RETURNS|YIELDS|CREATES|PRINTS)/ if (children.length && children[0].props.name === 'td') { @@ -53,9 +63,11 @@ export const Th = props => export const Tr = ({ evenodd = true, children, ...props }) => { const foot = isFootRow(children) + const isDivider = isDividerRow(children) const trClasssNames = classNames({ [classes.tr]: evenodd, [classes.footer]: foot, + [classes.divider]: isDivider, 'table-footer': foot, }) diff --git a/website/src/styles/table.module.sass b/website/src/styles/table.module.sass index 68cc4bace..7a82a26fe 100644 --- a/website/src/styles/table.module.sass +++ b/website/src/styles/table.module.sass @@ -49,6 +49,36 @@ border-bottom: 2px solid var(--color-theme) vertical-align: bottom +.divider + height: 0 + border-bottom: 1px solid var(--color-subtle) + + td + top: -1px + height: 0 + position: relative + padding: 0 !important + + & + tr td + padding-top: 12px + + td em + position: absolute + top: -5px + left: 10px + display: inline-block + background: var(--color-theme) + color: var(--color-back) + padding: 0 5px 1px + font-size: 0.85rem + text-transform: uppercase + font-weight: bold + border: 0 + border-radius: 1em + font-style: normal + white-space: nowrap + z-index: 5 + // Responsive table // Shadows adapted from "CSS only Responsive Tables" by David Bushell // http://codepen.io/dbushell/pen/wGaamR