From 1e0d54edd1fb3bbe9c16c741a63dbc00a41d84e6 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sat, 4 Jul 2020 14:23:10 +0200 Subject: [PATCH] Update docs --- website/docs/api/cli.md | 126 ++++++++-------- website/docs/api/corpus.md | 37 +++++ website/docs/api/doc.md | 32 ++--- website/docs/api/goldcorpus.md | 24 ---- website/docs/api/sentencerecognizer.md | 29 ++++ website/docs/api/sentencizer.md | 35 ++--- website/docs/api/span.md | 10 +- website/docs/api/token.md | 8 +- website/docs/api/top-level.md | 167 +++++++++++++++++++++- website/docs/usage/101/_architecture.md | 2 + website/docs/usage/models.md | 4 +- website/docs/usage/saving-loading.md | 65 +-------- website/docs/usage/training.md | 9 ++ website/meta/sidebars.json | 8 +- website/src/components/code.js | 3 +- website/src/widgets/quickstart-install.js | 2 +- 16 files changed, 354 insertions(+), 207 deletions(-) create mode 100644 website/docs/api/corpus.md delete mode 100644 website/docs/api/goldcorpus.md create mode 100644 website/docs/api/sentencerecognizer.md diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index c90d7c69c..8dccad165 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -13,6 +13,7 @@ menu: - ['Init Model', 'init-model'] - ['Evaluate', 'evaluate'] - ['Package', 'package'] + - ['Project', 'project'] --- For a list of available commands, type `spacy --help`. @@ -95,26 +96,29 @@ $ python -m spacy validate ## Convert {#convert} -Convert files into spaCy's [JSON format](/api/annotation#json-input) for use -with the `train` command and other experiment management functions. The -converter can be specified on the command line, or chosen based on the file -extension of the input file. +Convert files into spaCy's +[binary training data format](/usage/training#data-format), a serialized +[`DocBin`](/api/docbin), for use with the `train` command and other experiment +management functions. 
The converter can be specified on the command line, or +chosen based on the file extension of the input file. ```bash -$ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] -[--n-sents] [--morphology] [--lang] +$ python -m spacy convert [input_file] [output_dir] [--converter] +[--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] +[--merge-subtokens] [--ner-map] [--lang] ``` | Argument | Type | Description | | ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ | | `input_file` | positional | Input file. | | `output_dir` | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`. | -| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | | `--converter`, `-c` 2 | option | Name of converter to use (see below). | +| `--file-type`, `-t` 2.1 | option | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | | `--n-sents`, `-n` | option | Number of sentences per document. | | `--seg-sents`, `-s` 2.2 | flag | Segment sentences (for `-c ner`) | | `--model`, `-b` 2.2 | option | Model for parser-based sentence segmentation (for `-s`) | | `--morphology`, `-m` | option | Enable appending morphology to tags. | +| `--ner-map`, `-nm` | option | NER tag mapping (as JSON-encoded dict of entity types). | | `--lang`, `-l` 2.1 | option | Language code (if tokenizer required). | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | binary | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train). | @@ -136,20 +140,21 @@ stats, and find problems like invalid entity annotations, cyclic dependencies, low data labels and more. 
```bash -$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] +$ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] +[--pipeline] [--tag-map-path] [--ignore-warnings] [--verbose] [--no-format] ``` -| Argument | Type | Description | -| ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | -| `lang` | positional | Model language. | -| `train_path` | positional | Location of JSON-formatted training data. Can be a file or a directory of files. | -| `dev_path` | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | -| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | -| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | -| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | -| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | -| `--verbose`, `-V` | flag | Print additional information and explanations. | -| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | +| Argument | Type | Description | +| ------------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------- | +| `lang` | positional | Model language. | +| `train_path` | positional | Location of [binary training data](/usage/training#data-format). Can be a file or a directory of files. | +| `dev_path` | positional | Location of [binary development data](/usage/training#data-format) for evaluation. Can be a file or a directory of files. 
| +| `--tag-map-path`, `-tm` 2.2.4 | option | Location of JSON-formatted tag map. | +| `--base-model`, `-b` | option | Optional name of base model to update. Can be any loadable spaCy model. | +| `--pipeline`, `-p` | option | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`. | +| `--ignore-warnings`, `-IW` | flag | Ignore warnings, only show stats and errors. | +| `--verbose`, `-V` | flag | Print additional information and explanations. | +| `--no-format`, `-NF` | flag | Don't pretty-print the results. Use this if you want to write to a file. | @@ -292,6 +297,8 @@ will not be available. ## Train {#train} + + Train a model. Expects data in spaCy's [JSON format](/api/annotation#json-input). On each epoch, a model will be saved out to the directory. Accuracy scores and model details will be added to a @@ -345,47 +352,10 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | model, pickle | A spaCy model on each epoch. | -### Environment variables for hyperparameters {#train-hyperparams new="2"} - -spaCy lets you set hyperparameters for training via environment variables. For -example: - -```bash -$ token_vector_width=256 learn_rate=0.0001 spacy train [...] -``` - -> #### Usage with alias -> -> Environment variables keep the command simple and allow you to to -> [create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537) -> for your custom `train` command while still being able to easily tweak the -> hyperparameters. -> -> ```bash -> alias train-parser="python -m spacy train en /output /data /train /dev -n 1000" -> token_vector_width=256 train-parser -> ``` - -| Name | Description | Default | -| -------------------- | --------------------------------------------------- | ------- | -| `dropout_from` | Initial dropout rate. | `0.2` | -| `dropout_to` | Final dropout rate. 
| `0.2` | -| `dropout_decay` | Rate of dropout change. | `0.0` | -| `batch_from` | Initial batch size. | `1` | -| `batch_to` | Final batch size. | `64` | -| `batch_compound` | Rate of batch size acceleration. | `1.001` | -| `token_vector_width` | Width of embedding tables and convolutional layers. | `128` | -| `embed_size` | Number of rows in embedding tables. | `7500` | -| `hidden_width` | Size of the parser's and NER's hidden layers. | `128` | -| `learn_rate` | Learning rate. | `0.001` | -| `optimizer_B1` | Momentum for the Adam solver. | `0.9` | -| `optimizer_B2` | Adagrad-momentum for the Adam solver. | `0.999` | -| `optimizer_eps` | Epsilon value for the Adam solver. | `1e-08` | -| `L2_penalty` | L2 regularization penalty. | `1e-06` | -| `grad_norm_clip` | Gradient L2 norm constraint. | `1.0` | - ## Pretrain {#pretrain new="2.1" tag="experimental"} + + Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using an approximate language-modeling objective. Specifically, we load pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which @@ -491,6 +461,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] ## Evaluate {#evaluate new="2"} + + Evaluate a model's accuracy and speed on JSON-formatted annotated data. Will print the results and optionally export [displaCy visualizations](/usage/visualizers) of a sample set of parses to @@ -516,12 +488,20 @@ $ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-lim ## Package {#package} -Generate a [model Python package](/usage/training#models-generating) from an -existing model data directory. All data files are copied over. If the path to a -`meta.json` is supplied, or a `meta.json` is found in the input directory, this -file is used. Otherwise, the data can be entered directly from the command line. 
-After packaging, you can run `python setup.py sdist` from the newly created -directory to turn your model into an installable archive file. +Generate an installable +[model Python package](/usage/training#models-generating) from an existing model +data directory. All data files are copied over. If the path to a `meta.json` is +supplied, or a `meta.json` is found in the input directory, this file is used. +Otherwise, the data can be entered directly from the command line. spaCy will +then create a `.tar.gz` archive file that you can distribute and install with +`pip install`. + + + +The `spacy package` command now also builds the `.tar.gz` archive automatically, +so you don't have to run `python setup.py sdist` separately anymore. + + ```bash $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] @@ -531,7 +511,6 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] ### Example python -m spacy package /input /output cd /output/en_model-0.0.0 -python setup.py sdist pip install dist/en_model-0.0.0.tar.gz ``` @@ -541,6 +520,23 @@ pip install dist/en_model-0.0.0.tar.gz | `output_dir` | positional | Directory to create package folder in. | | `--meta-path`, `-m` 2 | option | Path to `meta.json` file (optional). | | `--create-meta`, `-c` 2 | flag | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. | +| `--version`, `-v` 3 | option | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template. | | `--force`, `-f` | flag | Force overwriting of existing folder in output directory. | | `--help`, `-h` | flag | Show help message and available arguments. | | **CREATES** | directory | A Python package containing the spaCy model. 
| + +## Project {#project} + + + +### project clone {#project-clone} + +### project assets {#project-assets} + +### project run-all {#project-run-all} + +### project run {#project-run} + +### project init {#project-init} + +### project update-dvc {#project-update-dvc} diff --git a/website/docs/api/corpus.md b/website/docs/api/corpus.md new file mode 100644 index 000000000..3256849c3 --- /dev/null +++ b/website/docs/api/corpus.md @@ -0,0 +1,37 @@ +--- +title: Corpus +teaser: An annotated corpus +tag: class +source: spacy/gold/corpus.py +new: 3 +--- + +This class manages annotated corpora and can read training and development +datasets in the [DocBin](/api/docbin) (`.spacy`) format. + +## Corpus.\_\_init\_\_ {#init tag="method"} + +Create a `Corpus`. The input data can be a file or a directory of files. + +| Name | Type | Description | +| ----------- | ------------ | ---------------------------------------------------------------- | +| `train` | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files). | +| `dev` | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | +| `limit` | int | Maximum number of examples returned. | +| **RETURNS** | `Corpus` | The newly constructed object. 
| + + + +## Corpus.walk_corpus {#walk_corpus tag="staticmethod"} + +## Corpus.make_examples {#make_examples tag="method"} + +## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"} + +## Corpus.read_docbin {#read_docbin tag="method"} + +## Corpus.count_train {#count_train tag="method"} + +## Corpus.train_dataset {#train_dataset tag="method"} + +## Corpus.dev_dataset {#dev_dataset tag="method"} diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md index 3b31b2c80..b5871f2ab 100644 --- a/website/docs/api/doc.md +++ b/website/docs/api/doc.md @@ -123,7 +123,7 @@ details, see the documentation on | Name | Type | Description | | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -140,8 +140,8 @@ Look up a previously registered extension by name. Returns a 4-tuple > > ```python > from spacy.tokens import Doc -> Doc.set_extension('has_city', default=False) -> extension = Doc.get_extension('has_city') +> Doc.set_extension("has_city", default=False) +> extension = Doc.get_extension("has_city") > assert extension == (False, None, None, None) > ``` @@ -158,8 +158,8 @@ Check whether an extension has been registered on the `Doc` class. 
> > ```python > from spacy.tokens import Doc -> Doc.set_extension('has_city', default=False) -> assert Doc.has_extension('has_city') +> Doc.set_extension("has_city", default=False) +> assert Doc.has_extension("has_city") > ``` | Name | Type | Description | @@ -175,9 +175,9 @@ Remove a previously registered extension. > > ```python > from spacy.tokens import Doc -> Doc.set_extension('has_city', default=False) -> removed = Doc.remove_extension('has_city') -> assert not Doc.has_extension('has_city') +> Doc.set_extension("has_city", default=False) +> removed = Doc.remove_extension("has_city") +> assert not Doc.has_extension("has_city") > ``` | Name | Type | Description | @@ -204,7 +204,7 @@ the character indices don't map to a valid span. | `end` | int | The index of the last character after the span. | | `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | | `kb_id` 2.2 | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object or `None`. | ## Doc.similarity {#similarity tag="method" model="vectors"} @@ -264,7 +264,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | Name | Type | Description | | ----------- | -------------------------------------- | ----------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | +| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. | ## Doc.to_json {#to_json tag="method" new="2.1"} @@ -303,7 +303,7 @@ Export given token attributes to a numpy `ndarray`. 
If `attr_ids` is a sequence of `M` attributes, the output array will be of shape `(N, M)`, where `N` is the length of the `Doc` (in tokens). If `attr_ids` is a single attribute, the output shape will be `(N,)`. You can specify attributes by integer ID (e.g. -`spacy.attrs.LEMMA`) or string name (e.g. 'LEMMA' or 'lemma'). The values will +`spacy.attrs.LEMMA`) or string name (e.g. "LEMMA" or "lemma"). The values will be 64-bit integers. Returns a 2D array with one row per token and one column per attribute (when @@ -323,7 +323,7 @@ Returns a 2D array with one row per token and one column per attribute (when | Name | Type | Description | | ----------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | | `attr_ids` | list or int or string | A list of attributes (int IDs or string names) or a single attribute (int ID or string name) | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype='uint64']` or `numpy.ndarray[ndim=1, dtype='uint64']` | The exported attributes as a numpy array. | +| **RETURNS** | `numpy.ndarray[ndim=2, dtype="uint64"]` or `numpy.ndarray[ndim=1, dtype="uint64"]` | The exported attributes as a numpy array. | ## Doc.from_array {#from_array tag="method"} @@ -345,14 +345,14 @@ array of attributes. | Name | Type | Description | | ----------- | -------------------------------------- | ------------------------------------------------------------------------- | | `attrs` | list | A list of attribute ID ints. | -| `array` | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load. | +| `array` | `numpy.ndarray[ndim=2, dtype="int32"]` | The attribute values to load. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | **RETURNS** | `Doc` | Itself. | - ## Doc.from_docs {#from_docs tag="staticmethod"} -Concatenate multiple `Doc` objects to form a new one. 
Raises an error if the `Doc` objects do not all share the same `Vocab`. +Concatenate multiple `Doc` objects to form a new one. Raises an error if the +`Doc` objects do not all share the same `Vocab`. > #### Example > @@ -634,7 +634,7 @@ vectors. | Name | Type | Description | | ----------- | ---------------------------------------- | ------------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the document's semantics. | +| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the document's semantics. | ## Doc.vector_norm {#vector_norm tag="property" model="vectors"} diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md deleted file mode 100644 index 7767b28bd..000000000 --- a/website/docs/api/goldcorpus.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: GoldCorpus -teaser: An annotated corpus, using the JSON file format -tag: class -source: spacy/gold.pyx -new: 2 ---- - -This class manages annotations for tagging, dependency parsing and NER. - -## GoldCorpus.\_\_init\_\_ {#init tag="method"} - -Create a `GoldCorpus`. IF the input data is an iterable, each item should be a -`(text, paragraphs)` tuple, where each paragraph is a tuple -`(sentences, brackets)`, and each sentence is a tuple -`(ids, words, tags, heads, ner)`. See the implementation of -[`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) -for further details. - -| Name | Type | Description | -| ----------- | ----------------------- | ------------------------------------------------------------ | -| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. | -| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. | -| **RETURNS** | `GoldCorpus` | The newly constructed object. 
| diff --git a/website/docs/api/sentencerecognizer.md b/website/docs/api/sentencerecognizer.md new file mode 100644 index 000000000..367b79e5d --- /dev/null +++ b/website/docs/api/sentencerecognizer.md @@ -0,0 +1,29 @@ +--- +title: SentenceRecognizer +tag: class +source: spacy/pipeline/pipes.pyx +new: 3 +--- + +A trainable pipeline component for sentence segmentation. For a simpler, +rule-based strategy, see the [`Sentencizer`](/api/sentencizer). This class is a +subclass of `Pipe` and follows the same API. The component is also available via +the string name `"senter"`. After initialization, it is typically added to the +processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). + +## SentenceRecognizer.\_\_init\_\_ {#init tag="method"} + +Initialize the sentence recognizer. + +> #### Example +> +> ```python +> # Construction via create_pipe +> senter = nlp.create_pipe("senter") +> +> # Construction from class +> from spacy.pipeline import SentenceRecognizer +> senter = SentenceRecognizer() +> ``` + + diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md index 14482c353..9c6e2d58c 100644 --- a/website/docs/api/sentencizer.md +++ b/website/docs/api/sentencizer.md @@ -12,19 +12,6 @@ require a statistical model to be loaded. The component is also available via the string name `"sentencizer"`. After initialization, it is typically added to the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). - - -Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component -doesn't add a hook to `doc.user_hooks["sents"]`. Instead, it iterates over the -tokens in the `Doc` and sets the `Token.is_sent_start` property. The -`SentenceSegmenter` is still available if you import it directly: - -```python -from spacy.pipeline import SentenceSegmenter -``` - - - ## Sentencizer.\_\_init\_\_ {#init tag="method"} Initialize the sentencizer. @@ -40,10 +27,24 @@ Initialize the sentencizer. 
> sentencizer = Sentencizer() > ``` -| Name | Type | Description | -| ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | -| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. | -| **RETURNS** | `Sentencizer` | The newly constructed object. | +| Name | Type | Description | +| ------------- | ------------- | ----------------------------------------------------------------------------------------------- | +| `punct_chars` | list | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. | +| **RETURNS** | `Sentencizer` | The newly constructed object. 
| + +```python +### punct_chars defaults +['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', + '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', + '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', + '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', + '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', + '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', + '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', + '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', + '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', + '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] +``` ## Sentencizer.\_\_call\_\_ {#call tag="method"} diff --git a/website/docs/api/span.md b/website/docs/api/span.md index 4d10c08d9..668013e76 100644 --- a/website/docs/api/span.md +++ b/website/docs/api/span.md @@ -25,7 +25,7 @@ Create a Span object from the slice `doc[start : end]`. | `end` | int | The index of the first token after the span. | | `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | | `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object. | ## Span.\_\_getitem\_\_ {#getitem tag="method"} @@ -110,7 +110,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. 
| +| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -191,7 +191,7 @@ the character indices don't map to a valid span. | `end` | int | The index of the last character after the span. | | `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | | `kb_id` | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | -| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span. | +| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | **RETURNS** | `Span` | The newly constructed object or `None`. | ## Span.similarity {#similarity tag="method" model="vectors"} @@ -232,7 +232,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | Name | Type | Description | | ----------- | -------------------------------------- | ------------------------------------------------ | -| **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Span`. | +| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Span`. | ## Span.to_array {#to_array tag="method" new="2"} @@ -440,7 +440,7 @@ vectors. | Name | Type | Description | | ----------- | ---------------------------------------- | --------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the span's semantics. 
| +| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the span's semantics. | ## Span.vector_norm {#vector_norm tag="property" model="vectors"} diff --git a/website/docs/api/token.md b/website/docs/api/token.md index 2d25d9db2..549189cad 100644 --- a/website/docs/api/token.md +++ b/website/docs/api/token.md @@ -58,7 +58,7 @@ For details, see the documentation on | Name | Type | Description | | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | -| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`. | +| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. | | `default` | - | Optional default value of the attribute if no getter or method is defined. | | `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | @@ -370,7 +370,7 @@ A real-valued meaning representation. | Name | Type | Description | | ----------- | ---------------------------------------- | ---------------------------------------------------- | -| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the token's semantics. | +| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the token's semantics. | ## Token.vector_norm {#vector_norm tag="property" model="vectors"} @@ -435,8 +435,8 @@ The L2 norm of the token's vector representation. | `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | | `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. 
| | `is_punct` | bool | Is the token punctuation? | -| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `'('` ? | -| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `')'` ? | +| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | +| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | | `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | | `is_bracket` | bool | Is the token a bracket? | | `is_quote` | bool | Is the token a quotation mark? | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 01bc712a8..fe0952c9f 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -3,6 +3,7 @@ title: Top-level Functions menu: - ['spacy', 'spacy'] - ['displacy', 'displacy'] + - ['Data & Alignment', 'gold'] - ['Utility Functions', 'util'] --- @@ -76,8 +77,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your > > ```python > spacy.info() -> spacy.info("en") -> spacy.info("de", markdown=True) +> spacy.info("en_core_web_sm") +> spacy.info(markdown=True) > ``` | Name | Type | Description | @@ -258,6 +259,156 @@ colors for them. Your application or model package can also expose a [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) to add custom labels and their colors automatically. +## Training data and alignment {#gold source="spacy/gold"} + +### gold.docs_to_json {#docs_to_json tag="function"} + +Convert a list of Doc objects into the +[JSON-serializable format](/api/annotation#json-input) used by the +[`spacy train`](/api/cli#train) command. Each input doc will be treated as a +'paragraph' in the output doc. 
+ +> #### Example +> +> ```python +> from spacy.gold import docs_to_json +> +> doc = nlp("I like London") +> json_data = docs_to_json([doc]) +> ``` + +| Name | Type | Description | +| ----------- | ---------------- | ------------------------------------------ | +| `docs` | iterable / `Doc` | The `Doc` object(s) to convert. | +| `id` | int | ID to assign to the JSON. Defaults to `0`. | +| **RETURNS** | dict | The data in spaCy's JSON format. | + +### gold.align {#align tag="function"} + +Calculate alignment tables between two tokenizations, using the Levenshtein +algorithm. The alignment is case-insensitive. + + + +The current implementation of the alignment algorithm assumes that both +tokenizations add up to the same string. For example, you'll be able to align +`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not +`["I", "'m"]` and `["I", "am"]`. + + + +> #### Example +> +> ```python +> from spacy.gold import align +> +> bert_tokens = ["obama", "'", "s", "podcast"] +> spacy_tokens = ["obama", "'s", "podcast"] +> alignment = align(bert_tokens, spacy_tokens) +> cost, a2b, b2a, a2b_multi, b2a_multi = alignment +> ``` + +| Name | Type | Description | +| ----------- | ----- | -------------------------------------------------------------------------- | +| `tokens_a` | list | String values of candidate tokens to align. | +| `tokens_b` | list | String values of reference tokens to align. | +| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | + +The returned tuple contains the following alignment information: + +> #### Example +> +> ```python +> a2b = array([0, -1, -1, 2]) +> b2a = array([0, 2, 3]) +> a2b_multi = {1: 1, 2: 1} +> b2a_multi = {} +> ``` +> +> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If +> there's no one-to-one alignment for a token, it has the value `-1`. 
+ +| Name | Type | Description | +| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `cost` | int | The number of misaligned tokens. | +| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. | +| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. | +| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | +| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. | + +### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} + +Encode labelled spans into per-token tags, using the +[BILUO scheme](/api/annotation#biluo) (Begin, In, Last, Unit, Out). Returns a +list of strings, describing the tags. Each tag string will be of the form of +either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, +`"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with +the tokenization in the `Doc` object. The training algorithm will view these as +missing values. `O` denotes a non-entity token. `B` denotes the beginning of a +multi-token entity, `I` the inside of an entity of three or more tokens, and `L` +the end of an entity of two or more tokens. `U` denotes a single-token entity. 
+ +> #### Example +> +> ```python +> from spacy.gold import biluo_tags_from_offsets +> +> doc = nlp("I like London.") +> entities = [(7, 13, "LOC")] +> tags = biluo_tags_from_offsets(doc, entities) +> assert tags == ["O", "O", "U-LOC", "O"] +> ``` + +| Name | Type | Description | +| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | +| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | +| **RETURNS** | list | A list of strings, describing the [BILUO](/api/annotation#biluo) tags. | + +### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} + +Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into +entity offsets. + +> #### Example +> +> ```python +> from spacy.gold import offsets_from_biluo_tags +> +> doc = nlp("I like London.") +> tags = ["O", "O", "U-LOC", "O"] +> entities = offsets_from_biluo_tags(doc, tags) +> assert entities == [(7, 13, "LOC")] +> ``` + +| Name | Type | Description | +| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the BILUO tags refer to. | +| `tags` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | +| **RETURNS** | list | A sequence of `(start, end, label)` triples. 
`start` and `end` will be character-offset integers denoting the slice into the original string. | + +### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} + +Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into +[`Span`](/api/span) objects. This can be used to create entity spans from +token-based tags, e.g. to overwrite the `doc.ents`. + +> #### Example +> +> ```python +> from spacy.gold import spans_from_biluo_tags +> +> doc = nlp("I like London.") +> tags = ["O", "O", "U-LOC", "O"] +> doc.ents = spans_from_biluo_tags(doc, tags) +> ``` + +| Name | Type | Description | +| ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `doc` | `Doc` | The document that the BILUO tags refer to. | +| `tags` | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | +| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | + ## Utility functions {#util source="spacy/util.py"} spaCy comes with a small collection of utility functions located in @@ -341,7 +492,7 @@ class. The model data will then be loaded in via > #### Example > > ```python -> nlp = util.load_model("en") +> nlp = util.load_model("en_core_web_sm") > nlp = util.load_model("en_core_web_sm", disable=["ner"]) > nlp = util.load_model("/path/to/data") > ``` @@ -634,3 +785,13 @@ of one entity) or when merging spans with | ----------- | -------- | -------------------- | | `spans` | iterable | The spans to filter. | | **RETURNS** | list | The filtered spans. 
| + +## util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} + + + +| Name | Type | Description | +| ----------- | ----- | ----------- | +| `words` | list | | +| `text` | str | | +| **RETURNS** | tuple | | diff --git a/website/docs/usage/101/_architecture.md b/website/docs/usage/101/_architecture.md index 7cd749521..4363b9b4f 100644 --- a/website/docs/usage/101/_architecture.md +++ b/website/docs/usage/101/_architecture.md @@ -12,6 +12,8 @@ place** by the components of the pipeline. The `Language` object coordinates these components. It takes raw text and sends it through the pipeline, returning an **annotated document**. It also orchestrates training and serialization. + + ![Library architecture](../../images/architecture.svg) ### Container objects {#architecture-containers} diff --git a/website/docs/usage/models.md b/website/docs/usage/models.md index db8d0ee28..8157e2c07 100644 --- a/website/docs/usage/models.md +++ b/website/docs/usage/models.md @@ -392,9 +392,7 @@ loading models, the underlying functionality is entirely based on native Python packages. This allows your application to handle a model like any other package dependency. -For an example of an automated model training and build process, see -[this overview](/usage/training#example-training-spacy) of how we're training -and packaging our models for spaCy. + ### Downloading and requiring model dependencies {#models-download} diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md index e9ba0de6a..ac6b275d8 100644 --- a/website/docs/usage/saving-loading.md +++ b/website/docs/usage/saving-loading.md @@ -711,67 +711,4 @@ class and call [`from_disk`](/api/language#from_disk) instead. nlp = spacy.blank("en").from_disk("/path/to/data") ``` - - -In spaCy 1.x, the distinction between `spacy.load()` and the `Language` class -constructor was quite unclear. You could call `spacy.load()` when no model was -present, and it would silently return an empty object. 
Likewise, you could pass -a path to `English`, even if the mode required a different language. spaCy v2.0 -solves this with a clear distinction between setting up the instance and loading -the data. - -```diff -- nlp = spacy.load("en_core_web_sm", path="/path/to/data") -+ nlp = spacy.blank("en_core_web_sm").from_disk("/path/to/data") -``` - - - -### How we're training and packaging models for spaCy {#example-training-spacy} - -Publishing a new version of spaCy often means re-training all available models, -which is [quite a lot](/usage/models#languages). To make this run smoothly, -we're using an automated build process and a [`spacy train`](/api/cli#train) -template that looks like this: - -```bash -$ python -m spacy train {lang} {models_dir}/{name} {train_data} {dev_data} -m meta/{name}.json -V {version} -g {gpu_id} -n {n_epoch} -ns {n_sents} -``` - -> #### meta.json template -> -> ```json -> { -> "lang": "en", -> "name": "core_web_sm", -> "license": "CC BY-SA 3.0", -> "author": "Explosion AI", -> "url": "https://explosion.ai", -> "email": "contact@explosion.ai", -> "sources": ["OntoNotes 5", "Common Crawl"], -> "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on common crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities." -> } -> ``` - -In a directory `meta`, we keep `meta.json` templates for the individual models, -containing all relevant information that doesn't change across versions, like -the name, description, author info and training data sources. When we train the -model, we pass in the file to the meta template as the `--meta` argument, and -specify the current model version as the `--version` argument. - -On each epoch, the model is saved out with a `meta.json` using our template and -added properties, like the `pipeline`, `accuracy` scores and the `spacy_version` -used to train the model. 
After training completion, the best model is selected -automatically and packaged using the [`package`](/api/cli#package) command. -Since a full meta file is already present on the trained model, no further setup -is required to build a valid model package. - -```bash -python -m spacy package -f {best_model} dist/ -cd dist/{model_name} -python setup.py sdist -``` - -This process allows us to quickly trigger the model training and build process -for all available models and languages, and generate the correct meta data -automatically. + diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 7ca309ea0..6fa0b3d8e 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -6,6 +6,7 @@ menu: - ['CLI & Config', 'cli-config'] - ['Custom Models', 'custom-models'] - ['Transfer Learning', 'transfer-learning'] + - ['Parallel Training', 'parallel-training'] - ['Internal API', 'api'] --- @@ -43,6 +44,10 @@ The recommended way to train your spaCy models is via the +### Training data format {#data-format} + + + > #### Tip: Debug your data > > The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate @@ -167,6 +172,10 @@ dropout = null +## Parallel Training with Ray {#parallel-training} + + + ## Internal training API {#api} diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json index 165c02a29..9a0d0fb05 100644 --- a/website/meta/sidebars.json +++ b/website/meta/sidebars.json @@ -68,7 +68,8 @@ { "text": "Token", "url": "/api/token" }, { "text": "Span", "url": "/api/span" }, { "text": "Lexeme", "url": "/api/lexeme" }, - { "text": "Example", "url": "/api/example" } + { "text": "Example", "url": "/api/example" }, + { "text": "DocBin", "url": "/api/docbin" } ] }, { @@ -86,6 +87,7 @@ { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, { "text": "EntityRuler", "url": "/api/entityruler" }, { "text": "Sentencizer", "url": "/api/sentencizer" }, + { "text": "SentenceRecognizer", "url": 
"/api/sentencerecognizer" }, { "text": "Other Functions", "url": "/api/pipeline-functions" } ] }, @@ -97,10 +99,8 @@ { "text": "Vectors", "url": "/api/vectors" }, { "text": "Lookups", "url": "/api/lookups" }, { "text": "KnowledgeBase", "url": "/api/kb" }, - { "text": "GoldParse", "url": "/api/goldparse" }, - { "text": "GoldCorpus", "url": "/api/goldcorpus" }, { "text": "Scorer", "url": "/api/scorer" }, - { "text": "DocBin", "url": "/api/docbin" } + { "text": "Corpus", "url": "/api/corpus" } ] }, { diff --git a/website/src/components/code.js b/website/src/components/code.js index 5184da833..2c1ad32d8 100644 --- a/website/src/components/code.js +++ b/website/src/components/code.js @@ -83,12 +83,13 @@ export class Code extends React.Component { executable, github, prompt, + wrap, highlight, className, children, } = this.props const codeClassNames = classNames(classes.code, className, `language-${lang}`, { - [classes.wrap]: !!highlight, + [classes.wrap]: !!highlight || !!wrap, }) const ghClassNames = classNames(codeClassNames, classes.maxHeight) const { Juniper } = this.state diff --git a/website/src/widgets/quickstart-install.js b/website/src/widgets/quickstart-install.js index dd4e10f01..237567eb8 100644 --- a/website/src/widgets/quickstart-install.js +++ b/website/src/widgets/quickstart-install.js @@ -83,7 +83,7 @@ const QuickstartInstall = ({ id, title }) => ( export PYTHONPATH=`pwd` - set PYTHONPATH=/path/to/spaCy + set PYTHONPATH=C:\path\to\spaCy pip install -r requirements.txt