mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 21:51:24 +03:00 
			
		
		
		
	Update docs
This commit is contained in:
		
							parent
							
								
									fe224dc2dd
								
							
						
					
					
						commit
						1e0d54edd1
					
				|  | @ -13,6 +13,7 @@ menu: | |||
|   - ['Init Model', 'init-model'] | ||||
|   - ['Evaluate', 'evaluate'] | ||||
|   - ['Package', 'package'] | ||||
|   - ['Project', 'project'] | ||||
| --- | ||||
| 
 | ||||
| For a list of available commands, type `spacy --help`. | ||||
|  | @ -95,26 +96,29 @@ $ python -m spacy validate | |||
| 
 | ||||
| ## Convert {#convert} | ||||
| 
 | ||||
| Convert files into spaCy's [JSON format](/api/annotation#json-input) for use | ||||
| with the `train` command and other experiment management functions. The | ||||
| converter can be specified on the command line, or chosen based on the file | ||||
| extension of the input file. | ||||
| Convert files into spaCy's | ||||
| [binary training data format](/usage/training#data-format), a serialized | ||||
| [`DocBin`](/api/docbin), for use with the `train` command and other experiment | ||||
| management functions. The converter can be specified on the command line, or | ||||
| chosen based on the file extension of the input file. | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy convert [input_file] [output_dir] [--file-type] [--converter] | ||||
| [--n-sents] [--morphology] [--lang] | ||||
| $ python -m spacy convert [input_file] [output_dir] [--converter] | ||||
| [--file-type] [--n-sents] [--seg-sents] [--model] [--morphology] | ||||
| [--merge-subtokens] [--ner-map] [--lang] | ||||
| ``` | ||||
| 
 | ||||
| | Argument                                         | Type       | Description                                                                                                              | | ||||
| | ------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `input_file`                                     | positional | Input file.                                                                                                              | | ||||
| | `output_dir`                                     | positional | Output directory for converted file. Defaults to `"-"`, meaning data will be written to `stdout`.                        | | ||||
| | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | | ||||
| | `--converter`, `-c` <Tag variant="new">2</Tag>   | option     | Name of converter to use (see below).                                                                                    | | ||||
| | `--file-type`, `-t` <Tag variant="new">2.1</Tag> | option     | Type of file to create. Either `spacy` (default) for binary [`DocBin`](/api/docbin) data or `json` for v2.x JSON format. | | ||||
| | `--n-sents`, `-n`                                | option     | Number of sentences per document.                                                                                        | | ||||
| | `--seg-sents`, `-s` <Tag variant="new">2.2</Tag> | flag       | Segment sentences (for `-c ner`)                                                                                         | | ||||
| | `--model`, `-b` <Tag variant="new">2.2</Tag>     | option     | Model for parser-based sentence segmentation (for `-s`)                                                                  | | ||||
| | `--morphology`, `-m`                             | option     | Enable appending morphology to tags.                                                                                     | | ||||
| | `--ner-map`, `-nm`                               | option     | NER tag mapping (as JSON-encoded dict of entity types).                                                                  | | ||||
| | `--lang`, `-l` <Tag variant="new">2.1</Tag>      | option     | Language code (if tokenizer required).                                                                                   | | ||||
| | `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                                               | | ||||
| | **CREATES**                                      | binary     | Binary [`DocBin`](/api/docbin) training data that can be used with [`spacy train`](/api/cli#train).                      | | ||||
|  | @ -136,20 +140,21 @@ stats, and find problems like invalid entity annotations, cyclic dependencies, | |||
| low data labels and more. | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] [--pipeline] [--ignore-warnings] [--verbose] [--no-format] | ||||
| $ python -m spacy debug-data [lang] [train_path] [dev_path] [--base-model] | ||||
| [--pipeline] [--tag-map-path] [--ignore-warnings] [--verbose] [--no-format] | ||||
| ``` | ||||
| 
 | ||||
| | Argument                                               | Type       | Description                                                                                        | | ||||
| | ------------------------------------------------------ | ---------- | -------------------------------------------------------------------------------------------------- | | ||||
| | `lang`                                                 | positional | Model language.                                                                                    | | ||||
| | `train_path`                                           | positional | Location of JSON-formatted training data. Can be a file or a directory of files.                   | | ||||
| | `dev_path`                                             | positional | Location of JSON-formatted development data for evaluation. Can be a file or a directory of files. | | ||||
| | `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option     | Location of JSON-formatted tag map.                                                                | | ||||
| | `--base-model`, `-b`                                   | option     | Optional name of base model to update. Can be any loadable spaCy model.                            | | ||||
| | `--pipeline`, `-p`                                     | option     | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.          | | ||||
| | `--ignore-warnings`, `-IW`                             | flag       | Ignore warnings, only show stats and errors.                                                       | | ||||
| | `--verbose`, `-V`                                      | flag       | Print additional information and explanations.                                                     | | ||||
| | `--no-format`, `-NF`                                   | flag       | Don't pretty-print the results. Use this if you want to write to a file.                           | | ||||
| | Argument                                               | Type       | Description                                                                                                               | | ||||
| | ------------------------------------------------------ | ---------- | ------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `lang`                                                 | positional | Model language.                                                                                                           | | ||||
| | `train_path`                                           | positional | Location of [binary training data](/usage/training#data-format). Can be a file or a directory of files.                   | | ||||
| | `dev_path`                                             | positional | Location of [binary development data](/usage/training#data-format) for evaluation. Can be a file or a directory of files. | | ||||
| | `--tag-map-path`, `-tm` <Tag variant="new">2.2.4</Tag> | option     | Location of JSON-formatted tag map.                                                                                       | | ||||
| | `--base-model`, `-b`                                   | option     | Optional name of base model to update. Can be any loadable spaCy model.                                                   | | ||||
| | `--pipeline`, `-p`                                     | option     | Comma-separated names of pipeline components to train. Defaults to `'tagger,parser,ner'`.                                 | | ||||
| | `--ignore-warnings`, `-IW`                             | flag       | Ignore warnings, only show stats and errors.                                                                              | | ||||
| | `--verbose`, `-V`                                      | flag       | Print additional information and explanations.                                                                            | | ||||
| | `--no-format`, `-NF`                                   | flag       | Don't pretty-print the results. Use this if you want to write to a file.                                                  | | ||||
| 
 | ||||
| <Accordion title="Example output"> | ||||
| 
 | ||||
|  | @ -292,6 +297,8 @@ will not be available. | |||
| 
 | ||||
| ## Train {#train} | ||||
| 
 | ||||
| <!-- TODO: document new training --> | ||||
| 
 | ||||
| Train a model. Expects data in spaCy's | ||||
| [JSON format](/api/annotation#json-input). On each epoch, a model will be saved | ||||
| out to the directory. Accuracy scores and model details will be added to a | ||||
|  | @ -345,47 +352,10 @@ $ python -m spacy train [lang] [output_path] [train_path] [dev_path] | |||
| | `--help`, `-h`                                                  | flag          | Show help message and available arguments.                                                                                                                        | | ||||
| | **CREATES**                                                     | model, pickle | A spaCy model on each epoch.                                                                                                                                      | | ||||
| 
 | ||||
| ### Environment variables for hyperparameters {#train-hyperparams new="2"} | ||||
| 
 | ||||
| spaCy lets you set hyperparameters for training via environment variables. For | ||||
| example: | ||||
| 
 | ||||
| ```bash | ||||
| $ token_vector_width=256 learn_rate=0.0001 spacy train [...] | ||||
| ``` | ||||
| 
 | ||||
| > #### Usage with alias | ||||
| > | ||||
| > Environment variables keep the command simple and allow you to | ||||
| > [create an alias](https://askubuntu.com/questions/17536/how-do-i-create-a-permanent-bash-alias/17537#17537) | ||||
| > for your custom `train` command while still being able to easily tweak the | ||||
| > hyperparameters. | ||||
| > | ||||
| > ```bash | ||||
| > alias train-parser="python -m spacy train en /output /data /train /dev -n 1000" | ||||
| > token_vector_width=256 train-parser | ||||
| > ``` | ||||
| 
 | ||||
| | Name                 | Description                                         | Default | | ||||
| | -------------------- | --------------------------------------------------- | ------- | | ||||
| | `dropout_from`       | Initial dropout rate.                               | `0.2`   | | ||||
| | `dropout_to`         | Final dropout rate.                                 | `0.2`   | | ||||
| | `dropout_decay`      | Rate of dropout change.                             | `0.0`   | | ||||
| | `batch_from`         | Initial batch size.                                 | `1`     | | ||||
| | `batch_to`           | Final batch size.                                   | `64`    | | ||||
| | `batch_compound`     | Rate of batch size acceleration.                    | `1.001` | | ||||
| | `token_vector_width` | Width of embedding tables and convolutional layers. | `128`   | | ||||
| | `embed_size`         | Number of rows in embedding tables.                 | `7500`  | | ||||
| | `hidden_width`       | Size of the parser's and NER's hidden layers.       | `128`   | | ||||
| | `learn_rate`         | Learning rate.                                      | `0.001` | | ||||
| | `optimizer_B1`       | Momentum for the Adam solver.                       | `0.9`   | | ||||
| | `optimizer_B2`       | Adagrad-momentum for the Adam solver.               | `0.999` | | ||||
| | `optimizer_eps`      | Epsilon value for the Adam solver.                  | `1e-08` | | ||||
| | `L2_penalty`         | L2 regularization penalty.                          | `1e-06` | | ||||
| | `grad_norm_clip`     | Gradient L2 norm constraint.                        | `1.0`   | | ||||
| 
 | ||||
| ## Pretrain {#pretrain new="2.1" tag="experimental"} | ||||
| 
 | ||||
| <!-- TODO: document new pretrain command and link to new pretraining docs --> | ||||
| 
 | ||||
| Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using | ||||
| an approximate language-modeling objective. Specifically, we load pretrained | ||||
| vectors, and train a component like a CNN, BiLSTM, etc. to predict vectors which | ||||
|  | @ -491,6 +461,8 @@ $ python -m spacy init-model [lang] [output_dir] [--jsonl-loc] [--vectors-loc] | |||
| 
 | ||||
| ## Evaluate {#evaluate new="2"} | ||||
| 
 | ||||
| <!-- TODO: document new evaluate command --> | ||||
| 
 | ||||
| Evaluate a model's accuracy and speed on JSON-formatted annotated data. Will | ||||
| print the results and optionally export | ||||
| [displaCy visualizations](/usage/visualizers) of a sample set of parses to | ||||
|  | @ -516,12 +488,20 @@ $ python -m spacy evaluate [model] [data_path] [--displacy-path] [--displacy-lim | |||
| 
 | ||||
| ## Package {#package} | ||||
| 
 | ||||
| Generate a [model Python package](/usage/training#models-generating) from an | ||||
| existing model data directory. All data files are copied over. If the path to a | ||||
| `meta.json` is supplied, or a `meta.json` is found in the input directory, this | ||||
| file is used. Otherwise, the data can be entered directly from the command line. | ||||
| After packaging, you can run `python setup.py sdist` from the newly created | ||||
| directory to turn your model into an installable archive file. | ||||
| Generate an installable | ||||
| [model Python package](/usage/training#models-generating) from an existing model | ||||
| data directory. All data files are copied over. If the path to a `meta.json` is | ||||
| supplied, or a `meta.json` is found in the input directory, this file is used. | ||||
| Otherwise, the data can be entered directly from the command line. spaCy will | ||||
| then create a `.tar.gz` archive file that you can distribute and install with | ||||
| `pip install`. | ||||
| 
 | ||||
| <Infobox title="New in v3.0" variant="warning"> | ||||
| 
 | ||||
| The `spacy package` command now also builds the `.tar.gz` archive automatically, | ||||
| so you don't have to run `python setup.py sdist` separately anymore. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] [--force] | ||||
|  | @ -531,7 +511,6 @@ $ python -m spacy package [input_dir] [output_dir] [--meta-path] [--create-meta] | |||
| ### Example | ||||
| python -m spacy package /input /output | ||||
| cd /output/en_model-0.0.0 | ||||
| python setup.py sdist | ||||
| pip install dist/en_model-0.0.0.tar.gz | ||||
| ``` | ||||
| 
 | ||||
|  | @ -541,6 +520,23 @@ pip install dist/en_model-0.0.0.tar.gz | |||
| | `output_dir`                                     | positional | Directory to create package folder in.                                                                                                                                                          | | ||||
| | `--meta-path`, `-m` <Tag variant="new">2</Tag>   | option     | Path to `meta.json` file (optional).                                                                                                                                                            | | ||||
| | `--create-meta`, `-c` <Tag variant="new">2</Tag> | flag       | Create a `meta.json` file on the command line, even if one already exists in the directory. If an existing file is found, its entries will be shown as the defaults in the command line prompt. | | ||||
| | `--version`, `-v` <Tag variant="new">3</Tag>     | option     | Package version to override in meta. Useful when training new versions, as it doesn't require editing the meta template.                                                                        | | ||||
| | `--force`, `-f`                                  | flag       | Force overwriting of existing folder in output directory.                                                                                                                                       | | ||||
| | `--help`, `-h`                                   | flag       | Show help message and available arguments.                                                                                                                                                      | | ||||
| | **CREATES**                                      | directory  | A Python package containing the spaCy model.                                                                                                                                                    | | ||||
| 
 | ||||
| ## Project {#project} | ||||
| 
 | ||||
| <!-- TODO: document project command and subcommands. We should probably wait and only finalize this once we've finalized the design --> | ||||
| 
 | ||||
| ### project clone {#project-clone} | ||||
| 
 | ||||
| ### project assets {#project-assets} | ||||
| 
 | ||||
| ### project run-all {#project-run-all} | ||||
| 
 | ||||
| ### project run {#project-run} | ||||
| 
 | ||||
| ### project init {#project-init} | ||||
| 
 | ||||
| ### project update-dvc {#project-update-dvc} | ||||
|  |  | |||
							
								
								
									
										37
									
								
								website/docs/api/corpus.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										37
									
								
								website/docs/api/corpus.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,37 @@ | |||
| --- | ||||
| title: Corpus | ||||
| teaser: An annotated corpus | ||||
| tag: class | ||||
| source: spacy/gold/corpus.py | ||||
| new: 3 | ||||
| --- | ||||
| 
 | ||||
| This class manages annotated corpora and can read training and development | ||||
| datasets in the [DocBin](/api/docbin) (`.spacy`) format. | ||||
| 
 | ||||
| ## Corpus.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Create a `Corpus`. The input data can be a file or a directory of files. | ||||
| 
 | ||||
| | Name        | Type         | Description                                                      | | ||||
| | ----------- | ------------ | ---------------------------------------------------------------- | | ||||
| | `train`     | str / `Path` | Training data (`.spacy` file or directory of `.spacy` files).    | | ||||
| | `dev`       | str / `Path` | Development data (`.spacy` file or directory of `.spacy` files). | | ||||
| | `limit`     | int          | Maximum number of examples returned.                             | | ||||
| | **RETURNS** | `Corpus`     | The newly constructed object.                                    | | ||||
| 
 | ||||
| <!-- TODO: document remaining methods / decide which to document --> | ||||
| 
 | ||||
| ## Corpus.walk_corpus {#walk_corpus tag="staticmethod"} | ||||
| 
 | ||||
| ## Corpus.make_examples {#make_examples tag="method"} | ||||
| 
 | ||||
| ## Corpus.make_examples_gold_preproc {#make_examples_gold_preproc tag="method"} | ||||
| 
 | ||||
| ## Corpus.read_docbin {#read_docbin tag="method"} | ||||
| 
 | ||||
| ## Corpus.count_train {#count_train tag="method"} | ||||
| 
 | ||||
| ## Corpus.train_dataset {#train_dataset tag="method"} | ||||
| 
 | ||||
| ## Corpus.dev_dataset {#dev_dataset tag="method"} | ||||
|  | @ -123,7 +123,7 @@ details, see the documentation on | |||
| 
 | ||||
| | Name      | Type     | Description                                                                                                                         | | ||||
| | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`.                       | | ||||
| | `name`    | str      | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`.                       | | ||||
| | `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                          | | ||||
| | `method`  | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`.                                                          | | ||||
| | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.          | | ||||
|  | @ -140,8 +140,8 @@ Look up a previously registered extension by name. Returns a 4-tuple | |||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import Doc | ||||
| > Doc.set_extension('has_city', default=False) | ||||
| > extension = Doc.get_extension('has_city') | ||||
| > Doc.set_extension("has_city", default=False) | ||||
| > extension = Doc.get_extension("has_city") | ||||
| > assert extension == (False, None, None, None) | ||||
| > ``` | ||||
| 
 | ||||
|  | @ -158,8 +158,8 @@ Check whether an extension has been registered on the `Doc` class. | |||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import Doc | ||||
| > Doc.set_extension('has_city', default=False) | ||||
| > assert Doc.has_extension('has_city') | ||||
| > Doc.set_extension("has_city", default=False) | ||||
| > assert Doc.has_extension("has_city") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type | Description                                | | ||||
|  | @ -175,9 +175,9 @@ Remove a previously registered extension. | |||
| > | ||||
| > ```python | ||||
| > from spacy.tokens import Doc | ||||
| > Doc.set_extension('has_city', default=False) | ||||
| > removed = Doc.remove_extension('has_city') | ||||
| > assert not Doc.has_extension('has_city') | ||||
| > Doc.set_extension("has_city", default=False) | ||||
| > removed = Doc.remove_extension("has_city") | ||||
| > assert not Doc.has_extension("has_city") | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                                           | | ||||
|  | @ -204,7 +204,7 @@ the character indices don't map to a valid span. | |||
| | `end`                                | int                                      | The index of the first character after the span.                      | | ||||
| | `label`                              | uint64 / str                             | A label to attach to the span, e.g. for named entities.               | | ||||
| | `kb_id` <Tag variant="new">2.2</Tag> | uint64 / str                             | An ID from a knowledge base to capture the meaning of a named entity. | | ||||
| | `vector`                             | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                 | | ||||
| | `vector`                             | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span.                                 | | ||||
| | **RETURNS**                          | `Span`                                   | The newly constructed object or `None`.                               | | ||||
| 
 | ||||
| ## Doc.similarity {#similarity tag="method" model="vectors"} | ||||
|  | @ -264,7 +264,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | |||
| 
 | ||||
| | Name        | Type                                   | Description                                     | | ||||
| | ----------- | -------------------------------------- | ----------------------------------------------- | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Doc`. | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. | | ||||
| 
 | ||||
| ## Doc.to_json {#to_json tag="method" new="2.1"} | ||||
| 
 | ||||
|  | @ -303,7 +303,7 @@ Export given token attributes to a numpy `ndarray`. If `attr_ids` is a sequence | |||
| of `M` attributes, the output array will be of shape `(N, M)`, where `N` is the | ||||
| length of the `Doc` (in tokens). If `attr_ids` is a single attribute, the output | ||||
| shape will be `(N,)`. You can specify attributes by integer ID (e.g. | ||||
| `spacy.attrs.LEMMA`) or string name (e.g. 'LEMMA' or 'lemma'). The values will | ||||
| `spacy.attrs.LEMMA`) or string name (e.g. "LEMMA" or "lemma"). The values will | ||||
| be 64-bit integers. | ||||
| 
 | ||||
| Returns a 2D array with one row per token and one column per attribute (when | ||||
|  | @ -323,7 +323,7 @@ Returns a 2D array with one row per token and one column per attribute (when | |||
| | Name        | Type                                                                               | Description                                                                                  | | ||||
| | ----------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | | ||||
| | `attr_ids`  | list or int or string                                                              | A list of attributes (int IDs or string names) or a single attribute (int ID or string name) | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype='uint64']` or `numpy.ndarray[ndim=1, dtype='uint64']` | The exported attributes as a numpy array.                                                    | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype="uint64"]` or `numpy.ndarray[ndim=1, dtype="uint64"]` | The exported attributes as a numpy array.                                                    | | ||||
| 
 | ||||
| ## Doc.from_array {#from_array tag="method"} | ||||
| 
 | ||||
|  | @ -345,14 +345,14 @@ array of attributes. | |||
| | Name        | Type                                   | Description                                                               | | ||||
| | ----------- | -------------------------------------- | ------------------------------------------------------------------------- | | ||||
| | `attrs`     | list                                   | A list of attribute ID ints.                                              | | ||||
| | `array`     | `numpy.ndarray[ndim=2, dtype='int32']` | The attribute values to load.                                             | | ||||
| | `array`     | `numpy.ndarray[ndim=2, dtype="int32"]` | The attribute values to load.                                             | | ||||
| | `exclude`   | list                                   | String names of [serialization fields](#serialization-fields) to exclude. | | ||||
| | **RETURNS** | `Doc`                                  | Itself.                                                                   | | ||||
| 
 | ||||
| 
 | ||||
| ## Doc.from_docs {#from_docs tag="staticmethod"} | ||||
| 
 | ||||
| Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`. | ||||
| Concatenate multiple `Doc` objects to form a new one. Raises an error if the | ||||
| `Doc` objects do not all share the same `Vocab`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
|  | @ -634,7 +634,7 @@ vectors. | |||
| 
 | ||||
| | Name        | Type                                     | Description                                             | | ||||
| | ----------- | ---------------------------------------- | ------------------------------------------------------- | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the document's semantics. | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the document's semantics. | | ||||
| 
 | ||||
| ## Doc.vector_norm {#vector_norm tag="property" model="vectors"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -1,24 +0,0 @@ | |||
| --- | ||||
| title: GoldCorpus | ||||
| teaser: An annotated corpus, using the JSON file format | ||||
| tag: class | ||||
| source: spacy/gold.pyx | ||||
| new: 2 | ||||
| --- | ||||
| 
 | ||||
| This class manages annotations for tagging, dependency parsing and NER. | ||||
| 
 | ||||
| ## GoldCorpus.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Create a `GoldCorpus`. If the input data is an iterable, each item should be a | ||||
| `(text, paragraphs)` tuple, where each paragraph is a tuple | ||||
| `(sentences, brackets)`, and each sentence is a tuple | ||||
| `(ids, words, tags, heads, ner)`. See the implementation of | ||||
| [`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx) | ||||
| for further details. | ||||
| 
 | ||||
| | Name        | Type                    | Description                                                  | | ||||
| | ----------- | ----------------------- | ------------------------------------------------------------ | | ||||
| | `train`     | str / `Path` / iterable | Training data, as a path (file or directory) or iterable.    | | ||||
| | `dev`       | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. | | ||||
| | **RETURNS** | `GoldCorpus`            | The newly constructed object.                                | | ||||
							
								
								
									
										29
									
								
								website/docs/api/sentencerecognizer.md
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										29
									
								
								website/docs/api/sentencerecognizer.md
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,29 @@ | |||
| --- | ||||
| title: SentenceRecognizer | ||||
| tag: class | ||||
| source: spacy/pipeline/pipes.pyx | ||||
| new: 3 | ||||
| --- | ||||
| 
 | ||||
| A trainable pipeline component for sentence segmentation. For a simpler, | ||||
| rule-based strategy, see the [`Sentencizer`](/api/sentencizer). This class is a | ||||
| subclass of `Pipe` and follows the same API. The component is also available via | ||||
| the string name `"senter"`. After initialization, it is typically added to the | ||||
| processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| ## SentenceRecognizer.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Initialize the sentence recognizer. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > # Construction via create_pipe | ||||
| > senter = nlp.create_pipe("senter") | ||||
| > | ||||
| > # Construction from class | ||||
| > from spacy.pipeline import SentenceRecognizer | ||||
| > senter = SentenceRecognizer() | ||||
| > ``` | ||||
| 
 | ||||
| <!-- TODO: document, similar to other trainable pipeline components --> | ||||
|  | @ -12,19 +12,6 @@ require a statistical model to be loaded. The component is also available via | |||
| the string name `"sentencizer"`. After initialization, it is typically added to | ||||
| the processing pipeline using [`nlp.add_pipe`](/api/language#add_pipe). | ||||
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
| 
 | ||||
| Compared to the previous `SentenceSegmenter` class, the `Sentencizer` component | ||||
| doesn't add a hook to `doc.user_hooks["sents"]`. Instead, it iterates over the | ||||
| tokens in the `Doc` and sets the `Token.is_sent_start` property. The | ||||
| `SentenceSegmenter` is still available if you import it directly: | ||||
| 
 | ||||
| ```python | ||||
| from spacy.pipeline import SentenceSegmenter | ||||
| ``` | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ## Sentencizer.\_\_init\_\_ {#init tag="method"} | ||||
| 
 | ||||
| Initialize the sentencizer. | ||||
|  | @ -40,10 +27,24 @@ Initialize the sentencizer. | |||
| > sentencizer = Sentencizer() | ||||
| > ``` | ||||
| 
 | ||||
| | Name          | Type          | Description                                                                                            | | ||||
| | ------------- | ------------- | ------------------------------------------------------------------------------------------------------ | | ||||
| | `punct_chars` | list          | Optional custom list of punctuation characters that mark sentence ends. Defaults to `['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。']`. | | ||||
| | **RETURNS**   | `Sentencizer` | The newly constructed object.                                                                          | | ||||
| | Name          | Type          | Description                                                                                     | | ||||
| | ------------- | ------------- | ----------------------------------------------------------------------------------------------- | | ||||
| | `punct_chars` | list          | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. | | ||||
| | **RETURNS**   | `Sentencizer` | The newly constructed object.                                                                   | | ||||
| 
 | ||||
| ```python | ||||
| ### punct_chars defaults | ||||
| ['!', '.', '?', '։', '؟', '۔', '܀', '܁', '܂', '߹', '।', '॥', '၊', '။', '።', | ||||
|  '፧', '፨', '᙮', '᜵', '᜶', '᠃', '᠉', '᥄', '᥅', '᪨', '᪩', '᪪', '᪫', | ||||
|  '᭚', '᭛', '᭞', '᭟', '᰻', '᰼', '᱾', '᱿', '‼', '‽', '⁇', '⁈', '⁉', | ||||
|  '⸮', '⸼', '꓿', '꘎', '꘏', '꛳', '꛷', '꡶', '꡷', '꣎', '꣏', '꤯', '꧈', | ||||
|  '꧉', '꩝', '꩞', '꩟', '꫰', '꫱', '꯫', '﹒', '﹖', '﹗', '!', '.', '?', | ||||
|  '𐩖', '𐩗', '𑁇', '𑁈', '𑂾', '𑂿', '𑃀', '𑃁', '𑅁', '𑅂', '𑅃', '𑇅', | ||||
|  '𑇆', '𑇍', '𑇞', '𑇟', '𑈸', '𑈹', '𑈻', '𑈼', '𑊩', '𑑋', '𑑌', '𑗂', | ||||
|  '𑗃', '𑗉', '𑗊', '𑗋', '𑗌', '𑗍', '𑗎', '𑗏', '𑗐', '𑗑', '𑗒', '𑗓', | ||||
|  '𑗔', '𑗕', '𑗖', '𑗗', '𑙁', '𑙂', '𑜼', '𑜽', '𑜾', '𑩂', '𑩃', '𑪛', | ||||
|  '𑪜', '𑱁', '𑱂', '𖩮', '𖩯', '𖫵', '𖬷', '𖬸', '𖭄', '𛲟', '𝪈', '。', '。'] | ||||
| ``` | ||||
| 
 | ||||
| ## Sentencizer.\_\_call\_\_ {#call tag="method"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,7 +25,7 @@ Create a Span object from the slice `doc[start : end]`. | |||
| | `end`       | int                                      | The index of the first token after the span.                                                              | | ||||
| | `label`     | int / str                                | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string.       | | ||||
| | `kb_id`     | int / str                                | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | | ||||
| | `vector`    | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                                                     | | ||||
| | `vector`    | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span.                                                                     | | ||||
| | **RETURNS** | `Span`                                   | The newly constructed object.                                                                             | | ||||
| 
 | ||||
| ## Span.\_\_getitem\_\_ {#getitem tag="method"} | ||||
|  | @ -110,7 +110,7 @@ For details, see the documentation on | |||
| 
 | ||||
| | Name      | Type     | Description                                                                                                                           | | ||||
| | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`.                        | | ||||
| | `name`    | str      | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`.                        | | ||||
| | `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                            | | ||||
| | `method`  | callable | Set a custom method on the object, for example `span._.compare(other_span)`.                                                          | | ||||
| | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.            | | ||||
|  | @ -191,7 +191,7 @@ the character indices don't map to a valid span. | |||
| | `end`       | int                                      | The index of the first character after the span.                      | | ||||
| | `label`     | uint64 / str                             | A label to attach to the span, e.g. for named entities.               | | ||||
| | `kb_id`     | uint64 / str                             | An ID from a knowledge base to capture the meaning of a named entity. | | ||||
| | `vector`    | `numpy.ndarray[ndim=1, dtype='float32']` | A meaning representation of the span.                                 | | ||||
| | `vector`    | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span.                                 | | ||||
| | **RETURNS** | `Span`                                   | The newly constructed object or `None`.                               | | ||||
| 
 | ||||
| ## Span.similarity {#similarity tag="method" model="vectors"} | ||||
|  | @ -232,7 +232,7 @@ ancestor is found, e.g. if span excludes a necessary ancestor. | |||
| 
 | ||||
| | Name        | Type                                   | Description                                      | | ||||
| | ----------- | -------------------------------------- | ------------------------------------------------ | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype='int32']` | The lowest common ancestor matrix of the `Span`. | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Span`. | | ||||
| 
 | ||||
| ## Span.to_array {#to_array tag="method" new="2"} | ||||
| 
 | ||||
|  | @ -440,7 +440,7 @@ vectors. | |||
| 
 | ||||
| | Name        | Type                                     | Description                                         | | ||||
| | ----------- | ---------------------------------------- | --------------------------------------------------- | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the span's semantics. | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the span's semantics. | | ||||
| 
 | ||||
| ## Span.vector_norm {#vector_norm tag="property" model="vectors"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -58,7 +58,7 @@ For details, see the documentation on | |||
| 
 | ||||
| | Name      | Type     | Description                                                                                                                             | | ||||
| | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `token._.my_attr`.                         | | ||||
| | `name`    | str      | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`.                         | | ||||
| | `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                              | | ||||
| | `method`  | callable | Set a custom method on the object, for example `token._.compare(other_token)`.                                                          | | ||||
| | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.              | | ||||
|  | @ -370,7 +370,7 @@ A real-valued meaning representation. | |||
| 
 | ||||
| | Name        | Type                                     | Description                                          | | ||||
| | ----------- | ---------------------------------------- | ---------------------------------------------------- | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the token's semantics. | | ||||
| | **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the token's semantics. | | ||||
| 
 | ||||
| ## Token.vector_norm {#vector_norm tag="property" model="vectors"} | ||||
| 
 | ||||
|  | @ -435,8 +435,8 @@ The L2 norm of the token's vector representation. | |||
| | `is_upper`                                   | bool         | Is the token in uppercase? Equivalent to `token.text.isupper()`.                                                                                                                                                                                               | | ||||
| | `is_title`                                   | bool         | Is the token in titlecase? Equivalent to `token.text.istitle()`.                                                                                                                                                                                               | | ||||
| | `is_punct`                                   | bool         | Is the token punctuation?                                                                                                                                                                                                                                      | | ||||
| | `is_left_punct`                              | bool         | Is the token a left punctuation mark, e.g. `'('` ?                                                                                                                                                                                                             | | ||||
| | `is_right_punct`                             | bool         | Is the token a right punctuation mark, e.g. `')'` ?                                                                                                                                                                                                            | | ||||
| | `is_left_punct`                              | bool         | Is the token a left punctuation mark, e.g. `"("` ?                                                                                                                                                                                                             | | ||||
| | `is_right_punct`                             | bool         | Is the token a right punctuation mark, e.g. `")"` ?                                                                                                                                                                                                            | | ||||
| | `is_space`                                   | bool         | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`.                                                                                                                                                                         | | ||||
| | `is_bracket`                                 | bool         | Is the token a bracket?                                                                                                                                                                                                                                        | | ||||
| | `is_quote`                                   | bool         | Is the token a quotation mark?                                                                                                                                                                                                                                 | | ||||
|  |  | |||
|  | @ -3,6 +3,7 @@ title: Top-level Functions | |||
| menu: | ||||
|   - ['spacy', 'spacy'] | ||||
|   - ['displacy', 'displacy'] | ||||
|   - ['Data & Alignment', 'gold'] | ||||
|   - ['Utility Functions', 'util'] | ||||
| --- | ||||
| 
 | ||||
|  | @ -76,8 +77,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your | |||
| > | ||||
| > ```python | ||||
| > spacy.info() | ||||
| > spacy.info("en") | ||||
| > spacy.info("de", markdown=True) | ||||
| > spacy.info("en_core_web_sm") | ||||
| > spacy.info(markdown=True) | ||||
| > ``` | ||||
| 
 | ||||
| | Name       | Type | Description                                      | | ||||
|  | @ -258,6 +259,156 @@ colors for them. Your application or model package can also expose a | |||
| [`spacy_displacy_colors` entry point](/usage/saving-loading#entry-points-displacy) | ||||
| to add custom labels and their colors automatically. | ||||
| 
 | ||||
| ## Training data and alignment {#gold source="spacy/gold"} | ||||
| 
 | ||||
| ### gold.docs_to_json {#docs_to_json tag="function"} | ||||
| 
 | ||||
| Convert a list of Doc objects into the | ||||
| [JSON-serializable format](/api/annotation#json-input) used by the | ||||
| [`spacy train`](/api/cli#train) command. Each input doc will be treated as a | ||||
| 'paragraph' in the output doc. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.gold import docs_to_json | ||||
| > | ||||
| > doc = nlp("I like London") | ||||
| > json_data = docs_to_json([doc]) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type             | Description                                | | ||||
| | ----------- | ---------------- | ------------------------------------------ | | ||||
| | `docs`      | iterable / `Doc` | The `Doc` object(s) to convert.            | | ||||
| | `id`        | int              | ID to assign to the JSON. Defaults to `0`. | | ||||
| | **RETURNS** | dict             | The data in spaCy's JSON format.           | | ||||
| 
 | ||||
| ### gold.align {#align tag="function"} | ||||
| 
 | ||||
| Calculate alignment tables between two tokenizations, using the Levenshtein | ||||
| algorithm. The alignment is case-insensitive. | ||||
| 
 | ||||
| <Infobox title="Important note" variant="warning"> | ||||
| 
 | ||||
| The current implementation of the alignment algorithm assumes that both | ||||
| tokenizations add up to the same string. For example, you'll be able to align | ||||
| `["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not | ||||
| `["I", "'m"]` and `["I", "am"]`. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.gold import align | ||||
| > | ||||
| > bert_tokens = ["obama", "'", "s", "podcast"] | ||||
| > spacy_tokens = ["obama", "'s", "podcast"] | ||||
| > alignment = align(bert_tokens, spacy_tokens) | ||||
| > cost, a2b, b2a, a2b_multi, b2a_multi = alignment | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type  | Description                                                                | | ||||
| | ----------- | ----- | -------------------------------------------------------------------------- | | ||||
| | `tokens_a`  | list  | String values of candidate tokens to align.                                | | ||||
| | `tokens_b`  | list  | String values of reference tokens to align.                                | | ||||
| | **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | | ||||
| 
 | ||||
| The returned tuple contains the following alignment information: | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > a2b = array([0, -1, -1, 2]) | ||||
| > b2a = array([0, 2, 3]) | ||||
| > a2b_multi = {1: 1, 2: 1} | ||||
| > b2a_multi = {} | ||||
| > ``` | ||||
| > | ||||
| > If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If | ||||
| > there's no one-to-one alignment for a token, it has the value `-1`. | ||||
| 
 | ||||
| | Name        | Type                                   | Description                                                                                                                                     | | ||||
| | ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `cost`      | int                                    | The number of misaligned tokens.                                                                                                                | | ||||
| | `a2b`       | `numpy.ndarray[ndim=1, dtype="int32"]` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`.                                                                          | | ||||
| | `b2a`       | `numpy.ndarray[ndim=1, dtype="int32"]` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`.                                                                          | | ||||
| | `a2b_multi` | dict                                   | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | | ||||
| | `b2a_multi` | dict                                   | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. | | ||||
| 
 | ||||
| ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} | ||||
| 
 | ||||
| Encode labelled spans into per-token tags, using the | ||||
| [BILUO scheme](/api/annotation#biluo) (Begin, In, Last, Unit, Out). Returns a | ||||
| list of strings, describing the tags. Each tag string will be of the form of | ||||
| either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, | ||||
| `"L"`, `"U"`. The string `"-"` is used where the entity offsets don't align with | ||||
| the tokenization in the `Doc` object. The training algorithm will view these as | ||||
| missing values. `O` denotes a non-entity token. `B` denotes the beginning of a | ||||
| multi-token entity, `I` the inside of an entity of three or more tokens, and `L` | ||||
| the end of an entity of two or more tokens. `U` denotes a single-token entity. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.gold import biluo_tags_from_offsets | ||||
| > | ||||
| > doc = nlp("I like London.") | ||||
| > entities = [(7, 13, "LOC")] | ||||
| > tags = biluo_tags_from_offsets(doc, entities) | ||||
| > assert tags == ["O", "O", "U-LOC", "O"] | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                                                                                                     | | ||||
| | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `doc`       | `Doc`    | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document.                          | | ||||
| | `entities`  | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | | ||||
| | **RETURNS** | list     | A list of strings describing the [BILUO](/api/annotation#biluo) tags.                                                                           | | ||||
| 
 | ||||
| ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} | ||||
| 
 | ||||
| Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into | ||||
| entity offsets. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.gold import offsets_from_biluo_tags | ||||
| > | ||||
| > doc = nlp("I like London.") | ||||
| > tags = ["O", "O", "U-LOC", "O"] | ||||
| > entities = offsets_from_biluo_tags(doc, tags) | ||||
| > assert entities == [(7, 13, "LOC")] | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                                                                                                                                                                                 | | ||||
| | ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `doc`       | `Doc`    | The document that the BILUO tags refer to.                                                                                                                                                                                  | | ||||
| | `tags`      | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | | ||||
| | **RETURNS** | list     | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string.                                                                               | | ||||
| 
 | ||||
| ### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} | ||||
| 
 | ||||
| Encode per-token tags following the [BILUO scheme](/api/annotation#biluo) into | ||||
| [`Span`](/api/span) objects. This can be used to create entity spans from | ||||
| token-based tags, e.g. to overwrite the `doc.ents`. | ||||
| 
 | ||||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > from spacy.gold import spans_from_biluo_tags | ||||
| > | ||||
| > doc = nlp("I like London.") | ||||
| > tags = ["O", "O", "U-LOC", "O"] | ||||
| > doc.ents = spans_from_biluo_tags(doc, tags) | ||||
| > ``` | ||||
| 
 | ||||
| | Name        | Type     | Description                                                                                                                                                                                                                 | | ||||
| | ----------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `doc`       | `Doc`    | The document that the BILUO tags refer to.                                                                                                                                                                                  | | ||||
| | `tags`      | iterable | A sequence of [BILUO](/api/annotation#biluo) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | | ||||
| | **RETURNS** | list     | A sequence of `Span` objects with added entity labels.                                                                                                                                                                      | | ||||
| 
 | ||||
| ## Utility functions {#util source="spacy/util.py"} | ||||
| 
 | ||||
| spaCy comes with a small collection of utility functions located in | ||||
|  | @ -341,7 +492,7 @@ class. The model data will then be loaded in via | |||
| > #### Example | ||||
| > | ||||
| > ```python | ||||
| > nlp = util.load_model("en") | ||||
| > nlp = util.load_model("en_core_web_sm") | ||||
| > nlp = util.load_model("en_core_web_sm", disable=["ner"]) | ||||
| > nlp = util.load_model("/path/to/data") | ||||
| > ``` | ||||
|  | @ -634,3 +785,13 @@ of one entity) or when merging spans with | |||
| | ----------- | -------- | -------------------- | | ||||
| | `spans`     | iterable | The spans to filter. | | ||||
| | **RETURNS** | list     | The filtered spans.  | | ||||
| 
 | ||||
| ## util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} | ||||
| 
 | ||||
| <!-- TODO: document --> | ||||
| 
 | ||||
| | Name        | Type  | Description | | ||||
| | ----------- | ----- | ----------- | | ||||
| | `words`     | list  |             | | ||||
| | `text`      | str   |             | | ||||
| | **RETURNS** | tuple |             | | ||||
|  |  | |||
|  | @ -12,6 +12,8 @@ place** by the components of the pipeline. The `Language` object coordinates | |||
| these components. It takes raw text and sends it through the pipeline, returning | ||||
| an **annotated document**. It also orchestrates training and serialization. | ||||
| 
 | ||||
| <!-- TODO: update architecture and tables below to match sidebar in API docs etc. --> | ||||
| 
 | ||||
|  | ||||
| 
 | ||||
| ### Container objects {#architecture-containers} | ||||
|  |  | |||
|  | @ -392,9 +392,7 @@ loading models, the underlying functionality is entirely based on native Python | |||
| packages. This allows your application to handle a model like any other package | ||||
| dependency. | ||||
| 
 | ||||
| For an example of an automated model training and build process, see | ||||
| [this overview](/usage/training#example-training-spacy) of how we're training | ||||
| and packaging our models for spaCy. | ||||
| <!-- TODO: reference relevant spaCy project --> | ||||
| 
 | ||||
| ### Downloading and requiring model dependencies {#models-download} | ||||
| 
 | ||||
|  |  | |||
|  | @ -711,67 +711,4 @@ class and call [`from_disk`](/api/language#from_disk) instead. | |||
| nlp = spacy.blank("en").from_disk("/path/to/data") | ||||
| ``` | ||||
| 
 | ||||
| <Infobox title="Important note: Loading data in v2.x" variant="warning"> | ||||
| 
 | ||||
| In spaCy 1.x, the distinction between `spacy.load()` and the `Language` class | ||||
| constructor was quite unclear. You could call `spacy.load()` when no model was | ||||
| present, and it would silently return an empty object. Likewise, you could pass | ||||
| a path to `English`, even if the model required a different language. spaCy v2.0 | ||||
| solves this with a clear distinction between setting up the instance and loading | ||||
| the data. | ||||
| 
 | ||||
| ```diff | ||||
| - nlp = spacy.load("en_core_web_sm", path="/path/to/data") | ||||
| + nlp = spacy.blank("en_core_web_sm").from_disk("/path/to/data") | ||||
| ``` | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ### How we're training and packaging models for spaCy {#example-training-spacy} | ||||
| 
 | ||||
| Publishing a new version of spaCy often means re-training all available models, | ||||
| which is [quite a lot](/usage/models#languages). To make this run smoothly, | ||||
| we're using an automated build process and a [`spacy train`](/api/cli#train) | ||||
| template that looks like this: | ||||
| 
 | ||||
| ```bash | ||||
| $ python -m spacy train {lang} {models_dir}/{name} {train_data} {dev_data} -m meta/{name}.json -V {version} -g {gpu_id} -n {n_epoch} -ns {n_sents} | ||||
| ``` | ||||
| 
 | ||||
| > #### meta.json template | ||||
| > | ||||
| > ```json | ||||
| > { | ||||
| >   "lang": "en", | ||||
| >   "name": "core_web_sm", | ||||
| >   "license": "CC BY-SA 3.0", | ||||
| >   "author": "Explosion AI", | ||||
| >   "url": "https://explosion.ai", | ||||
| >   "email": "contact@explosion.ai", | ||||
| >   "sources": ["OntoNotes 5", "Common Crawl"], | ||||
| >   "description": "English multi-task CNN trained on OntoNotes, with GloVe vectors trained on common crawl. Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities." | ||||
| > } | ||||
| > ``` | ||||
| 
 | ||||
| In a directory `meta`, we keep `meta.json` templates for the individual models, | ||||
| containing all relevant information that doesn't change across versions, like | ||||
| the name, description, author info and training data sources. When we train the | ||||
| model, we pass in the file to the meta template as the `--meta` argument, and | ||||
| specify the current model version as the `--version` argument. | ||||
| 
 | ||||
| On each epoch, the model is saved out with a `meta.json` using our template and | ||||
| added properties, like the `pipeline`, `accuracy` scores and the `spacy_version` | ||||
| used to train the model. After training completion, the best model is selected | ||||
| automatically and packaged using the [`package`](/api/cli#package) command. | ||||
| Since a full meta file is already present on the trained model, no further setup | ||||
| is required to build a valid model package. | ||||
| 
 | ||||
| ```bash | ||||
| python -m spacy package -f {best_model} dist/ | ||||
| cd dist/{model_name} | ||||
| python setup.py sdist | ||||
| ``` | ||||
| 
 | ||||
| This process allows us to quickly trigger the model training and build process | ||||
| for all available models and languages, and generate the correct meta data | ||||
| automatically. | ||||
| <!-- TODO: point to spaCy projects? --> | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ menu: | |||
|   - ['CLI & Config', 'cli-config'] | ||||
|   - ['Custom Models', 'custom-models'] | ||||
|   - ['Transfer Learning', 'transfer-learning'] | ||||
|   - ['Parallel Training', 'parallel-training'] | ||||
|   - ['Internal API', 'api'] | ||||
| --- | ||||
| 
 | ||||
|  | @ -43,6 +44,10 @@ The recommended way to train your spaCy models is via the | |||
| 
 | ||||
| <!-- TODO: decide how we want to present the "getting started" workflow here, get a default config etc. --> | ||||
| 
 | ||||
| ### Training data format {#data-format} | ||||
| 
 | ||||
| <!-- TODO: explain the new binary DocBin format --> | ||||
| 
 | ||||
| > #### Tip: Debug your data | ||||
| > | ||||
| > The [`debug-data` command](/api/cli#debug-data) lets you analyze and validate | ||||
|  | @ -167,6 +172,10 @@ dropout = null | |||
| 
 | ||||
| <!-- TODO: document spacy pretrain --> | ||||
| 
 | ||||
| ## Parallel Training with Ray {#parallel-training} | ||||
| 
 | ||||
| <!-- TODO: document Ray integration --> | ||||
| 
 | ||||
| ## Internal training API {#api} | ||||
| 
 | ||||
| <!-- TODO: rewrite for new nlp.update / example logic --> | ||||
|  |  | |||
|  | @ -68,7 +68,8 @@ | |||
|                     { "text": "Token", "url": "/api/token" }, | ||||
|                     { "text": "Span", "url": "/api/span" }, | ||||
|                     { "text": "Lexeme", "url": "/api/lexeme" }, | ||||
|                     { "text": "Example", "url": "/api/example" } | ||||
|                     { "text": "Example", "url": "/api/example" }, | ||||
|                     { "text": "DocBin", "url": "/api/docbin" } | ||||
|                 ] | ||||
|             }, | ||||
|             { | ||||
|  | @ -86,6 +87,7 @@ | |||
|                     { "text": "PhraseMatcher", "url": "/api/phrasematcher" }, | ||||
|                     { "text": "EntityRuler", "url": "/api/entityruler" }, | ||||
|                     { "text": "Sentencizer", "url": "/api/sentencizer" }, | ||||
|                     { "text": "SentenceRecognizer", "url": "/api/sentencerecognizer" }, | ||||
|                     { "text": "Other Functions", "url": "/api/pipeline-functions" } | ||||
|                 ] | ||||
|             }, | ||||
|  | @ -97,10 +99,8 @@ | |||
|                     { "text": "Vectors", "url": "/api/vectors" }, | ||||
|                     { "text": "Lookups", "url": "/api/lookups" }, | ||||
|                     { "text": "KnowledgeBase", "url": "/api/kb" }, | ||||
|                     { "text": "GoldParse", "url": "/api/goldparse" }, | ||||
|                     { "text": "GoldCorpus", "url": "/api/goldcorpus" }, | ||||
|                     { "text": "Scorer", "url": "/api/scorer" }, | ||||
|                     { "text": "DocBin", "url": "/api/docbin" } | ||||
|                     { "text": "Corpus", "url": "/api/corpus" } | ||||
|                 ] | ||||
|             }, | ||||
|             { | ||||
|  |  | |||
|  | @ -83,12 +83,13 @@ export class Code extends React.Component { | |||
|             executable, | ||||
|             github, | ||||
|             prompt, | ||||
|             wrap, | ||||
|             highlight, | ||||
|             className, | ||||
|             children, | ||||
|         } = this.props | ||||
|         const codeClassNames = classNames(classes.code, className, `language-${lang}`, { | ||||
|             [classes.wrap]: !!highlight, | ||||
|             [classes.wrap]: !!highlight || !!wrap, | ||||
|         }) | ||||
|         const ghClassNames = classNames(codeClassNames, classes.maxHeight) | ||||
|         const { Juniper } = this.state | ||||
|  |  | |||
|  | @ -83,7 +83,7 @@ const QuickstartInstall = ({ id, title }) => ( | |||
|                         export PYTHONPATH=`pwd` | ||||
|                     </QS> | ||||
|                     <QS package="source" os="windows"> | ||||
|                         set PYTHONPATH=/path/to/spaCy | ||||
|                         set PYTHONPATH=C:\path\to\spaCy | ||||
|                     </QS> | ||||
|                     <QS package="source">pip install -r requirements.txt</QS> | ||||
|                     <QS data="lookups" package="pip"> | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user