mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	Update docs [ci skip]
This commit is contained in:
		
							parent
							
								
									44160cd52f
								
							
						
					
					
						commit
						0a8a124a6e
					
				|  | @ -19,13 +19,13 @@ def init_vectors_cli( | |||
|     output_dir: Path = Arg(..., help="Pipeline output directory"), | ||||
|     prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), | ||||
|     truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), | ||||
|     jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file"), | ||||
|     name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), | ||||
|     verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), | ||||
|     jsonl_loc: Optional[Path]=Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file") | ||||
|     # fmt: on | ||||
| ): | ||||
|     """Convert word vectors for use with spaCy. Will export an nlp object that | ||||
|     you can use in the [initialize.vocab] block of your config to initialize | ||||
|     you can use in the [initialize] block of your config to initialize | ||||
|     a model with vectors. | ||||
|     """ | ||||
|     util.logger.setLevel(logging.DEBUG if verbose else logging.INFO) | ||||
|  |  | |||
|  | @ -170,36 +170,35 @@ $ python -m spacy init fill-config [base_path] [output_file] [--diff] | |||
| | `--help`, `-h`         | Show help message and available arguments. ~~bool (flag)~~                                                                          | | ||||
| | **CREATES**            | Complete and auto-filled config file for training.                                                                                  | | ||||
| 
 | ||||
| ### init vocab {#init-vocab new="3" tag="command"} | ||||
| ### init vectors {#init-vectors new="3" tag="command"} | ||||
| 
 | ||||
| Create a blank pipeline directory from raw data, like word frequencies, Brown | ||||
| clusters and word vectors. Note that in order to populate the vocabulary, you | ||||
| need to pass in a JSONL-formatted | ||||
| [vocabulary file](/api/data-formats#vocab-jsonl) as `--jsonl-loc` with optional | ||||
| `id` values that correspond to the vectors table. Just loading in vectors will | ||||
| not automatically populate the vocab. | ||||
| Convert [word vectors](/usage/linguistic-features#vectors-similarity) for use | ||||
| with spaCy. Will export an `nlp` object that you can use in the | ||||
| [`[initialize]`](/api/data-formats#config-initialize) block of your config to | ||||
| initialize a model with vectors. See the usage guide on | ||||
| [static vectors](/usage/embeddings-transformers#static-vectors) for details on | ||||
| how to use vectors in your model. | ||||
| 
 | ||||
| <Infobox title="New in v3.0" variant="warning" id="init-model"> | ||||
| 
 | ||||
| This command was previously called `init-model`. | ||||
| This functionality was previously available as part of the command `init-model`. | ||||
| 
 | ||||
| </Infobox> | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy init vocab [lang] [output_dir] [--jsonl-loc] [--vectors-loc] [--prune-vectors] [--vectors-name] [--meta-name] [--base] | ||||
| $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--lexemes-jsonl] [--verbose] | ||||
| ``` | ||||
| 
 | ||||
| | Name                    | Description                                                                                                                                                                                                                                                         | | ||||
| | ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `lang`                  | Pipeline language [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes), e.g. `en`. ~~str (positional)~~                                                                                                                                                | | ||||
| | `vectors_loc`           | Location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Path (positional)~~ | | ||||
| | `output_dir`            | Pipeline output directory. Will be created if it doesn't exist. ~~Path (positional)~~                                                                                                                                                                               | | ||||
| | `--jsonl-loc`, `-j`                                     | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~                                                                                                                                         | | ||||
| | `--vectors-loc`, `-v`                                   | Optional location of vectors. Should be a file where the first row contains the dimensions of the vectors, followed by a space-separated Word2Vec table. File can be provided in `.txt` format or as a zipped text file in `.zip` or `.tar.gz` format. ~~Optional[Path] \(option)~~ | | ||||
| | `--truncate-vectors`, `-t` <Tag variant="new">2.3</Tag> | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                                  | | ||||
| | `--prune-vectors`, `-V`                                 | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                                     | | ||||
| | `--vectors-name`, `-vn`                                 | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                                   | | ||||
| | `--meta-name`, `-mn`                                    | Optional name of the package for the pipeline meta. ~~Optional[str] \(option)~~                                                                                                                                                                                                     | | ||||
| | `--base`, `-b`                                          | Optional name of or path to base pipeline to start with (mostly relevant for pipelines with custom tokenizers). ~~Optional[str] \(option)~~                                                                                                                                         | | ||||
| | `--lexemes-jsonl`, `-j` | Optional location of JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) with lexical attributes. ~~Optional[Path] \(option)~~                                                                                                                         | | ||||
| | `--truncate`, `-t`      | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~                                                                                                                                                  | | ||||
| | `--prune`, `-p`         | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~                                                                                                                                                                     | | ||||
| | `--name`, `-n`          | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~                                                                                                                                                   | | ||||
| | `--verbose`, `-V`       | Print additional information and explanations. ~~bool (flag)~~                                                                                                                                                                                                      | | ||||
| | `--help`, `-h`          | Show help message and available arguments. ~~bool (flag)~~                                                                                                                                                                                                          | | ||||
| | **CREATES**             | A spaCy pipeline directory containing the vocab and vectors.                                                                                                                                                                                                        | | ||||
| 
 | ||||
|  |  | |||
|  | @ -246,7 +246,7 @@ without requiring them at runtime when you load the trained pipeline back in. | |||
| | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). Defaults to variable `${paths.init_tok2vec}`. ~~Optional[str]~~                                                                                                                                                                                                                                                | | ||||
| | `lookups`      | Additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `null`. ~~Optional[Lookups]~~                                                                                                                                                                                                                                                       | | ||||
| | `tokenizer`    | Additional arguments passed to the `initialize` method of the specified tokenizer. Can be used for languages like Chinese that depend on dictionaries or trained models for tokenization. If type annotations are available on the method, the config will be validated against them. The `initialize` method will always receive the `get_examples` callback and the current `nlp` object. ~~Dict[str, Any]~~ | | ||||
| | `vectors`      | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vocab`](/api/cli#init-vocab). Defaults to `null`. ~~Optional[str]~~                                                                                                                                                                                                                                               | | ||||
| | `vectors`      | Name or path of pipeline containing pretrained word vectors to use, e.g. created with [`init vectors`](/api/cli#init-vectors). Defaults to `null`. ~~Optional[str]~~                                                                                                                                                                                                                                           | | ||||
| | `vocab_data`   | Path to JSONL-formatted [vocabulary file](/api/data-formats#vocab-jsonl) to initialize vocabulary. ~~Optional[str]~~                                                                                                                                                                                                                                                                                           | | ||||
| 
 | ||||
| ## Training data {#training} | ||||
|  | @ -274,8 +274,8 @@ Typically, the extension for these binary files is `.spacy`, and they are used | |||
| as input format for specifying a [training corpus](/api/corpus) and for spaCy's | ||||
| CLI [`train`](/api/cli#train) command. The built-in | ||||
| [`convert`](/api/cli#convert) command helps you convert spaCy's previous | ||||
| [JSON format](#json-input) to the new binary format. It also supports | ||||
| conversion of the `.conllu` format used by the | ||||
| [JSON format](#json-input) to the new binary format. It also supports conversion | ||||
| of the `.conllu` format used by the | ||||
| [Universal Dependencies corpora](https://github.com/UniversalDependencies). | ||||
| 
 | ||||
| ### JSON training format {#json-input tag="deprecated"} | ||||
|  | @ -455,7 +455,7 @@ example = Example.from_dict(doc, gold_dict) | |||
| ## Lexical data for vocabulary {#vocab-jsonl new="2"} | ||||
| 
 | ||||
| To populate a pipeline's vocabulary, you can use the | ||||
| [`spacy init vocab`](/api/cli#init-vocab) command and load in a | ||||
| [`spacy init vectors`](/api/cli#init-vectors) command and load in a | ||||
| [newline-delimited JSON](http://jsonlines.org/) (JSONL) file containing one | ||||
| lexical entry per line via the `--lexemes-jsonl` option. The first line defines the | ||||
| language and vocabulary settings. All other lines are expected to be JSON | ||||
|  |  | |||
|  | @ -372,7 +372,7 @@ results to a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of | |||
| using one of the built-in loggers listed here, you can also | ||||
| [implement your own](/usage/training#custom-logging). | ||||
| 
 | ||||
| #### ConsoleLogger {#ConsoleLogger tag="registered function"} | ||||
| #### spacy.ConsoleLogger.v1 {#ConsoleLogger tag="registered function"} | ||||
| 
 | ||||
| > #### Example config | ||||
| > | ||||
|  | @ -418,7 +418,7 @@ start decreasing across epochs. | |||
| 
 | ||||
|  </Accordion> | ||||
| 
 | ||||
| #### WandbLogger {#WandbLogger tag="registered function"} | ||||
| #### spacy.WandbLogger.v1 {#WandbLogger tag="registered function"} | ||||
| 
 | ||||
| > #### Installation | ||||
| > | ||||
|  | @ -480,7 +480,7 @@ with your own registered function in the | |||
| [`@readers` registry](/api/top-level#registry) to customize the data loading and | ||||
| streaming. | ||||
| 
 | ||||
| ### Corpus {#corpus} | ||||
| ### spacy.Corpus.v1 {#corpus tag="registered function"} | ||||
| 
 | ||||
| The `Corpus` reader manages annotated corpora and can be used for training and | ||||
| development datasets in the [DocBin](/api/docbin) (`.spacy`) format. Also see | ||||
|  | @ -507,8 +507,9 @@ the [`Corpus`](/api/corpus) class. | |||
| | `max_length`    | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~                                                                                                                                      | | ||||
| | `limit`         | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                                                                                                                                                                          | | ||||
| | `augmenter`     | Apply some simple data augmentation, where we replace tokens with variations. This is especially useful for punctuation and case replacement, to help generalize beyond corpora that don't have smart-quotes, or only have smart quotes, etc. Defaults to `None`. ~~Optional[Callable]~~ | | ||||
| | **CREATES**     | The corpus reader. ~~Corpus~~                                                                                                                                                                                                                                                            | | ||||
| 
 | ||||
| ### JsonlReader {#jsonlreader} | ||||
| ### spacy.JsonlReader.v1 {#jsonlreader tag="registered function"} | ||||
| 
 | ||||
| Create [`Example`](/api/example) objects from a JSONL (newline-delimited JSON) | ||||
| file of texts keyed by `"text"`. Can be used to read the raw text corpus for | ||||
|  | @ -535,6 +536,7 @@ JSONL file. Also see the [`JsonlReader`](/api/corpus#jsonlreader) class. | |||
| | `min_length` | Minimum document length (in tokens). Shorter documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~       | | ||||
| | `max_length` | Maximum document length (in tokens). Longer documents will be skipped. Defaults to `0`, which indicates no limit. ~~int~~        | | ||||
| | `limit`      | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~                                  | | ||||
| | **CREATES**  | The corpus reader. ~~JsonlTexts~~                                                                                                | | ||||
| 
 | ||||
| ## Batchers {#batchers source="spacy/training/batchers.py" new="3"} | ||||
| 
 | ||||
|  | @ -550,7 +552,7 @@ Instead of using one of the built-in batchers listed here, you can also | |||
| [implement your own](/usage/training#custom-code-readers-batchers), which may or | ||||
| may not use a custom schedule. | ||||
| 
 | ||||
| ### batch_by_words {#batch_by_words tag="registered function"} | ||||
| ### spacy.batch_by_words.v1 {#batch_by_words tag="registered function"} | ||||
| 
 | ||||
| Create minibatches of roughly a given number of words. If any examples are | ||||
| longer than the specified batch length, they will appear in a batch by | ||||
|  | @ -576,8 +578,9 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument | |||
| | `tolerance`        | What percentage of the size to allow batches to exceed. ~~float~~                                                                                                                       | | ||||
| | `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~                                                                                                     | | ||||
| | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 | | ||||
| | **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     | | ||||
| 
 | ||||
| ### batch_by_sequence {#batch_by_sequence tag="registered function"} | ||||
| ### spacy.batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} | ||||
| 
 | ||||
| > #### Example config | ||||
| > | ||||
|  | @ -594,8 +597,9 @@ Create a batcher that creates batches of the specified size. | |||
| | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ||||
| | `size`       | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ | | ||||
| | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                 | | ||||
| | **CREATES**  | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                     | | ||||
| 
 | ||||
| ### batch_by_padded {#batch_by_padded tag="registered function"} | ||||
| ### spacy.batch_by_padded.v1 {#batch_by_padded tag="registered function"} | ||||
| 
 | ||||
| > #### Example config | ||||
| > | ||||
|  | @ -619,20 +623,21 @@ sequences in the batch. | |||
| | `buffer`           | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ | | ||||
| | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~                                                                                                                                     | | ||||
| | `get_length`       | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~                                                                                     | | ||||
| | **CREATES**        | The batcher that takes an iterable of items and returns batches. ~~Callable[[Iterable[Any]], Iterable[List[Any]]]~~                                                                                                                         | | ||||
| 
 | ||||
| ## Augmenters {#augmenters source="spacy/training/augment.py" new="3"} | ||||
| 
 | ||||
| <!-- TODO: intro, explain data augmentation concept --> | ||||
| 
 | ||||
| ### orth_variants {#orth_variants tag="registered function"} | ||||
| ### spacy.orth_variants.v1 {#orth_variants tag="registered function"} | ||||
| 
 | ||||
| > #### Example config | ||||
| > | ||||
| > ```ini | ||||
| > [corpora.train.augmenter] | ||||
| > @augmenters = "spacy.orth_variants.v1" | ||||
| > level = 0.0 | ||||
| > lower = 0.0 | ||||
| > level = 0.1 | ||||
| > lower = 0.5 | ||||
| > lookups = null | ||||
| > ``` | ||||
| 
 | ||||
|  | @ -643,10 +648,10 @@ beyond corpora that don't have smart quotes, or only have smart quotes etc. | |||
| 
 | ||||
| | Name        | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    | | ||||
| | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ||||
| | `level`     | ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      | | ||||
| | `lower`     | ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      | | ||||
| | `level`     | The percentage of texts that will be augmented. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                      | | ||||
| | `lower`     | The percentage of texts that will be lowercased. ~~float~~                                                                                                                                                                                                                                                                                                                                                                                                                                                                     | | ||||
| | `lookups`   | Lookups table containing the orth variants to use. See [`orth_variants.json`](https://github.com/explosion/spacy-lookups-data/blob/master/spacy_lookups_data/data/en_orth_variants.json) for an example. If not set, tables from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) are used if available and added in the [`[initialize]`](/api/data-formats#config-initialize) block of the config. If no orth variants are found, spaCy will raise an error. Defaults to `None`. ~~Optional[Lookups]~~ | | ||||
| | **RETURNS** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   | | ||||
| | **CREATES** | A function that takes the current `nlp` object and an [`Example`](/api/example) and yields augmented `Example` objects. ~~Callable[[Language, Example], Iterator[Example]]~~                                                                                                                                                                                                                                                                                                                                                   | | ||||
| 
 | ||||
| ## Training data and alignment {#gold source="spacy/training"} | ||||
| 
 | ||||
|  |  | |||
|  | @ -496,7 +496,7 @@ algorithms for learning word vector tables. You can train a word vectors table | |||
| using tools such as [Gensim](https://radimrehurek.com/gensim/), | ||||
| [FastText](https://fasttext.cc/) or | ||||
| [GloVe](https://nlp.stanford.edu/projects/glove/), or download existing | ||||
| pretrained vectors. The [`init vocab`](/api/cli#init-vocab) command lets you | ||||
| pretrained vectors. The [`init vectors`](/api/cli#init-vectors) command lets you | ||||
| convert vectors for use with spaCy and will give you a directory you can load or | ||||
| refer to in your [training configs](/usage/training#config). | ||||
| 
 | ||||
|  |  | |||
|  | @ -1834,10 +1834,12 @@ word vector libraries output an easy-to-read text-based format, where each line | |||
| consists of the word followed by its vector. For everyday use, we want to | ||||
| convert the vectors into a binary format that loads faster and takes up less | ||||
| space on disk. The easiest way to do this is the | ||||
| [`init vocab`](/api/cli#init-vocab) command-line utility. This will output a | ||||
| [`init vectors`](/api/cli#init-vectors) command-line utility. This will output a | ||||
| blank spaCy pipeline in the directory `/tmp/la_vectors_wiki_lg`, giving you | ||||
| access to some nice Latin vectors. You can then pass the directory path to | ||||
| [`spacy.load`](/api/top-level#spacy.load). | ||||
| [`spacy.load`](/api/top-level#spacy.load) or use it in the | ||||
| [`[initialize]`](/api/data-formats#config-initialize) block of your config when you | ||||
| [train](/usage/training) a model. | ||||
| 
 | ||||
| > #### Usage example | ||||
| > | ||||
|  | @ -1850,7 +1852,7 @@ access to some nice Latin vectors. You can then pass the directory path to | |||
| 
 | ||||
| ```cli | ||||
| $ wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/word-vectors-v2/cc.la.300.vec.gz | ||||
| $ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300.vec.gz | ||||
| $ python -m spacy init vectors en cc.la.300.vec.gz /tmp/la_vectors_wiki_lg | ||||
| ``` | ||||
| 
 | ||||
| <Accordion title="How to optimize vector coverage" id="custom-vectors-coverage" spaced> | ||||
|  | @ -1858,9 +1860,9 @@ $ python -m spacy init vocab en /tmp/la_vectors_wiki_lg --vectors-loc cc.la.300. | |||
| To help you strike a good balance between coverage and memory usage, spaCy's | ||||
| [`Vectors`](/api/vectors) class lets you map **multiple keys** to the **same | ||||
| row** of the table. If you're using the | ||||
| [`spacy init vocab`](/api/cli#init-vocab) command to create a vocabulary, | ||||
| pruning the vectors will be taken care of automatically if you set the | ||||
| `--prune-vectors` flag. You can also do it manually in the following steps: | ||||
| [`spacy init vectors`](/api/cli#init-vectors) command to create a vocabulary, | ||||
| pruning the vectors will be taken care of automatically if you set the `--prune` | ||||
| flag. You can also do it manually in the following steps: | ||||
| 
 | ||||
| 1. Start with a **word vectors package** that covers a huge vocabulary. For | ||||
|    instance, the [`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg) | ||||
|  | @ -1905,12 +1907,12 @@ the two words. | |||
| In the example above, the vector for "Shore" was removed and remapped to the | ||||
| vector of "coast", which is deemed about 73% similar. "Leaving" was remapped to | ||||
| the vector of "leaving", which is identical. If you're using the | ||||
| [`init vocab`](/api/cli#init-vocab) command, you can set the `--prune-vectors` | ||||
| [`init vectors`](/api/cli#init-vectors) command, you can set the `--prune` | ||||
| option to easily reduce the size of the vectors as you add them to a spaCy | ||||
| pipeline: | ||||
| 
 | ||||
| ```cli | ||||
| $ python -m spacy init vocab en /tmp/la_vectors_web_md --vectors-loc la.300d.vec.tgz --prune-vectors 10000 | ||||
| $ python -m spacy init vectors en la.300d.vec.tgz /tmp/la_vectors_web_md --prune 10000 | ||||
| ``` | ||||
| 
 | ||||
| This will create a blank spaCy pipeline with vectors for the first 10,000 words | ||||
|  |  | |||
|  | @ -564,7 +564,7 @@ Note that spaCy v3.0 now requires **Python 3.6+**. | |||
| | `KnowledgeBase.load_bulk`, `KnowledgeBase.dump`                                              | [`KnowledgeBase.from_disk`](/api/kb#from_disk), [`KnowledgeBase.to_disk`](/api/kb#to_disk)                                                                                                                               | | ||||
| | `Matcher.pipe`, `PhraseMatcher.pipe`                                                         | not needed                                                                                                                                                                                                               | | ||||
| | `gold.offsets_from_biluo_tags`, `gold.spans_from_biluo_tags`, `gold.biluo_tags_from_offsets` | [`training.biluo_tags_to_offsets`](/api/top-level#biluo_tags_to_offsets), [`training.biluo_tags_to_spans`](/api/top-level#biluo_tags_to_spans), [`training.offsets_to_biluo_tags`](/api/top-level#offsets_to_biluo_tags) | | ||||
| | `spacy init-model`                                                                           | [`spacy init vocab`](/api/cli#init-vocab)                                                                                                                                                                                | | ||||
| | `spacy init-model`                                                                           | [`spacy init vectors`](/api/cli#init-vectors)                                                                                                                                                                            | | ||||
| | `spacy debug-data`                                                                           | [`spacy debug data`](/api/cli#debug-data)                                                                                                                                                                                | | ||||
| | `spacy profile`                                                                              | [`spacy debug profile`](/api/cli#debug-profile)                                                                                                                                                                          | | ||||
| | `spacy link`, `util.set_data_path`, `util.get_data_path`                                     | not needed, symlinks are deprecated                                                                                                                                                                                      | | ||||
|  |  | |||
|  | @ -23,6 +23,8 @@ | |||
|     "PhraseMatcher": "/api/phrasematcher", | ||||
|     "TransformerData": "/api/transformer#transformerdata", | ||||
|     "FullTransformerBatch": "/api/transformer#fulltransformerbatch", | ||||
|     "Corpus": "/api/corpus", | ||||
|     "JsonlTexts": "/api/corpus#jsonltexts", | ||||
|     "LexemeC": "/api/cython-structs#lexemec", | ||||
|     "TokenC": "/api/cython-structs#tokenc", | ||||
|     "Config": "https://thinc.ai/docs/api-config#config", | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user