Merge branch 'develop' into feature/prepare

Ines Montani 2020-09-29 20:53:05 +02:00
commit d3c63b7965
39 changed files with 189 additions and 134 deletions

View File

@@ -14,7 +14,7 @@ pathy
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 tqdm>=4.38.0,<5.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools

View File

@@ -51,7 +51,7 @@ install_requires =
 tqdm>=4.38.0,<5.0.0
 numpy>=1.15.0
 requests>=2.13.0,<3.0.0
-pydantic>=1.3.0,<2.0.0
+pydantic>=1.5.0,<2.0.0
 pytokenizations
 # Official Python utilities
 setuptools

View File

@@ -114,6 +114,6 @@ def project_document(
         content = f"{before}{content}{after}"
     else:
         msg.warn("Replacing existing file")
-    with output_file.open("w") as f:
+    with output_file.open("w", encoding="utf8") as f:
         f.write(content)
     msg.good("Saved project documentation", output_file)

View File

@@ -270,6 +270,7 @@ factory = "{{ pipe }}"
 @readers = "spacy.Corpus.v1"
 path = ${paths.train}
 max_length = {{ 500 if hardware == "gpu" else 2000 }}
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"

View File

@@ -36,6 +36,11 @@ gold_preproc = false
 max_length = 0
 # Limitation on number of training examples
 limit = 0
+# Apply some simple data augmentation, where we replace tokens with variations.
+# This is especially useful for punctuation and case replacement, to help
+# generalize beyond corpora that don't have smart-quotes, or only have smart
+# quotes, etc.
+augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
 
 [corpora.dev]
 @readers = "spacy.Corpus.v1"

View File

@@ -4,7 +4,7 @@ from spacy.training import biluo_tags_to_spans, iob_to_biluo
 from spacy.training import Corpus, docs_to_json
 from spacy.training.example import Example
 from spacy.training.converters import json_to_docs
-from spacy.training.augment import make_orth_variants_example
+from spacy.training.augment import create_orth_variants_augmenter
 from spacy.lang.en import English
 from spacy.tokens import Doc, DocBin
 from spacy.util import get_words_and_spaces, minibatch
@@ -496,9 +496,8 @@ def test_make_orth_variants(doc):
     output_file = tmpdir / "roundtrip.spacy"
     DocBin(docs=[doc]).to_disk(output_file)
     # due to randomness, test only that this runs with no errors for now
-    reader = Corpus(output_file)
-    train_example = next(reader(nlp))
-    make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
+    reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
+    train_examples = list(reader(nlp))
 
 
 @pytest.mark.skip("Outdated")

View File

@@ -1,6 +1,7 @@
 from .corpus import Corpus  # noqa: F401
 from .example import Example, validate_examples  # noqa: F401
 from .align import Alignment  # noqa: F401
+from .augment import dont_augment, orth_variants_augmenter  # noqa: F401
 from .iob_utils import iob_to_biluo, biluo_to_iob  # noqa: F401
 from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets  # noqa: F401
 from .iob_utils import biluo_tags_to_spans, tags_to_entities  # noqa: F401

View File

@@ -1,30 +1,50 @@
+from typing import Callable
 import random
 import itertools
+import copy
+from functools import partial
+from ..util import registry
 
 
-def make_orth_variants_example(nlp, example, orth_variant_level=0.0):  # TODO: naming
-    raw_text = example.text
-    orig_dict = example.to_dict()
-    variant_text, variant_token_annot = make_orth_variants(
-        nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
-    )
-    doc = nlp.make_doc(variant_text)
-    orig_dict["token_annotation"] = variant_token_annot
-    return example.from_dict(doc, orig_dict)
-
-
-def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
-    if random.random() >= orth_variant_level:
-        return raw_text, orig_token_dict
-    if not orig_token_dict:
-        return raw_text, orig_token_dict
-    raw = raw_text
-    token_dict = orig_token_dict
-    lower = False
-    if random.random() >= 0.5:
-        lower = True
-        if raw is not None:
-            raw = raw.lower()
+@registry.augmenters("spacy.dont_augment.v1")
+def create_null_augmenter():
+    return dont_augment
+
+
+@registry.augmenters("spacy.orth_variants.v1")
+def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
+    """Create a data augmentation callback that uses orth-variant replacement.
+    The callback can be added to a corpus or other data iterator during training.
+    """
+    return partial(orth_variants_augmenter, level=level, lower=lower)
+
+
+def dont_augment(nlp, example):
+    yield example
+
+
+def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
+    if random.random() >= level:
+        yield example
+    else:
+        raw_text = example.text
+        orig_dict = example.to_dict()
+        if not orig_dict["token_annotation"]:
+            yield example
+        else:
+            variant_text, variant_token_annot = make_orth_variants(
+                nlp,
+                raw_text,
+                orig_dict["token_annotation"],
+                lower=raw_text is not None and random.random() < lower,
+            )
+            doc = nlp.make_doc(variant_text)
+            orig_dict["token_annotation"] = variant_token_annot
+            yield example.from_dict(doc, orig_dict)
+
+
+def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
+    orig_token_dict = copy.deepcopy(token_dict)
     orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
     ndsv = orth_variants.get("single", [])
     ndpv = orth_variants.get("paired", [])
@@ -103,7 +123,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
         # something went wrong, abort
         # (add a warning message?)
         if not match_found:
-            return raw_text, orig_token_dict
+            return raw, orig_token_dict
         # add following whitespace
         while raw_idx < len(raw) and raw[raw_idx].isspace():
             variant_raw += raw[raw_idx]
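The registered `spacy.orth_variants.v1` factory is easiest to see in context. Below is a minimal usage sketch (not part of the commit) that mirrors the updated test: it builds the augmenter callback and hands it to a `Corpus`. The `./train.spacy` path and the blank English pipeline are illustrative assumptions.

```python
import spacy
from spacy.training import Corpus
from spacy.training.augment import create_orth_variants_augmenter

nlp = spacy.blank("en")
# "./train.spacy" stands in for a DocBin file created elsewhere.
augmenter = create_orth_variants_augmenter(level=0.1, lower=0.5)
corpus = Corpus("./train.spacy", augmenter=augmenter)
# Each stored Doc becomes an Example; with probability `level` the augmenter
# swaps in orth variants (and lowercases with probability `lower`).
train_examples = list(corpus(nlp))
```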

View File

@@ -1,9 +1,11 @@
 import warnings
 from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
+from typing import Optional
 from pathlib import Path
 import srsly
 
 from .. import util
+from .augment import dont_augment
 from .example import Example
 from ..errors import Warnings
 from ..tokens import DocBin, Doc
@@ -18,9 +20,19 @@ FILE_TYPE = ".spacy"
 
 @util.registry.readers("spacy.Corpus.v1")
 def create_docbin_reader(
-    path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0
+    path: Path,
+    gold_preproc: bool,
+    max_length: int = 0,
+    limit: int = 0,
+    augmenter: Optional[Callable] = None,
 ) -> Callable[["Language"], Iterable[Example]]:
-    return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
+    return Corpus(
+        path,
+        gold_preproc=gold_preproc,
+        max_length=max_length,
+        limit=limit,
+        augmenter=augmenter,
+    )
 
 
 @util.registry.readers("spacy.JsonlReader.v1")
@@ -70,6 +82,8 @@ class Corpus:
         0, which indicates no limit.
     limit (int): Limit corpus to a subset of examples, e.g. for debugging.
         Defaults to 0, which indicates no limit.
+    augmenter (Callable[Example, Iterable[Example]]): Optional data augmentation
+        function, to extrapolate additional examples from your annotations.
 
     DOCS: https://nightly.spacy.io/api/corpus
     """
@@ -81,11 +95,13 @@ class Corpus:
         limit: int = 0,
         gold_preproc: bool = False,
         max_length: int = 0,
+        augmenter: Optional[Callable] = None,
     ) -> None:
         self.path = util.ensure_path(path)
         self.gold_preproc = gold_preproc
         self.max_length = max_length
         self.limit = limit
+        self.augmenter = augmenter if augmenter is not None else dont_augment
 
     def __call__(self, nlp: "Language") -> Iterator[Example]:
         """Yield examples from the data.
@@ -100,7 +116,9 @@ class Corpus:
             examples = self.make_examples_gold_preproc(nlp, ref_docs)
         else:
             examples = self.make_examples(nlp, ref_docs)
-        yield from examples
+        for real_eg in examples:
+            for augmented_eg in self.augmenter(nlp, real_eg):
+                yield augmented_eg
 
     def _make_example(
         self, nlp: "Language", reference: Doc, gold_preproc: bool
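The augmentation hook in `Corpus.__call__` above only assumes that `self.augmenter` is a callable taking `(nlp, example)` and yielding zero or more `Example` objects, with `dont_augment` as the pass-through default. As a hedged illustration of that contract (the function name and filtering rule are made up, not from this commit), a custom callback can be passed straight to `Corpus`:

```python
from spacy.training import Corpus

def skip_short_docs(nlp, example):
    # Toy augmenter: drop gold docs with fewer than 3 tokens and pass
    # everything else through unchanged. An augmenter may yield 0..n examples.
    if len(example.reference) >= 3:
        yield example

# "./train.spacy" is a placeholder path to serialized training docs.
corpus = Corpus("./train.spacy", augmenter=skip_short_docs)
```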

View File

@@ -83,6 +83,7 @@ class registry(thinc.registry):
     callbacks = catalogue.create("spacy", "callbacks")
     batchers = catalogue.create("spacy", "batchers", entry_points=True)
     readers = catalogue.create("spacy", "readers", entry_points=True)
+    augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
     loggers = catalogue.create("spacy", "loggers", entry_points=True)
     # These are factories registered via third-party packages and the
     # spacy_factories entry point. This registry only exists so we can easily
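With the new `augmenters` registry in place, project code can expose its own factory and reference it from a training config the same way `spacy.orth_variants.v1` is registered in `augment.py` above. A small sketch under assumed names (the registry entry and function below are hypothetical):

```python
from spacy.util import registry

@registry.augmenters("my_project.noop_augmenter.v1")  # hypothetical name
def create_noop_augmenter():
    def augment(nlp, example):
        # Replace this pass-through with real augmentation logic.
        yield example
    return augment
```

In a config this would then be usable as `augmenter = {"@augmenters": "my_project.noop_augmenter.v1"}` under `[corpora.train]`.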

View File

@@ -143,11 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
 
 Construct an embedding layer that separately embeds a number of lexical
 attributes using hash embedding, concatenates the results, and passes it through
-a feed-forward subnetwork to build a mixed representations. The features used
-are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
-definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
-pretrained static vectors can also be incorporated into the concatenated
-representation.
+a feed-forward subnetwork to build mixed representations. The features used are
+the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
+depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
+static vectors can also be incorporated into the concatenated representation.
 
 | Name | Description |
 | ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@@ -170,7 +169,7 @@ representation.
 > nC = 8
 > ```
 
-Construct an embedded representations based on character embeddings, using a
+Construct an embedded representation based on character embeddings, using a
 feed-forward network. A fixed number of UTF-8 byte characters are used for each
 word, taken from the beginning and end of the word equally. Padding is used in
 the center for words that are too short.
@@ -392,7 +391,7 @@ a single token vector given zero or more wordpiece vectors.
 > ```
 
 Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
-**not** allow multiple components to share the transformer weights, and does
+**not** allow multiple components to share the transformer weights and does
 **not** allow the transformer to set annotations into the [`Doc`](/api/doc)
 object, but it's a **simpler solution** if you only need the transformer within
 one component.
@@ -437,7 +436,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
 helpful for background information. The neural network state prediction model
 consists of either two or three subnetworks:
 
-- **tok2vec**: Map each token into a vector representations. This subnetwork is
+- **tok2vec**: Map each token into a vector representation. This subnetwork is
   run once for each batch.
 - **lower**: Construct a feature-specific vector for each `(token, feature)`
   pair. This is also run once for each batch. Constructing the state
@@ -575,14 +574,14 @@ architecture is usually less accurate than the ensemble, but runs faster.
 > nO = null
 > ```
 
-An ngram "bag-of-words" model. This architecture should run much faster than the
-others, but may not be as accurate, especially if texts are short.
+An n-gram "bag-of-words" model. This architecture should run much faster than
+the others, but may not be as accurate, especially if texts are short.
 
 | Name | Description |
 | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
-| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
-| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
+| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
+| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
 | `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
 | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
@@ -596,7 +595,7 @@ into the "real world". This requires 3 main components:
   synonyms and prior probabilities.
 - A candidate generation step to produce a set of likely identifiers, given a
   certain textual mention.
-- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
+- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
   most plausible ID from the set of candidates.
 
 ### spacy.EntityLinker.v1 {#EntityLinker}

View File

@@ -71,7 +71,7 @@ pattern_dicts = [
 
 ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
 
-Apply the attribute ruler to a Doc, setting token attributes for tokens matched
+Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
 by the provided patterns.
 
 | Name | Description |
@@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument.
 | Name | Description |
 | ---------- | -------------------------------------------------------------- |
 | `vocab` | The shared [`Vocab`](/api/vocab). |
-| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
+| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
 | `attrs` | The attributes to set. You usually don't want to exclude this. |
 | `indices` | The token indices. You usually don't want to exclude this. |

View File

@@ -81,7 +81,7 @@ $ python -m spacy info [model] [--markdown] [--silent]
 Find all trained pipeline packages installed in the current environment and
 check whether they are compatible with the currently installed version of spaCy.
 Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
-all installed packages are can be used with the new version. It will show a list
+all installed packages can be used with the new version. It will show a list
 of packages and their installed versions. If any package is out of date, the
 latest compatible versions and command for updating are shown.
@@ -408,7 +408,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
 
 ### debug data {#debug-data tag="command"}
 
-Analyze, debug, and validate your training and development data. Get useful
+Analyze, debug and validate your training and development data. Get useful
 stats, and find problems like invalid entity annotations, cyclic dependencies,
 low data labels and more.

View File

@@ -74,6 +74,7 @@ train/test skew.
 | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
 | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
 | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
+| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
 
 ## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@@ -274,7 +274,7 @@ Typically, the extension for these binary files is `.spacy`, and they are used
 as input format for specifying a [training corpus](/api/corpus) and for spaCy's
 CLI [`train`](/api/cli#train) command. The built-in
 [`convert`](/api/cli#convert) command helps you convert spaCy's previous
-[JSON format](#json-input) to the new binary format format. It also supports
+[JSON format](#json-input) to the new binary format. It also supports
 conversion of the `.conllu` format used by the
 [Universal Dependencies corpora](https://github.com/UniversalDependencies).
@@ -338,7 +338,7 @@ $ python -m spacy convert ./data.json ./output.spacy
 
 <Accordion title="Sample JSON data" spaced>
 
-Here's an example of dependencies, part-of-speech tags and names entities, taken
+Here's an example of dependencies, part-of-speech tags and named entities, taken
 from the English Wall Street Journal portion of the Penn Treebank:
 
 ```json

View File

@@ -21,8 +21,7 @@ non-projective parses.
 The parser is trained using an **imitation learning objective**. It follows the
 actions predicted by the current weights, and at each state, determines which
 actions are compatible with the optimal parse that could be reached from the
-current state. The weights such that the scores assigned to the set of optimal
-actions is increased, while scores assigned to other actions are decreased. Note
+current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
 that more than one action may be optimal for a given state.
 
 ## Config and implementation {#config}

View File

@@ -503,8 +503,7 @@ invalidated, although they may accidentally continue to work.
 Mark a span for merging. The `attrs` will be applied to the resulting token (if
 they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
 underlying lexeme (if they're context-independent lexical attributes like
-`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
-dictionary mapping attribute names to values as the `"_"` key.
+`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
 
 > #### Example
 >

View File

@@ -94,7 +94,7 @@ providing custom registered functions.
 
 ## EntityLinker.\_\_call\_\_ {#call tag="method"}
 
-Apply the pipe to one document. The document is modified in place, and returned.
+Apply the pipe to one document. The document is modified in place and returned.
 This usually happens under the hood when the `nlp` object is called on a text
 and all pipeline components are applied to the `Doc` in order. Both
 [`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)

View File

@@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
 
 | Setting | Description |
 | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]] |
+| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
 | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
 | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
@@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and
 
 ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
 
-Apply the pipe to one document. The document is modified in place, and returned.
+Apply the pipe to one document. The document is modified in place and returned.
 This usually happens under the hood when the `nlp` object is called on a text
 and all pipeline components are applied to the `Doc` in order. Both
 [`__call__`](/api/entityrecognizer#call) and

View File

@@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler.
 | Name | Description |
 | ----------------- | --------------------------------------------------------------------------------------------------------------------- |
 | `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
-| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ |
+| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
 | `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
 | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

View File

@@ -33,8 +33,8 @@ both documents.
 | Name | Description |
 | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
-| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ |
+| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
+| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ |
 | _keyword-only_ | |
 | `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
@@ -58,8 +58,8 @@ see the [training format documentation](/api/data-formats#dict-input).
 | Name | Description |
 | -------------- | ------------------------------------------------------------------------- |
-| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
-| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ |
+| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
+| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ |
 | **RETURNS** | The newly constructed object. ~~Example~~ |
 
 ## Example.text {#text tag="property"}

View File

@@ -46,10 +46,11 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
 ## Language.from_config {#from_config tag="classmethod" new="3"}
 
 Create a `Language` object from a loaded config. Will set up the tokenizer and
-language data, add pipeline components based on the pipeline and components
-define in the config and validate the results. If no config is provided, the
-default config of the given language is used. This is also how spaCy loads a
-model under the hood based on its [`config.cfg`](/api/data-formats#config).
+language data, add pipeline components based on the pipeline and add pipeline
+components based on the definitions specified in the config. If no config is
+provided, the default config of the given language is used. This is also how
+spaCy loads a model under the hood based on its
+[`config.cfg`](/api/data-formats#config).
 
 > #### Example
 >
@@ -107,7 +108,7 @@ decorator. For more details and examples, see the
 | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
-| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
+| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
 
 ## Language.factory {#factory tag="classmethod"}
@@ -154,7 +155,7 @@ examples, see the
 | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
 | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
 | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
-| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
+| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
 
 ## Language.\_\_call\_\_ {#call tag="method"}
@@ -609,7 +610,7 @@ does nothing.
 
 ## Language.enable_pipe {#enable_pipe tag="method" new="3"}
 
-Enable a previously disable component (e.g. via
+Enable a previously disabled component (e.g. via
 [`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of
 the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
 already enabled, this method does nothing.
@@ -636,7 +637,7 @@ pipeline will be restored to the initial state at the end of the block.
 Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
 you can use to undo your changes. You can specify either `disable` (as a list or
 string), or `enable`. In the latter case, all components not in the `enable`
-list, will be disabled. Under the hood, this method calls into
+list will be disabled. Under the hood, this method calls into
 [`disable_pipe`](/api/language#disable_pipe) and
 [`enable_pipe`](/api/language#enable_pipe).
@@ -669,7 +670,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
 | -------------- | ------------------------------------------------------------------------------------------------------ |
 | _keyword-only_ | |
 | `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
-| `enable` | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
+| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
 | **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
 
 ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
@@ -881,10 +882,10 @@ Loads state from a directory, including all data that was saved with the
 
 <Infobox variant="warning" title="Important note">
 
-Keep in mind that this method **only loads serialized state** and doesn't set up
-the `nlp` object. This means that it requires the correct language class to be
-initialized and all pipeline components to be added to the pipeline. If you want
-to load a serialized pipeline from a directory, you should use
+Keep in mind that this method **only loads the serialized state** and doesn't
+set up the `nlp` object. This means that it requires the correct language class
+to be initialized and all pipeline components to be added to the pipeline. If
+you want to load a serialized pipeline from a directory, you should use
 [`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
 
 </Infobox>

View File

@@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes
 how the component should be configured. You can override its settings via the
 `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
 [`config.cfg` for training](/usage/training#config). For examples of the lookups
-data formats used by the lookup and rule-based lemmatizers, see
+data format used by the lookup and rule-based lemmatizers, see
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
 
 > #### Example

View File

@@ -61,7 +61,7 @@ matched:
 | `!` | Negate the pattern, by requiring it to match exactly 0 times. |
 | `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
 | `+` | Require the pattern to match 1 or more times. |
-| `*` | Allow the pattern to match zero or more times. |
+| `*` | Allow the pattern to match 0 or more times. |
 
 Token patterns can also map to a **dictionary of properties** instead of a
 single value to indicate whether the expected value is a member of a list or how

View File

@@ -12,7 +12,7 @@ container storing a single morphological analysis.
 
 ## Morphology.\_\_init\_\_ {#init tag="method"}
 
-Create a Morphology object.
+Create a `Morphology` object.
 
 > #### Example
 >
@@ -101,7 +101,7 @@ representation.
 | Name | Description |
 | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
-| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
+| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
 
 ## Attributes {#attributes}

View File

@@ -200,7 +200,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
 
 ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model, to try to address
+current model to make predictions similar to an initial model to try to address
 the "catastrophic forgetting" problem. This feature is experimental.
 
 > #### Example

View File

@@ -8,7 +8,7 @@ api_string_name: sentencizer
 api_trainable: false
 ---
 
-A simple pipeline component, to allow custom sentence boundary detection logic
+A simple pipeline component to allow custom sentence boundary detection logic
 that doesn't require the dependency parse. By default, sentence segmentation is
 performed by the [`DependencyParser`](/api/dependencyparser), so the
 `Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
@@ -130,7 +130,7 @@ Score a batch of examples.
 
 ## Sentencizer.to_disk {#to_disk tag="method"}
 
-Save the sentencizer settings (punctuation characters) a directory. Will create
+Save the sentencizer settings (punctuation characters) to a directory. Will create
 a file `sentencizer.json`. This also happens automatically when you save an
 `nlp` object with a sentencizer added to its pipeline.

View File

@@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object.
 
 ## Span.\_\_init\_\_ {#init tag="method"}
 
-Create a Span object from the slice `doc[start : end]`.
+Create a `Span` object from the slice `doc[start : end]`.
 
 > #### Example
 >
@@ -187,7 +187,7 @@ the character indices don't map to a valid span.
 | Name | Description |
 | ------------------------------------ | ----------------------------------------------------------------------------------------- |
 | `start` | The index of the first character of the span. ~~int~~ |
-| `end` | The index of the last character after the span. ~int~~ |
+| `end` | The index of the last character after the span. ~~int~~ |
 | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
 | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
 | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |

View File

@@ -157,7 +157,7 @@ This method was previously called `begin_training`.
 
 ## TextCategorizer.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -174,7 +174,7 @@ modifying them.
 
 ## TextCategorizer.set_annotations {#set_annotations tag="method"}
 
-Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
+Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
 
 > #### Example
 >
@@ -217,7 +217,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
 
 ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
 
 Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
-current model to make predictions similar to an initial model, to try to address
+current model to make predictions similar to an initial model to try to address
 the "catastrophic forgetting" problem. This feature is experimental.
 
 > #### Example
@@ -290,7 +290,7 @@ Create an optimizer for the pipeline component.
 
 ## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values.
+Modify the pipe's model to use the given parameter values.
 
 > #### Example
 >

View File

@@ -150,7 +150,7 @@ by [`Language.initialize`](/api/language#initialize).
 
 ## Tok2Vec.predict {#predict tag="method"}
 
-Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
+Apply the component's model to a batch of [`Doc`](/api/doc) objects without
 modifying them.
 
 > #### Example
@@ -223,7 +223,7 @@ Create an optimizer for the pipeline component.
 
 ## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
 
-Modify the pipe's model, to use the given parameter values. At the end of the
+Modify the pipe's model to use the given parameter values. At the end of the
 context, the original parameters are restored.
 
 > #### Example

View File

@@ -243,7 +243,7 @@ A sequence of the token's immediate syntactic children.
 
 ## Token.lefts {#lefts tag="property" model="parser"}
 
-The leftward immediate children of the word, in the syntactic dependency parse.
+The leftward immediate children of the word in the syntactic dependency parse.
 
 > #### Example
 >
@@ -259,7 +259,7 @@ The leftward immediate children of the word, in the syntactic dependency parse.
 
 ## Token.rights {#rights tag="property" model="parser"}
 
-The rightward immediate children of the word, in the syntactic dependency parse.
+The rightward immediate children of the word in the syntactic dependency parse.
 
 > #### Example
 >
@@ -275,7 +275,7 @@ The rightward immediate children of the word, in the syntactic dependency parse.
 
 ## Token.n_lefts {#n_lefts tag="property" model="parser"}
 
-The number of leftward immediate children of the word, in the syntactic
+The number of leftward immediate children of the word in the syntactic
 dependency parse.
 
 > #### Example
@@ -291,7 +291,7 @@ dependency parse.
 
 ## Token.n_rights {#n_rights tag="property" model="parser"}
 
-The number of rightward immediate children of the word, in the syntactic
+The number of rightward immediate children of the word in the syntactic
 dependency parse.
 
 > #### Example
@@ -422,8 +422,8 @@ The L2 norm of the token's vector representation.
 | `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
 | `lower` | Lowercase form of the token. ~~int~~ |
 | `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
-| `shape` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
-| `shape_` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
+| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
+| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
 | `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
 | `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
 | `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
@@ -451,7 +451,7 @@ The L2 norm of the token's vector representation.
 | `tag` | Fine-grained part-of-speech. ~~int~~ |
 | `tag_` | Fine-grained part-of-speech. ~~str~~ |
 | `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
-| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ |
+| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
 | `dep` | Syntactic dependency relation. ~~int~~ |
 | `dep_` | Syntactic dependency relation. ~~str~~ |
 | `lang` | Language of the parent document's vocabulary. ~~int~~ |

View File

@@ -1,6 +1,6 @@
 ---
 title: Tokenizer
-teaser: Segment text into words, punctuations marks etc.
+teaser: Segment text into words, punctuations marks, etc.
 tag: class
 source: spacy/tokenizer.pyx
 ---
@@ -15,14 +15,14 @@ source: spacy/tokenizer.pyx
 Segment text, and create `Doc` objects with the discovered segment boundaries.
 For a deeper understanding, see the docs on
 [how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
-The tokenizer is typically created automatically when the a
+The tokenizer is typically created automatically when a
 [`Language`](/api/language) subclass is initialized and it reads its settings
 like punctuation and special case rules from the
 [`Language.Defaults`](/api/language#defaults) provided by the language subclass.
 
 ## Tokenizer.\_\_init\_\_ {#init tag="method"}
 
-Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples
+Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
 of how to construct a custom tokenizer with different tokenization rules, see
 the
 [usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
@@ -87,7 +87,7 @@ Tokenize a stream of texts.
 | ------------ | ------------------------------------------------------------------------------------ |
 | `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
 | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
-| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ |
+| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ |
## Tokenizer.find_infix {#find_infix tag="method"} ## Tokenizer.find_infix {#find_infix tag="method"}

View File

@ -198,7 +198,7 @@ browser. Will run a simple web server.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ | | `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ | | `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
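
A minimal sketch of serving a visualization on a non-default port and host, assuming the `en_core_web_sm` package is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence about Google.")
# Starts a blocking web server at http://localhost:5001
displacy.serve(doc, style="ent", port=5001, host="localhost")
```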
@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ | | `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ | | `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | The rendered HTML markup. ~~str~~ | | **RETURNS** | The rendered HTML markup. ~~str~~ |
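
With `manual=True`, pre-computed data can be rendered without a `Doc`. A sketch with hand-written entity offsets:

```python
from spacy import displacy

ex = {
    "text": "But Google is starting from behind.",
    "ents": [{"start": 4, "end": 10, "label": "ORG"}],
    "title": None,
}
html = displacy.render(ex, style="ent", manual=True, page=True, minify=True)
```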
@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
| Name | Description | | Name | Description |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | | `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~ | | `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | | `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
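
Combining a few of these settings might look like the following (the `doc` is assumed to come from a loaded pipeline):

```python
options = {
    "fine_grained": True,    # use Token.tag_ instead of Token.pos_
    "add_lemma": True,       # extra row with lemmas below the token texts
    "collapse_punct": False,
    "compact": True,
}
html = displacy.render(doc, style="dep", options=options)
```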
@ -623,7 +623,7 @@ sequences in the batch.
Encode labelled spans into per-token tags, using the Encode labelled spans into per-token tags, using the
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit, [BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
Out). Returns a list of strings, describing the tags. Each tag string will be of Out). Returns a list of strings, describing the tags. Each tag string will be in
the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of
`"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets `"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets
don't align with the tokenization in the `Doc` object. The training algorithm don't align with the tokenization in the `Doc` object. The training algorithm
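
A rough sketch, assuming the helper is exposed as `offsets_to_biluo_tags` (older builds provide the same behavior as `biluo_tags_from_offsets`):

```python
from spacy.lang.en import English
from spacy.training import offsets_to_biluo_tags  # biluo_tags_from_offsets in older versions

nlp = English()
doc = nlp("I like London.")
entities = [(7, 13, "LOC")]
tags = offsets_to_biluo_tags(doc, entities)
# ['O', 'O', 'U-LOC', 'O']
```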
@ -747,7 +747,7 @@ decorator.
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
Check whether a `Language` subclass is already loaded. `Language` subclasses are Check whether a `Language` subclass is already loaded. `Language` subclasses are
loaded lazily, to avoid expensive setup code associated with the language data. loaded lazily to avoid expensive setup code associated with the language data.
> #### Example > #### Example
> >
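
For instance (the second assertion assumes German data hasn't already been loaded in the session):

```python
from spacy import util

lang_cls = util.get_lang_class("en")
assert util.lang_class_is_loaded("en") is True
assert util.lang_class_is_loaded("de") is False  # assumes "de" hasn't been loaded yet
```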
@ -935,7 +935,7 @@ Compile a sequence of prefix rules into a regex object.
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | | `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ | | **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
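
Something along these lines rebuilds the default prefix rules into a regex and plugs it back into the tokenizer:

```python
from spacy.lang.en import English
from spacy.util import compile_prefix_regex

nlp = English()
prefix_regex = compile_prefix_regex(nlp.Defaults.prefixes)
nlp.tokenizer.prefix_search = prefix_regex.search
```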
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -952,7 +952,7 @@ Compile a sequence of suffix rules into a regex object.
| Name | Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | | `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ | | **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"} ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -969,7 +969,7 @@ Compile a sequence of infix rules into a regex object.
| Name | Description | | Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ | | `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ | | **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
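
The infix case is analogous, except that the compiled regex is used via `finditer` rather than `search`:

```python
from spacy.lang.en import English
from spacy.util import compile_infix_regex

nlp = English()
infix_regex = compile_infix_regex(nlp.Defaults.infixes)
nlp.tokenizer.infix_finditer = infix_regex.finditer
```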
### util.minibatch {#util.minibatch tag="function" new="2"} ### util.minibatch {#util.minibatch tag="function" new="2"}

View File

@ -185,7 +185,7 @@ by [`Language.initialize`](/api/language#initialize).
## Transformer.predict {#predict tag="method"} ## Transformer.predict {#predict tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them. modifying them.
> #### Example > #### Example
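
A minimal sketch, assuming `spacy-transformers` and a transformer pipeline such as `en_core_web_trf` are installed:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
transformer = nlp.get_pipe("transformer")
docs = [nlp.make_doc("Some text"), nlp.make_doc("Other text")]
scores = transformer.predict(docs)   # the docs themselves are left unmodified
```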
@ -202,7 +202,7 @@ modifying them.
## Transformer.set_annotations {#set_annotations tag="method"} ## Transformer.set_annotations {#set_annotations tag="method"}
Assign the extracted features to the Doc objects. By default, the Assign the extracted features to the `Doc` objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the [`TransformerData`](/api/transformer#transformerdata) object is written to the
[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations` [`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
callback is then called, if provided. callback is then called, if provided.
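
The typical pattern pairs `predict` with `set_annotations`, roughly (again assuming an installed transformer pipeline):

```python
import spacy

nlp = spacy.load("en_core_web_trf")
transformer = nlp.get_pipe("transformer")
docs = [nlp.make_doc("Some text"), nlp.make_doc("Other text")]
scores = transformer.predict(docs)
transformer.set_annotations(docs, scores)  # writes to Doc._.trf_data by default
```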
@ -271,7 +271,7 @@ Create an optimizer for the pipeline component.
## Transformer.use_params {#use_params tag="method, contextmanager"} ## Transformer.use_params {#use_params tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the Modify the pipe's model to use the given parameter values. At the end of the
context, the original parameters are restored. context, the original parameters are restored.
> #### Example > #### Example
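
A sketch of the usual pattern, assuming an `nlp` pipeline with a transformer component and an `optimizer` produced during training:

```python
transformer = nlp.get_pipe("transformer")
with transformer.use_params(optimizer.averages):
    transformer.to_disk("/best_model")
# the original parameter values are restored when the context exits
```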
@ -387,8 +387,8 @@ by this class. Instances of this class are typically assigned to the
| Name | Description | | Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ | | `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ | | `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | The width of the last hidden layer. ~~int~~ | | `width` | The width of the last hidden layer. ~~int~~ |
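
In a loaded transformer pipeline (`en_core_web_trf` is used here only as an example), these fields can be inspected via the default `Doc._.trf_data` attribute; the exact keys available in `tokens` depend on the underlying tokenizer:

```python
import spacy

nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is looking at buying a U.K. startup.")
trf_data = doc._.trf_data
print(trf_data.tokens["input_ids"])   # wordpiece IDs (field names vary by tokenizer)
print(trf_data.tensors[-1].shape)     # final hidden state
print(trf_data.align[0].dataXd)       # wordpiece indices aligned to the first token
print(trf_data.width)
```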
@ -408,7 +408,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
| Name | Description | | Name | Description |
| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ | | `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ | | `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ | | `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ | | `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
@ -438,10 +438,10 @@ Split a `TransformerData` object that represents a batch into a list with one
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a list of [`Span`](/api/span) objects for each doc, to be processed by return a list of [`Span`](/api/span) objects for each doc to be processed by
the transformer. This is used to manage long documents, by cutting them into the transformer. This is used to manage long documents by cutting them into
smaller sequences before running the transformer. The spans are allowed to smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not relevant. overlap, and you can also omit sections of the `Doc` if they are not relevant.
Span getters can be referenced in the `[components.transformer.model.get_spans]` Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You block of the config to customize the sequences processed by the transformer. You
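
A sketch of a custom span getter that hands the transformer one sentence at a time (registered under a made-up name):

```python
import spacy

@spacy.registry.span_getters("custom_sent_spans")
def configure_custom_sent_spans():
    def get_sent_spans(docs):
        # one list of sentence spans per Doc in the batch
        return [list(doc.sents) for doc in docs]
    return get_sent_spans
```

The registered name can then be referenced from the `[components.transformer.model.get_spans]` block of the config.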

View File

@ -290,7 +290,7 @@ If a table is full, it can be resized using
## Vectors.n_keys {#n_keys tag="property"} ## Vectors.n_keys {#n_keys tag="property"}
Get the number of keys in the table. Note that this is the number of _all_ keys, Get the number of keys in the table. Note that this is the number of _all_ keys,
not just unique vectors. If several keys are mapped are mapped to the same not just unique vectors. If several keys are mapped to the same
vectors, they will be counted individually. vectors, they will be counted individually.
> #### Example > #### Example
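
For instance, a freshly created table has rows but no keys mapped to them yet:

```python
from spacy.vectors import Vectors

vectors = Vectors(shape=(10, 300))
assert len(vectors) == 10     # number of rows in the table
assert vectors.n_keys == 0    # no keys mapped yet
```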
@ -307,10 +307,10 @@ vectors, they will be counted individually.
## Vectors.most_similar {#most_similar tag="method"} ## Vectors.most_similar {#most_similar tag="method"}
For each of the given vectors, find the `n` most similar entries to it, by For each of the given vectors, find the `n` most similar entries to it by
cosine. Queries are by vector. Results are returned as a cosine. Queries are by vector. Results are returned as a
`(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are `(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are
performed in chunks, to avoid consuming too much memory. You can set the performed in chunks to avoid consuming too much memory. You can set the
`batch_size` to control the size/space trade-off during the calculations. `batch_size` to control the size/space trade-off during the calculations.
> #### Example > #### Example
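
A rough sketch, assuming `nlp.vocab.vectors` holds 300-dimensional vectors:

```python
import numpy

queries = numpy.asarray([numpy.random.uniform(-1, 1, (300,))], dtype="f")
keys, best_rows, scores = nlp.vocab.vectors.most_similar(queries, n=10, batch_size=1024)
```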

View File

@ -29,7 +29,7 @@ Create the vocabulary.
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ | | `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ | | `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
## Vocab.\_\_len\_\_ {#len tag="method"} ## Vocab.\_\_len\_\_ {#len tag="method"}
@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words
would then be remapped to the closest remaining vector so "feline" would have would then be remapped to the closest remaining vector so "feline" would have
the same vector as "cat", and "reclined" would have the same vector as "sat". the same vector as "cat", and "reclined" would have the same vector as "sat".
The similarities are judged by cosine. The original vectors may be large, so the The similarities are judged by cosine. The original vectors may be large, so the
cosines are calculated in minibatches, to reduce memory usage. cosines are calculated in minibatches to reduce memory usage.
> #### Example > #### Example
> >
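
A sketch, assuming a pipeline with a vectors table (e.g. a `*_md` package) is loaded as `nlp`:

```python
remapped = nlp.vocab.prune_vectors(10000)
assert len(nlp.vocab.vectors) <= 10000
# remapped maps each removed word to its (surviving word, similarity) pair
```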
@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage.
Retrieve a vector for a word in the vocabulary. Words can be looked up by string Retrieve a vector for a word in the vocabulary. Words can be looked up by string
or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn` or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`). subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`).
> #### Example > #### Example
> >
@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
| Name | Description | | Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | | `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ | | `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
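
For example, assuming a pipeline with vectors is loaded as `nlp` (otherwise a `ValueError` is raised):

```python
apple_vector = nlp.vocab.get_vector("apple")
# FastText-style subword approximation from character n-grams (spaCy v2.1+)
subword_vector = nlp.vocab.get_vector("apple", minn=1, maxn=5)
```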
## Vocab.set_vector {#set_vector tag="method" new="2"} ## Vocab.set_vector {#set_vector tag="method" new="2"}
Set a vector for a word in the vocabulary. Words can be referenced by by string Set a vector for a word in the vocabulary. Words can be referenced by string
or hash value. or hash value.
> #### Example > #### Example
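
For instance, a sketch that assigns a random 300-dimensional vector (purely illustrative) to a word in an existing `nlp` pipeline:

```python
import numpy

nlp.vocab.set_vector("cat", numpy.random.uniform(-1, 1, (300,)))
```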

View File

@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_.
If you have a list of terms with no context around them, a transformer model If you have a list of terms with no context around them, a transformer model
like BERT can't really help you. BERT is designed to understand language **in like BERT can't really help you. BERT is designed to understand language **in
context**, which isn't what you have. A word vectors table will be a much better context**, which isn't what you have. A word vectors table will be a much better
fit for your task. However, if you do have words in context (whole sentences or fit for your task. However, if you do have words in context (whole sentences or
paragraphs of running text), word vectors will only provide a very rough paragraphs of running text), word vectors will only provide a very rough
approximation of what the text is about. approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a Word vectors are also very computationally efficient, as they map a word to a
@ -256,7 +256,7 @@ for doc in nlp.pipe(["some text", "some other text"]):
``` ```
You can also customize how the [`Transformer`](/api/transformer) component sets You can also customize how the [`Transformer`](/api/transformer) component sets
annotations onto the [`Doc`](/api/doc), by specifying a custom annotations onto the [`Doc`](/api/doc) by specifying a custom
`set_extra_annotations` function. This callback will be called with the raw `set_extra_annotations` function. This callback will be called with the raw
input and output data for the whole batch, along with the batch of `Doc` input and output data for the whole batch, along with the batch of `Doc`
objects, allowing you to implement whatever you need. The annotation setter is objects, allowing you to implement whatever you need. The annotation setter is
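
A sketch of such a callback, writing the per-`Doc` data to a made-up custom attribute instead of the default one (assumes a transformer pipeline such as `en_core_web_trf` is installed):

```python
import spacy
from spacy.tokens import Doc

Doc.set_extension("custom_trf_output", default=None)

def custom_annotation_setter(docs, trf_data):
    # trf_data is the FullTransformerBatch for the whole batch
    doc_data = list(trf_data.doc_data)
    for doc, data in zip(docs, doc_data):
        doc._.custom_trf_output = data

nlp = spacy.load("en_core_web_trf")
nlp.get_pipe("transformer").set_extra_annotations = custom_annotation_setter
doc = nlp("This is a text")
print(doc._.custom_trf_output)
```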
@ -675,7 +675,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
subnetwork** within one of your components, and add additional layers to build a subnetwork** within one of your components, and add additional layers to build a
network for a temporary task, that forces the model to learn something about network for a temporary task that forces the model to learn something about
sentence structure and word cooccurrence statistics. Pretraining produces a sentence structure and word cooccurrence statistics. Pretraining produces a
**binary weights file** that can be loaded back in at the start of training. The **binary weights file** that can be loaded back in at the start of training. The
weights file specifies an initial set of weights. Training then proceeds as weights file specifies an initial set of weights. Training then proceeds as

View File

@ -6,6 +6,7 @@ menu:
- ['Introduction', 'basics'] - ['Introduction', 'basics']
- ['Quickstart', 'quickstart'] - ['Quickstart', 'quickstart']
- ['Config System', 'config'] - ['Config System', 'config']
<!-- - ['Data Utilities', 'data'] -->
- ['Custom Functions', 'custom-functions'] - ['Custom Functions', 'custom-functions']
- ['Parallel Training', 'parallel-training'] - ['Parallel Training', 'parallel-training']
- ['Internal API', 'api'] - ['Internal API', 'api']
@ -505,6 +506,16 @@ still look good.
</Accordion> </Accordion>
<!--
## Data Utilities {#data-utilities}
* spacy convert
* The [corpora] block
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Custom Functions {#custom-functions} ## Custom Functions {#custom-functions}
Registered functions in the training config files can refer to built-in Registered functions in the training config files can refer to built-in
@ -689,7 +700,7 @@ from pathlib import Path
@spacy.registry.loggers("my_custom_logger.v1") @spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path): def custom_logger(log_path):
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]: def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
with Path(log_path).open("w") as file_: with Path(log_path).open("w", encoding="utf8") as file_:
file_.write("step\\t") file_.write("step\\t")
file_.write("score\\t") file_.write("score\\t")
for pipe in nlp.pipe_names: for pipe in nlp.pipe_names:

View File

@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None):
data_str = f"export const DATA = {data}" data_str = f"export const DATA = {data}"
result = compiler.get_output() result = compiler.get_output()
if output is not None: if output is not None:
with output.open("w") as f: with output.open("w", encoding="utf8") as f:
f.write(f"{header}\n{result}\n{data_str}") f.write(f"{header}\n{result}\n{data_str}")
print(f"Updated {output.parts[-1]}") print(f"Updated {output.parts[-1]}")
else: else: