Update docs, types and API consistency

Ines Montani 2020-08-17 16:45:24 +02:00
parent 61dfdd9fbd
commit 3ae5e02f4f
64 changed files with 3678 additions and 3514 deletions


@@ -70,7 +70,7 @@ def evaluate(
  corpus = Corpus(data_path, gold_preproc=gold_preproc)
  nlp = util.load_model(model)
  dev_dataset = list(corpus(nlp))
- scores = nlp.evaluate(dev_dataset, verbose=False)
+ scores = nlp.evaluate(dev_dataset)
  metrics = {
  "TOK": "token_acc",
  "TAG": "tag_acc",


@@ -18,7 +18,7 @@ RENDER_WRAPPER = None
  def render(
- docs: Union[Iterable[Doc], Doc],
+ docs: Union[Iterable[Union[Doc, Span]], Doc, Span],
  style: str = "dep",
  page: bool = False,
  minify: bool = False,
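The widened `docs` type above means individual `Span` objects (e.g. sentences) can now be passed straight to the visualizer. A minimal sketch of what that enables, assuming the `en_core_web_sm` pipeline is installed:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumption: this trained pipeline is available
doc = nlp("This is a sentence. This is another sentence.")

# Render each sentence span separately instead of the whole Doc
html = displacy.render(list(doc.sents), style="dep", page=True)
```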


@@ -439,8 +439,6 @@ class Language:
  assigns: Iterable[str] = tuple(),
  requires: Iterable[str] = tuple(),
  retokenizes: bool = False,
- scores: Iterable[str] = tuple(),
- default_score_weights: Dict[str, float] = SimpleFrozenDict(),
  func: Optional[Callable[[Doc], Doc]] = None,
  ) -> Callable:
  """Register a new pipeline component. Can be used for stateless function

@@ -456,12 +454,6 @@
  e.g. "token.ent_id". Used for pipeline analyis.
  retokenizes (bool): Whether the component changes the tokenization.
  Used for pipeline analysis.
- scores (Iterable[str]): All scores set by the component if it's trainable,
- e.g. ["ents_f", "ents_r", "ents_p"].
- default_score_weights (Dict[str, float]): The scores to report during
- training, and their default weight towards the final score used to
- select the best model. Weights should sum to 1.0 per component and
- will be combined and normalized for the whole pipeline.
  func (Optional[Callable]): Factory function if not used as a decorator.
  DOCS: https://spacy.io/api/language#component

@@ -482,8 +474,6 @@
  assigns=assigns,
  requires=requires,
  retokenizes=retokenizes,
- scores=scores,
- default_score_weights=default_score_weights,
  func=factory_func,
  )
  return component_func
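With `scores` and `default_score_weights` dropped from `Language.component`, score settings belong to factories only. A rough sketch of the two registration paths, mirroring the signatures above and the test updated elsewhere in this commit (the component names here are made up):

```python
from spacy.language import Language
from spacy.tokens import Doc

# Stateless function component: analysis metadata only, no score settings
@Language.component("my_marker", retokenizes=False)
def my_marker(doc: Doc) -> Doc:
    return doc

# Factory: receives (nlp, name) and may declare scores and their weights
@Language.factory(
    "my_scored_component",
    scores=["my_score"],
    default_score_weights={"my_score": 1.0},
)
def create_my_scored_component(nlp: Language, name: str):
    def my_scored_component(doc: Doc) -> Doc:
        return doc
    return my_scored_component
```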
@@ -1112,7 +1102,6 @@ class Language:
  self,
  examples: Iterable[Example],
  *,
- verbose: bool = False,
  batch_size: int = 256,
  scorer: Optional[Scorer] = None,
  component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,

@@ -1121,7 +1110,6 @@
  """Evaluate a model's pipeline components.
  examples (Iterable[Example]): `Example` objects.
- verbose (bool): Print debugging information.
  batch_size (int): Batch size to use.
  scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
  will be created.

@@ -1140,7 +1128,6 @@
  scorer_cfg = {}
  if scorer is None:
  kwargs = dict(scorer_cfg)
- kwargs.setdefault("verbose", verbose)
  kwargs.setdefault("nlp", self)
  scorer = Scorer(**kwargs)
  texts = [eg.reference.text for eg in examples]

@@ -1163,8 +1150,7 @@
  docs = list(docs)
  end_time = timer()
  for i, (doc, eg) in enumerate(zip(docs, examples)):
- if verbose:
- print(doc)
+ util.logger.debug(doc)
  eg.predicted = doc
  results = scorer.score(examples)
  n_words = sum(len(eg.predicted) for eg in examples)
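Since the `verbose` flag is gone, per-document output from `evaluate` now goes through spaCy's logger. A rough usage sketch; the pipeline name and toy annotations are assumptions for illustration, not part of this commit:

```python
import logging
import spacy
from spacy.gold import Example  # module location as of this commit

logging.basicConfig(level=logging.DEBUG)  # surfaces util.logger.debug(doc) output

nlp = spacy.load("en_core_web_sm")  # assumes this pipeline is installed
texts = ["Apple is looking at buying a U.K. startup."]
# Toy gold annotations; in practice these come from your corpus
examples = [
    Example.from_dict(nlp.make_doc(text), {"entities": [(0, 5, "ORG")]})
    for text in texts
]
scores = nlp.evaluate(examples, batch_size=128)
print(scores.get("ents_f"))
```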


@@ -2,7 +2,7 @@ from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
  import numpy as np
  from .gold import Example
- from .tokens import Token, Doc
+ from .tokens import Token, Doc, Span
  from .errors import Errors
  from .util import get_lang_class
  from .morphology import Morphology

@@ -250,15 +250,16 @@ class Scorer:
  examples: Iterable[Example],
  attr: str,
  *,
- getter: Callable[[Doc, str], Any] = getattr,
+ getter: Callable[[Doc, str], Iterable[Span]] = getattr,
  **cfg,
  ) -> Dict[str, Any]:
  """Returns PRF scores for labeled spans.
  examples (Iterable[Example]): Examples to score
  attr (str): The attribute to score.
- getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
- getter(doc, attr) should return the spans for the individual doc.
+ getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
+ provided, getter(doc, attr) should return the spans for the
+ individual doc.
  RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
  the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
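The tightened `getter` type means custom getters have to yield `Span` objects for the scored attribute. A short sketch of such a call, assuming `score_spans` is used as a static helper and `examples` is an iterable of `Example` objects built elsewhere:

```python
from spacy.scorer import Scorer

# Score named entity spans; the getter must return Iterable[Span] per Doc
ner_scores = Scorer.score_spans(
    examples,  # assumed to exist: Iterable[Example]
    "ents",
    getter=lambda doc, attr: getattr(doc, attr),
)
print(ner_scores["ents_f"], ner_scores["ents_per_type"])
```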
@@ -444,7 +445,7 @@ class Scorer:
  *,
  getter: Callable[[Token, str], Any] = getattr,
  head_attr: str = "head",
- head_getter: Callable[[Token, str], Any] = getattr,
+ head_getter: Callable[[Token, str], Token] = getattr,
  ignore_labels: Tuple[str] = tuple(),
  **cfg,
  ) -> Dict[str, Any]:

@@ -458,7 +459,7 @@
  individual token.
  head_attr (str): The attribute containing the head token. Defaults to
  'head'.
- head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
+ head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
  head_getter(token, attr) should return the value of the head for an
  individual token.
  ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
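Likewise, a custom `head_getter` is now expected to return a `Token`. A brief, hedged sketch of the corresponding call; the attribute names are plausible values, not mandated by this diff:

```python
from spacy.scorer import Scorer

dep_scores = Scorer.score_deps(
    examples,  # assumed to exist: Iterable[Example] with parsed references
    "dep",
    getter=lambda token, attr: getattr(token, attr),
    head_attr="head",
    head_getter=lambda token, attr: getattr(token, attr),  # must return a Token
    ignore_labels=("punct",),
)
print(dep_scores)
```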


@@ -356,13 +356,13 @@ def test_language_factories_combine_score_weights(weights, expected):
  def test_language_factories_scores():
  name = "test_language_factories_scores"
- func = lambda doc: doc
+ func = lambda nlp, name: lambda doc: doc
  weights1 = {"a1": 0.5, "a2": 0.5}
  weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
- Language.component(
+ Language.factory(
  f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
  )
- Language.component(
+ Language.factory(
  f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
  )
  meta1 = Language.get_factory_meta(f"{name}1")


@@ -102,8 +102,7 @@ cdef class Doc:
  Construction 2
  >>> from spacy.tokens import Doc
- >>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
- >>> spaces=[True, False, False])
+ >>> doc = Doc(nlp.vocab, words=["hello", "world", "!"], spaces=[True, False, False])
  DOCS: https://spacy.io/api/doc
  """


@@ -886,6 +886,15 @@ def escape_html(text: str) -> str:
  def get_words_and_spaces(
  words: Iterable[str], text: str
  ) -> Tuple[List[str], List[bool]]:
+ """Given a list of words and a text, reconstruct the original tokens and
+ return a list of words and spaces that can be used to create a Doc. This
+ can help recover destructive tokenization that didn't preserve any
+ whitespace information.
+ words (Iterable[str]): The words.
+ text (str): The original text.
+ RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
+ """
  if "".join("".join(words).split()) != "".join(text.split()):
  raise ValueError(Errors.E194.format(text=text, words=words))
  text_words = []
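A quick illustration of the behavior the new docstring describes, reconstructing a `Doc` from destructively tokenized words (made-up input):

```python
from spacy.tokens import Doc
from spacy.util import get_words_and_spaces
from spacy.vocab import Vocab

words, spaces = get_words_and_spaces(["hello", "world", "!"], "hello world!")
# words  -> ["hello", "world", "!"]
# spaces -> [True, False, False]
doc = Doc(Vocab(), words=words, spaces=spaces)
assert doc.text == "hello world!"
```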


@@ -75,7 +75,8 @@ import { H1, H2, H3, H4, H5, Label, InlineList, Comment } from
  Headlines are set in
  [HK Grotesk](http://cargocollective.com/hanken/HK-Grotesk-Open-Source-Font) by
  Hanken Design. All other body text and code uses the best-matching default
- system font to provide a "native" reading experience.
+ system font to provide a "native" reading experience. All code uses the
+ [JetBrains Mono](https://www.jetbrains.com/lp/mono/) typeface by JetBrains.
  <Infobox title="Important note" variant="warning">

@@ -106,7 +107,7 @@ Tags are also available as standalone `<Tag />` components.
  | Argument | Example | Result |
  | -------- | -------------------------- | ----------------------------------------- |
  | `tag` | `{tag="method"}` | <Tag>method</Tag> |
- | `new` | `{new="2"}` | <Tag variant="new">2</Tag> |
+ | `new` | `{new="3"}` | <Tag variant="new">3</Tag> |
  | `model` | `{model="tagger, parser"}` | <Tag variant="model">tagger, parser</Tag> |
  | `hidden` | `{hidden="true"}` | |

@@ -130,6 +131,8 @@ Special link styles are used depending on the link URL.
  - [I am a regular external link](https://explosion.ai)
  - [I am a link to the documentation](/api/doc)
+ - [I am a link to an architecture](/api/architectures#HashEmbedCNN)
+ - [I am a link to a model](/models/en#en_core_web_sm)
  - [I am a link to GitHub](https://github.com/explosion/spaCy)
  ### Abbreviations {#abbr}

@@ -188,18 +191,20 @@ the buttons are implemented as styled links instead of native button elements.
  <InlineList><Button to="#" variant="primary">Primary small</Button>
  <Button to="#" variant="secondary">Secondary small</Button></InlineList>
+ <br />
  <InlineList><Button to="#" variant="primary" large>Primary large</Button>
  <Button to="#" variant="secondary" large>Secondary large</Button></InlineList>
  ## Components
- ### Table
+ ### Table {#table}
  > #### Markdown
  >
  > ```markdown_
  > | Header 1 | Header 2 |
- > | --- | --- |
+ > | -------- | -------- |
  > | Column 1 | Column 2 |
  > ```
  >
@@ -213,7 +218,7 @@ the buttons are implemented as styled links instead of native button elements.
  > ```
  Tables are used to present data and API documentation. Certain keywords can be
- used to mark a footer row with a distinct style, for example to visualise the
+ used to mark a footer row with a distinct style, for example to visualize the
  return values of a documented function.
  | Header 1 | Header 2 | Header 3 | Header 4 |

@@ -224,7 +229,73 @@ return values of a documented function.
  | Column 1 | Column 2 | Column 3 | Column 4 |
  | **RETURNS** | Column 2 | Column 3 | Column 4 |
- ### List
+ Tables also support optional "divider" rows that are typically used to denote
+ keyword-only arguments in API documentation. To turn a row into a dividing
+ headline, it should only include content in its first cell, and its value should
+ be italicized:
+ > #### Markdown
+ >
+ > ```markdown_
+ > | Header 1 | Header 2 | Header 3 |
+ > | -------- | -------- | -------- |
+ > | Column 1 | Column 2 | Column 3 |
+ > | _Hello_ | | |
+ > | Column 1 | Column 2 | Column 3 |
+ > ```
+ | Header 1 | Header 2 | Header 3 |
+ | -------- | -------- | -------- |
+ | Column 1 | Column 2 | Column 3 |
+ | _Hello_ | | |
+ | Column 1 | Column 2 | Column 3 |
+ ### Type Annotations {#type-annotations}
+ > #### Markdown
+ >
+ > ```markdown_
+ > ~~Model[List[Doc], Floats2d]~~
+ > ```
+ >
+ > #### JSX
+ >
+ > ```markup
+ > <TypeAnnotation>Model[List[Doc], Floats2d]</Typeannotation>
+ > ```
+ Type annotations are special inline code blocks are used to describe Python
+ types in the [type hints](https://docs.python.org/3/library/typing.html) format.
+ The special component will split the type, apply syntax highlighting and link
+ all types that specify links in `meta/type-annotations.json`. Types can link to
+ internal or external documentation pages. To make it easy to represent the type
+ annotations in Markdown, the rendering "hijacks" the `~~` tags that would
+ typically be converted to a `<del>` element but in this case, text surrounded
+ by `~~` becomes a type annotation.
+ - ~~Dict[str, List[Union[Doc, Span]]]~~
+ - ~~Model[List[Doc], List[numpy.ndarray]]~~
+ Type annotations support a special visual style in tables and will render as a
+ separate row, under the cell text. This allows the API docs to display complex
+ types without taking up too much space in the cell. The type annotation should
+ always be the **last element** in the row.
+ > #### Markdown
+ >
+ > ```markdown_
+ > | Header 1 | Header 2 |
+ > | -------- | ----------------------- |
+ > | Column 1 | Column 2 ~~List[Doc]~~ |
+ > ```
+ | Name | Description |
+ | ------------------- | ----------- |
+ | `vocab` | The shared vocabulary. ~~Vocab~~ |
+ | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
+ | `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
+ ### List {#list}
  > #### Markdown
  >
@@ -255,7 +326,7 @@ automatically.
  3. Lorem ipsum dolor
  4. consectetur adipiscing elit
- ### Aside
+ ### Aside {#aside}
  > #### Markdown
  >

@@ -280,7 +351,7 @@ To make them easier to use in Markdown, paragraphs formatted as blockquotes will
  turn into asides by default. Level 4 headlines (with a leading `####`) will
  become aside titles.
- ### Code Block
+ ### Code Block {#code-block}
  > #### Markdown
  >

@@ -387,7 +458,7 @@ original file is shown at the top of the widget.
  https://github.com/explosion/spaCy/tree/master/spacy/language.py
  ```
- ### Infobox
+ ### Infobox {#infobox}
  import Infobox from 'components/infobox'

@@ -425,7 +496,7 @@ blocks.
  </Infobox>
- ### Accordion
+ ### Accordion {#accordion}
  import Accordion from 'components/accordion'


@@ -33,18 +33,18 @@ TODO: intro and how architectures work, link to
  > subword_features = true
  > ```
- Build spaCy's 'standard' tok2vec layer, which uses hash embedding with subword
+ Build spaCy's "standard" tok2vec layer, which uses hash embedding with subword
  features and a CNN with layer-normalized maxout.
- | Name | Type | Description |
- | -------------------- | ---- | ----------- |
- | `width` | int | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. |
- | `depth` | int | The number of convolutional layers to use. Recommended values are between `2` and `8`. |
- | `embed_size` | int | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. |
- | `window_size` | int | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. |
- | `maxout_pieces` | int | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. |
- | `subword_features` | bool | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. |
- | `pretrained_vectors` | bool | Whether to also use static vectors. |
+ | Name | Description |
+ | -------------------- | ----------- |
+ | `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
+ | `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
+ | `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
+ | `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
+ | `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
+ | `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
+ | `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
  ### spacy.Tok2Vec.v1 {#Tok2Vec}
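For context, a hedged sketch of how an architecture entry like the one documented above is typically referenced from Python when adding a pipeline component. The values simply echo the table above; the exact set of required settings may differ between versions, so treat this as a shape sketch rather than a validated config:

```python
import spacy

nlp = spacy.blank("en")
# Point the tok2vec component's model at the registered architecture
nlp.add_pipe(
    "tok2vec",
    config={
        "model": {
            "@architectures": "spacy.HashEmbedCNN.v1",
            "width": 96,
            "depth": 4,
            "embed_size": 2000,
            "window_size": 1,
            "maxout_pieces": 3,
            "subword_features": True,
            "pretrained_vectors": False,
        }
    },
)
```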
@@ -67,10 +67,10 @@ Construct a tok2vec model out of embedding and encoding subnetworks. See the
  ["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
  blog post for background.
- | Name | Type | Description |
- | -------- | ---- | ----------- |
- | `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed) |
- | `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). |
+ | Name | Description |
+ | -------- | ----------- |
+ | `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
+ | `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
  ### spacy.Tok2VecListener.v1 {#Tok2VecListener}

@@ -108,10 +108,10 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
  [Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
  argument that connects to the shared `tok2vec` component in the pipeline.
- | Name | Type | Description |
- | ---------- | ---- | ----------- |
- | `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. |
- | `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. |
+ | Name | Description |
+ | ---------- | ----------- |
+ | `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
+ | `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
  ### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
@@ -134,12 +134,12 @@ definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
  pretrained static vectors can also be incorporated into the concatenated
  representation.
- | Name | Type | Description |
- | ------------------------- | ---- | ----------- |
- | `width` | int | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. |
- | `rows` | int | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. |
- | `also_embed_subwords` | bool | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. |
- | `also_use_static_vectors` | bool | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. |
+ | Name | Description |
+ | ------------------------- | ----------- |
+ | `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
+ | `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
+ | `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
+ | `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
  ### spacy.CharacterEmbed.v1 {#CharacterEmbed}

@@ -170,12 +170,12 @@ concatenated. A hash-embedded vector of the `NORM` of the word is also
  concatenated on, and the result is then passed through a feed-forward network to
  construct a single vector to represent the information.
- | Name | Type | Description |
- | ------- | ---- | ----------- |
- | `width` | int | The width of the output vector and the `NORM` hash embedding. |
- | `rows` | int | The number of rows in the `NORM` hash embedding table. |
- | `nM` | int | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. |
- | `nC` | int | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. |
+ | Name | Description |
+ | ------- | ----------- |
+ | `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ |
+ | `rows` | The number of rows in the `NORM` hash embedding table. ~~int~~ |
+ | `nM` | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. ~~int~~ |
+ | `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ |
  ### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder}
@@ -193,12 +193,12 @@ construct a single vector to represent the information.
  Encode context using convolutions with maxout activation, layer normalization
  and residual connections.
- | Name | Type | Description |
- | --------------- | ---- | ----------- |
- | `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
- | `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
- | `maxout_pieces` | int | The number of maxout pieces to use. Recommended values are `2` or `3`. |
- | `depth` | int | The number of convolutional layers. Recommended value is `4`. |
+ | Name | Description |
+ | --------------- | ----------- |
+ | `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
+ | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
+ | `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~ |
+ | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
  ### spacy.MishWindowEncoder.v1 {#MishWindowEncoder}

@@ -216,11 +216,11 @@ Encode context using convolutions with
  [`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization
  and residual connections.
- | Name | Type | Description |
- | ------------- | ---- | ----------- |
- | `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
- | `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
- | `depth` | int | The number of convolutional layers. Recommended value is `4`. |
+ | Name | Description |
+ | ------------- | ----------- |
+ | `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
+ | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
+ | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
  ### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder}
@@ -237,11 +237,11 @@ and residual connections.
  Encode context using bidirectional LSTM layers. Requires
  [PyTorch](https://pytorch.org).
- | Name | Type | Description |
- | ------------- | ---- | ----------- |
- | `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
- | `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
- | `depth` | int | The number of convolutional layers. Recommended value is `4`. |
+ | Name | Description |
+ | ------------- | ----------- |
+ | `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
+ | `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
+ | `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
  ## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}

@@ -268,11 +268,11 @@ architectures into your training config.
  <!-- TODO: description -->
- | Name | Type | Description |
- | ------------------ | ---------------- | ----------- |
- | `name` | str | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). |
- | `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
- | `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
+ | Name | Description |
+ | ------------------ | ----------- |
+ | `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
+ | `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
+ | `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
  ### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
@@ -297,10 +297,10 @@ operate over wordpieces, which usually don't align one-to-one against spaCy
  tokens. The layer therefore requires a reduction operation in order to calculate
  a single token vector given zero or more wordpiece vectors.
- | Name | Type | Description |
- | ------------- | ------------------------------------------ | ----------- |
- | `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
- | `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
+ | Name | Description |
+ | ------------- | ----------- |
+ | `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
+ | `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
  ### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}

@@ -320,12 +320,12 @@ Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
  object, but it's a **simpler solution** if you only need the transformer within
  one component.
- | Name | Type | Description |
- | ------------------ | ------------------------------------------ | ----------- |
- | `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
- | `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
- | `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
- | `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
+ | Name | Description |
+ | ------------------ | ----------- |
+ | `get_spans` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
+ | `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
+ | `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
+ | `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
  ## Parser & NER architectures {#parser}
@@ -368,14 +368,14 @@ consists of either two or three subnetworks:
  state representation. If not present, the output from the lower model is used
  as action scores directly.
- | Name | Type | Description |
- | ------------------- | ---- | ----------- |
- | `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
- | `nr_feature_tokens` | int | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. |
- | `hidden_width` | int | The width of the hidden layer. |
- | `maxout_pieces` | int | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. |
- | `use_upper` | bool | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. |
- | `nO` | int | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. |
+ | Name | Description |
+ | ------------------- | ----------- |
+ | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
+ | `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
+ | `hidden_width` | The width of the hidden layer. ~~int~~ |
+ | `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
+ | `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
+ | `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
  ### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}

@@ -402,9 +402,9 @@ generally results in better linear separation between classes, especially for
  non-CRF models, because there are more distinct classes for the different
  situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
- | Name | Type | Description |
- | --------- | ---- | ----------- |
- | `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
+ | Name | Description |
+ | --------- | ----------- |
+ | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
  ### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}

@@ -427,9 +427,9 @@ spans into tags assigned to each token. The first token of a span is given the
  tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
  are assigned the tag O.
- | Name | Type | Description |
- | --------- | ---- | ----------- |
- | `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
+ | Name | Description |
+ | --------- | ----------- |
+ | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
  ## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
@@ -450,10 +450,10 @@ Build a tagger model, using a provided token-to-vector component. The tagger
  model simply adds a linear layer with softmax activation to predict scores given
  the token vectors.
- | Name | Type | Description |
- | --------- | ---- | ----------- |
- | `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
- | `nO` | int | The number of tags to output. Inferred from the data if `None`. |
+ | Name | Description |
+ | --------- | ----------- |
+ | `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
+ | `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ |
  ## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
@@ -489,18 +489,17 @@ network has an internal CNN Tok2Vec layer and uses attention.
  > nO = null
  > ```
- | Name | Type | Description |
- | --------------------------- | ----- | ----------- |
- | `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
- | `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
- | `width` | int | Output dimension of the feature encoding step. |
- | `embed_size` | int | Input dimension of the feature encoding step. |
- | `conv_depth` | int | Depth of the Tok2Vec layer. |
- | `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
- | `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
- | `dropout` | float | The dropout rate. |
- | `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when |
- | `begin_training` is called. |
+ | Name | Description |
+ | -------------------- | ----------- |
+ | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+ | `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
+ | `width` | Output dimension of the feature encoding step. ~~int~~ |
+ | `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
+ | `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
+ | `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
+ | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
+ | `dropout` | The dropout rate. ~~float~~ |
+ | `nO` | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
  ### spacy.TextCatCNN.v1 {#TextCatCNN}
@@ -527,11 +526,11 @@ A neural network model where token vectors are calculated using a CNN. The
  vectors are mean pooled and used as features in a feed-forward network. This
  architecture is usually less accurate than the ensemble, but runs faster.
- | Name | Type | Description |
- | ------------------- | ---- | ----------- |
- | `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
- | `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
- | `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
+ | Name | Description |
+ | ------------------- | ----------- |
+ | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+ | `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
+ | `nO` | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
  ### spacy.TextCatBOW.v1 {#TextCatBOW}

@@ -549,18 +548,18 @@ architecture is usually less accurate than the ensemble, but runs faster.
  An ngram "bag-of-words" model. This architecture should run much faster than the
  others, but may not be as accurate, especially if texts are short.
- | Name | Type | Description |
- | ------------------- | ----- | ----------- |
- | `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
- | `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
- | `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. |
- | `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
+ | Name | Description |
+ | ------------------- | ----------- |
+ | `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
+ | `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
+ | `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
+ | `nO` | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
  ## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
  An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions
  (tagged as named entities) to unique identifiers, grounding the named entities
  into the "real world". This requires 3 main components:
  - A [`KnowledgeBase`](/api/kb) (KB) holding the unique identifiers, potential
  synonyms and prior probabilities.
@ -571,8 +570,8 @@ into the "real world". This requires 3 main components:
### spacy.EntityLinker.v1 {#EntityLinker} ### spacy.EntityLinker.v1 {#EntityLinker}
The `EntityLinker` model architecture is a `Thinc` `Model` with a Linear output The `EntityLinker` model architecture is a Thinc `Model` with a
layer. [`Linear`](https://thinc.ai/docs/api-layers#linear) output layer.
> #### Example Config > #### Example Config
> >
@ -599,27 +598,24 @@ layer.
> @assets = "spacy.CandidateGenerator.v1" > @assets = "spacy.CandidateGenerator.v1"
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | ------------------------------------------ | ---------------------------------------------------------------------------------------- | | --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. | | `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
| `nO` | int | Output dimension, determined by the length of the vectors encoding each entity in the KB | | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
If the `nO` dimension is not set, the Entity Linking component will set it when
`begin_training` is called.
### spacy.EmptyKB.v1 {#EmptyKB} ### spacy.EmptyKB.v1 {#EmptyKB}
A function that creates a default, empty `KnowledgeBase` from a A function that creates a default, empty `KnowledgeBase` from a
[`Vocab`](/api/vocab) instance. [`Vocab`](/api/vocab) instance.
| Name | Type | Description | | Name | Description |
| ---------------------- | ---- | ------------------------------------------------------------------------- | | ---------------------- | ----------------------------------------------------------------------------------- |
| `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
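As a minimal sketch, this is roughly what the registered function produces when called with a vocab; the standalone `Vocab` here is created purely for illustration.

```python
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase

vocab = Vocab()
# An empty knowledge base with 64-dimensional entity vectors, matching the
# default entity_vector_length documented above.
kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
assert kb.get_size_entities() == 0
```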
### spacy.CandidateGenerator.v1 {#CandidateGenerator} ### spacy.CandidateGenerator.v1 {#CandidateGenerator}
A function that takes as input a [`KnowledgeBase`](/api/kb) and a A function that takes as input a [`KnowledgeBase`](/api/kb) and a
[`Span`](/api/span) object denoting a named entity, and returns a list of [`Span`](/api/span) object denoting a named entity, and returns a list of
plausible [`Candidate` objects](/api/kb/#candidate_init). The default plausible [`Candidate`](/api/kb/#candidate) objects. The default
`CandidateGenerator` simply uses the text of a mention to find its potential `CandidateGenerator` simply uses the text of a mention to find its potential
aliases in the `KnowledgeBase`. Note that this function is case-dependent. aliases in the `KnowledgeBase`. Note that this function is case-dependent.
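The snippet below sketches that behavior against a hand-built KB, assuming the string-based `KnowledgeBase.get_candidates` lookup; the entity ID, frequency, vector and alias are all made up.

```python
# Register one entity and one alias, then look up candidates for a mention
# string, which is what the default candidate generator does under the hood.
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
candidates = kb.get_candidates("Douglas Adams")  # one Candidate for "Q42"
assert kb.get_candidates("douglas adams") == []  # lookup is case-dependent
```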
View File
@ -31,10 +31,10 @@ how the component should be configured. You can override its settings via the
> nlp.add_pipe("attribute_ruler", config=config) > nlp.add_pipe("attribute_ruler", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` | | `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). | `False` | | `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
@ -47,10 +47,10 @@ be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
keys, e.g.: keys, e.g.:
```python ```python
pattern_dicts = \[ pattern_dicts = [
{"patterns": \[\[{"TAG": "VB"}\]\], "attrs": {"POS": "VERB"}}, {"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
{"patterns": \[\[{"LOWER": "an"}\]\], "attrs": {"LEMMA": "a"}}, {"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
\] ]
``` ```
> #### Example > #### Example
@ -60,23 +60,23 @@ pattern_dicts = \[
> attribute_ruler = nlp.add_pipe("attribute_ruler") > attribute_ruler = nlp.add_pipe("attribute_ruler")
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. | | `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pattern_dicts` | `Iterable[Dict]]` | Optional patterns to load in on initialization. Defaults to `None`. | | `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. | | `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
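As a hedged sketch, the same arguments can also be supplied when the component is added, with `pattern_dicts` passed through the factory config; the pattern shown is illustrative.

```python
ruler = nlp.add_pipe(
    "attribute_ruler",
    config={
        "pattern_dicts": [{"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}}],
        "validate": True,
    },
)
```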
## AttributeRuler.\_\_call\_\_ {#call tag="method"} ## AttributeRuler.\_\_call\_\_ {#call tag="method"}
Apply the attribute ruler to a Doc, setting token attributes for tokens matched Apply the attribute ruler to a Doc, setting token attributes for tokens matched
by the provided patterns. by the provided patterns.
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The modified `Doc` with added entities, if available. | | **RETURNS** | The processed document. ~~Doc~~ |
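A short sketch of calling the component directly on a doc (in normal use it runs as part of the pipeline); the pattern is borrowed from the example further up.

```python
attribute_ruler = nlp.add_pipe("attribute_ruler")
attribute_ruler.add(patterns=[[{"LOWER": "an"}]], attrs={"LEMMA": "a"})
# Apply the ruler to a tokenized doc: the matched token's lemma is overridden.
doc = attribute_ruler(nlp.make_doc("He saw an owl"))
assert doc[2].lemma_ == "a"
```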
## AttributeRuler.add {#add tag="method"} ## AttributeRuler.add {#add tag="method"}
@ -95,11 +95,11 @@ may be negative to index from the end of the span.
> attribute_ruler.add(patterns=patterns, attrs=attrs) > attribute_ruler.add(patterns=patterns, attrs=attrs)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- | | ---------- | --------------------------------------------------------------------------------------------------------------------------------- |
| patterns | `Iterable[List[Dict]]` | A list of Matcher patterns. | | `patterns` | The `Matcher` patterns to add. ~~Iterable[List[Dict[Union[int, str], Any]]]~~ |
| attrs | dict | The attributes to assign to the target token in the matched span. | | `attrs` | The attributes to assign to the target token in the matched span. ~~Dict[str, Any]~~ |
| index | int | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. | | `index` | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to `0`. ~~int~~ |
## AttributeRuler.add_patterns {#add_patterns tag="method"} ## AttributeRuler.add_patterns {#add_patterns tag="method"}
@ -107,52 +107,52 @@ may be negative to index from the end of the span.
> >
> ```python > ```python
> attribute_ruler = nlp.add_pipe("attribute_ruler") > attribute_ruler = nlp.add_pipe("attribute_ruler")
> pattern_dicts = \[ > pattern_dicts = [
> { > {
> "patterns": \[\[{"TAG": "VB"}\]\], > "patterns": [[{"TAG": "VB"}]],
> "attrs": {"POS": "VERB"} > "attrs": {"POS": "VERB"}
> }, > },
> { > {
> "patterns": \[\[{"LOWER": "two"}, {"LOWER": "apples"}\]\], > "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
> "attrs": {"LEMMA": "apple"}, > "attrs": {"LEMMA": "apple"},
> "index": -1 > "index": -1
> }, > },
> \] > ]
> attribute_ruler.add_patterns(pattern_dicts) > attribute_ruler.add_patterns(pattern_dicts)
> ``` > ```
Add patterns from a list of pattern dicts with the keys as the arguments to Add patterns from a list of pattern dicts with the keys as the arguments to
[`AttributeRuler.add`](#add). [`AttributeRuler.add`](/api/attributeruler#add).
| Name | Type | Description | | Name | Description |
| --------------- | ----------------- | -------------------- | | --------------- | -------------------------------------------------------------------------- |
| `pattern_dicts` | `Iterable[Dict]]` | The patterns to add. | | `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
## AttributeRuler.patterns {#patterns tag="property"} ## AttributeRuler.patterns {#patterns tag="property"}
Get all patterns that have been added to the attribute ruler in the Get all patterns that have been added to the attribute ruler in the
`pattern_dicts` format accepted by `pattern_dicts` format accepted by
[`AttributeRuler.add_patterns`](#add_patterns). [`AttributeRuler.add_patterns`](/api/attributeruler#add_patterns).
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | ------------------------------------------ | | ----------- | -------------------------------------------------------------------------------------------- |
| **RETURNS** | `List[dict]` | The patterns added to the attribute ruler. | | **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"} ## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
Load attribute ruler patterns from a tag map. Load attribute ruler patterns from a tag map.
| Name | Type | Description | | Name | Description |
| --------- | ---- | ------------------------------------------------------------------------------------------ | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `tag_map` | dict | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. | | `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. ~~Dict[str, Dict[Union[int, str], Union[int, str]]]~~ |
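A minimal, made-up tag map to illustrate the expected shape:

```python
# Fine-grained tags mapped to coarse-grained POS values (illustrative entries).
tag_map = {
    "VB": {"POS": "VERB"},
    "NN": {"POS": "NOUN"},
}
attribute_ruler.load_from_tag_map(tag_map)
```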
## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"} ## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"}
Load attribute ruler patterns from morph rules. Load attribute ruler patterns from morph rules.
| Name | Type | Description | | Name | Description |
| ------------- | ---- | -------------------------------------------------------------------------------------------------------------------- | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `morph_rules` | dict | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. | | `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
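A minimal, made-up set of morph rules to illustrate the expected shape, with token text nested under a fine-grained tag:

```python
# For tokens tagged "VBP" with the text "am", override the lemma.
morph_rules = {
    "VBP": {"am": {"LEMMA": "be"}},
}
attribute_ruler.load_from_morph_rules(morph_rules)
```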
## AttributeRuler.to_disk {#to_disk tag="method"} ## AttributeRuler.to_disk {#to_disk tag="method"}
@ -165,11 +165,11 @@ Serialize the pipe to disk.
> attribute_ruler.to_disk("/path/to/attribute_ruler") > attribute_ruler.to_disk("/path/to/attribute_ruler")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## AttributeRuler.from_disk {#from_disk tag="method"} ## AttributeRuler.from_disk {#from_disk tag="method"}
@ -182,12 +182,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> attribute_ruler.from_disk("/path/to/attribute_ruler") > attribute_ruler.from_disk("/path/to/attribute_ruler")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `AttributeRuler` | The modified `AttributeRuler` object. | | **RETURNS** | The modified `AttributeRuler` object. ~~AttributeRuler~~ |
## AttributeRuler.to_bytes {#to_bytes tag="method"} ## AttributeRuler.to_bytes {#to_bytes tag="method"}
@ -200,11 +200,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `AttributeRuler` object. | | **RETURNS** | The serialized form of the `AttributeRuler` object. ~~bytes~~ |
## AttributeRuler.from_bytes {#from_bytes tag="method"} ## AttributeRuler.from_bytes {#from_bytes tag="method"}
@ -218,12 +218,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> attribute_ruler.from_bytes(attribute_ruler_bytes) > attribute_ruler.from_bytes(attribute_ruler_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `AttributeRuler` | The `AttributeRuler` object. | | **RETURNS** | The `AttributeRuler` object. ~~AttributeRuler~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}
View File
@ -598,9 +598,9 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
| Argument | Type | Description | | Argument | Type | Description |
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- | | ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- |
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | | | `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
| `component` | positional | Name of the pipeline component of which the model should be analyzed. |   | | `component` | positional | Name of the pipeline component of which the model should be analyzed. |
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. | | | `--layers`, `-l` | option | Comma-separated names of layer IDs to print. |
| `--dimensions`, `-DIM` | option | Show dimensions of each layer. | | `--dimensions`, `-DIM` | option | Show dimensions of each layer. |
| `--parameters`, `-PAR` | option | Show parameters of each layer. | | `--parameters`, `-PAR` | option | Show parameters of each layer. |
| `--gradients`, `-GRAD` | option | Show gradients of each layer. | | `--gradients`, `-GRAD` | option | Show gradients of each layer. |
View File
@ -34,12 +34,12 @@ streaming.
> limit = 0 > limit = 0
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- | | --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). | | `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
| `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. | | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py
@ -67,13 +67,13 @@ train/test skew.
> corpus = Corpus("./data", limit=10) > corpus = Corpus("./data", limit=10)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------- | | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | The directory or filename to read from. | | `path` | The directory or filename to read from. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. | | `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. | | `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. | | `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
## Corpus.\_\_call\_\_ {#call tag="method"} ## Corpus.\_\_call\_\_ {#call tag="method"}
@ -90,7 +90,7 @@ Yield examples from the data.
> train_data = corpus(nlp) > train_data = corpus(nlp)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---------- | ------------------------- | | ---------- | -------------------------------------- |
| `nlp` | `Language` | The current `nlp` object. | | `nlp` | The current `nlp` object. ~~Language~~ |
| **YIELDS** | `Example` | The examples. | | **YIELDS** | The examples. ~~Example~~ |
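Putting the pieces together, a rough end-to-end sketch; the data directory is a placeholder expected to contain `.spacy` files, and at this point in the v3 development cycle `Corpus` lives in `spacy.gold`.

```python
import spacy
from spacy.gold import Corpus

nlp = spacy.blank("en")
corpus = Corpus("./data", limit=10)        # read at most 10 examples
examples = list(corpus(nlp))               # a list of Example objects
```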
View File
@ -23,13 +23,13 @@ accessed from Python. For the Python documentation, see [`Doc`](/api/doc).
### Attributes {#doc_attributes} ### Attributes {#doc_attributes}
| Name | Type | Description | | Name | Description |
| ------------ | ------------ | ----------------------------------------------------------------------------------------- | | ------------ | -------------------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Doc` object is garbage collected. | | `mem` | A memory pool. Allocated memory will be freed once the `Doc` object is garbage collected. ~~cymem.Pool~~ |
| `vocab` | `Vocab` | A reference to the shared `Vocab` object. | | `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. | | `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ |
| `length` | `int` | The number of tokens in the document. | | `length` | The number of tokens in the document. ~~int~~ |
| `max_length` | `int` | The underlying size of the `Doc.c` array. | | `max_length` | The underlying size of the `Doc.c` array. ~~int~~ |
### Doc.push_back {#doc_push_back tag="method"} ### Doc.push_back {#doc_push_back tag="method"}
@ -50,10 +50,10 @@ Append a token to the `Doc`. The token can be provided as a
> assert doc.text == "hello " > assert doc.text == "hello "
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | --------------- | ----------------------------------------- | | ------------ | -------------------------------------------------- |
| `lex_or_tok` | `LexemeOrToken` | The word to append to the `Doc`. | | `lex_or_tok` | The word to append to the `Doc`. ~~LexemeOrToken~~ |
| `has_space` | `bint` | Whether the word has trailing whitespace. | | `has_space` | Whether the word has trailing whitespace. ~~bint~~ |
## Token {#token tag="cdef class" source="spacy/tokens/token.pxd"} ## Token {#token tag="cdef class" source="spacy/tokens/token.pxd"}
@ -70,12 +70,12 @@ accessed from Python. For the Python documentation, see [`Token`](/api/token).
### Attributes {#token_attributes} ### Attributes {#token_attributes}
| Name | Type | Description | | Name | Description |
| ------- | --------- | ------------------------------------------------------------- | | ------- | -------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A reference to the shared `Vocab` object. | | `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. | | `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ |
| `i` | `int` | The offset of the token within the document. | | `i` | The offset of the token within the document. ~~int~~ |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
### Token.cinit {#token_cinit tag="method"} ### Token.cinit {#token_cinit tag="method"}
@ -87,12 +87,12 @@ Create a `Token` object from a `TokenC*` pointer.
> token = Token.cinit(&doc.c[3], doc, 3) > token = Token.cinit(&doc.c[3], doc, 3)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------- | ------------------------------------------------------------ | | -------- | -------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A reference to the shared `Vocab`. | | `vocab` | A reference to the shared `Vocab`. ~~Vocab~~ |
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. | | `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ |
| `offset` | `int` | The offset of the token within the document. | | `offset` | The offset of the token within the document. ~~int~~ |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"} ## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"}
@ -107,14 +107,14 @@ accessed from Python. For the Python documentation, see [`Span`](/api/span).
### Attributes {#span_attributes} ### Attributes {#span_attributes}
| Name | Type | Description | | Name | Description |
| ------------ | -------------------------------------- | ------------------------------------------------------- | | ------------ | ----------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
| `start` | `int` | The index of the first token of the span. | | `start` | The index of the first token of the span. ~~int~~ |
| `end` | `int` | The index of the first token after the span. | | `end` | The index of the first token after the span. ~~int~~ |
| `start_char` | `int` | The index of the first character of the span. | | `start_char` | The index of the first character of the span. ~~int~~ |
| `end_char` | `int` | The index of the last character of the span. | | `end_char` | The index of the last character of the span. ~~int~~ |
| `label` | <Abbr title="uint64_t">`attr_t`</Abbr> | A label to attach to the span, e.g. for named entities. | | `label` | A label to attach to the span, e.g. for named entities. ~~attr_t (uint64_t)~~ |
## Lexeme {#lexeme tag="cdef class" source="spacy/lexeme.pxd"} ## Lexeme {#lexeme tag="cdef class" source="spacy/lexeme.pxd"}
@ -129,11 +129,11 @@ accessed from Python. For the Python documentation, see [`Lexeme`](/api/lexeme).
### Attributes {#lexeme_attributes} ### Attributes {#lexeme_attributes}
| Name | Type | Description | | Name | Description |
| ------- | -------------------------------------- | --------------------------------------------------------------- | | ------- | ----------------------------------------------------------------------------- |
| `c` | `LexemeC*` | A pointer to a [`LexemeC`](/api/cython-structs#lexemec) struct. | | `c` | A pointer to a [`LexemeC`](/api/cython-structs#lexemec) struct. ~~LexemeC\*~~ |
| `vocab` | `Vocab` | A reference to the shared `Vocab` object. | | `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
| `orth` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the verbatim text content. | | `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
## Vocab {#vocab tag="cdef class" source="spacy/vocab.pxd"} ## Vocab {#vocab tag="cdef class" source="spacy/vocab.pxd"}
@ -149,11 +149,11 @@ accessed from Python. For the Python documentation, see [`Vocab`](/api/vocab).
### Attributes {#vocab_attributes} ### Attributes {#vocab_attributes}
| Name | Type | Description | | Name | Description |
| --------- | ------------- | ------------------------------------------------------------------------------------------- | | --------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | | `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `strings` | `StringStore` | A `StringStore` that maps strings to hash values and vice versa. | | `strings` | A `StringStore` that maps strings to hash values and vice versa. ~~StringStore~~ |
| `length` | `int` | The number of entries in the vocabulary. | | `length` | The number of entries in the vocabulary. ~~int~~ |
### Vocab.get {#vocab_get tag="method"} ### Vocab.get {#vocab_get tag="method"}
@ -166,11 +166,11 @@ vocabulary.
> lexeme = vocab.get(vocab.mem, "hello") > lexeme = vocab.get(vocab.mem, "hello")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | | `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `string` | str | The string of the word to look up. | | `string` | The string of the word to look up. ~~str~~ |
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. | | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"} ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
@ -183,11 +183,11 @@ vocabulary.
> lexeme = vocab.get_by_orth(doc[0].lex.norm) > lexeme = vocab.get_by_orth(doc[0].lex.norm)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | ------------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | | `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
| `orth` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the verbatim text content. | | `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. | | **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
## StringStore {#stringstore tag="cdef class" source="spacy/strings.pxd"} ## StringStore {#stringstore tag="cdef class" source="spacy/strings.pxd"}
@ -203,7 +203,7 @@ accessed from Python. For the Python documentation, see
### Attributes {#stringstore_attributes} ### Attributes {#stringstore_attributes}
| Name | Type | Description | | Name | Description |
| ------ | ------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | | ------ | ---------------------------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the`StringStore` object is garbage collected. | | `mem` | A memory pool. Allocated memory will be freed once the `StringStore` object is garbage collected. ~~cymem.Pool~~ |
| `keys` | <Abbr title="vector[uint64_t]">`vector[hash_t]`</Abbr> | A list of hash values in the `StringStore`. | | `keys` | A list of hash values in the `StringStore`. ~~vector[hash_t] \(vector[uint64_t])~~ |
View File
@ -18,26 +18,26 @@ Cython data container for the `Token` object.
> token_ptr = &doc.c[3] > token_ptr = &doc.c[3]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lex` | `const LexemeC*` | A pointer to the lexeme for the token. | | `lex` | A pointer to the lexeme for the token. ~~const LexemeC\*~~ |
| `morph` | `uint64_t` | An ID allowing lookup of morphological attributes. | | `morph` | An ID allowing lookup of morphological attributes. ~~uint64_t~~ |
| `pos` | `univ_pos_t` | Coarse-grained part-of-speech tag. | | `pos` | Coarse-grained part-of-speech tag. ~~univ_pos_t~~ |
| `spacy` | `bint` | A binary value indicating whether the token has trailing whitespace. | | `spacy` | A binary value indicating whether the token has trailing whitespace. ~~bint~~ |
| `tag` | <Abbr title="uint64_t">`attr_t`</Abbr> | Fine-grained part-of-speech tag. | | `tag` | Fine-grained part-of-speech tag. ~~attr_t (uint64_t)~~ |
| `idx` | `int` | The character offset of the token within the parent document. | | `idx` | The character offset of the token within the parent document. ~~int~~ |
| `lemma` | <Abbr title="uint64_t">`attr_t`</Abbr> | Base form of the token, with no inflectional suffixes. | | `lemma` | Base form of the token, with no inflectional suffixes. ~~attr_t (uint64_t)~~ |
| `sense` | <Abbr title="uint64_t">`attr_t`</Abbr> | Space for storing a word sense ID, currently unused. | | `sense` | Space for storing a word sense ID, currently unused. ~~attr_t (uint64_t)~~ |
| `head` | `int` | Offset of the syntactic parent relative to the token. | | `head` | Offset of the syntactic parent relative to the token. ~~int~~ |
| `dep` | <Abbr title="uint64_t">`attr_t`</Abbr> | Syntactic dependency relation. | | `dep` | Syntactic dependency relation. ~~attr_t (uint64_t)~~ |
| `l_kids` | `uint32_t` | Number of left children. | | `l_kids` | Number of left children. ~~uint32_t~~ |
| `r_kids` | `uint32_t` | Number of right children. | | `r_kids` | Number of right children. ~~uint32_t~~ |
| `l_edge` | `uint32_t` | Offset of the leftmost token of this token's syntactic descendants. | | `l_edge` | Offset of the leftmost token of this token's syntactic descendants. ~~uint32_t~~ |
| `r_edge` | `uint32_t` | Offset of the rightmost token of this token's syntactic descendants. | | `r_edge` | Offset of the rightmost token of this token's syntactic descendants. ~~uint32_t~~ |
| `sent_start` | `int` | Ternary value indicating whether the token is the first word of a sentence. `0` indicates a missing value, `-1` indicates `False` and `1` indicates `True`. The default value, 0, is interpreted as no sentence break. Sentence boundary detectors will usually set 0 for all tokens except tokens that follow a sentence boundary. | | `sent_start` | Ternary value indicating whether the token is the first word of a sentence. `0` indicates a missing value, `-1` indicates `False` and `1` indicates `True`. The default value, 0, is interpreted as no sentence break. Sentence boundary detectors will usually set 0 for all tokens except tokens that follow a sentence boundary. ~~int~~ |
| `ent_iob` | `int` | IOB code of named entity tag. `0` indicates a missing value, `1` indicates `I`, `2` indicates `O` and `3` indicates `B`. | | `ent_iob` | IOB code of named entity tag. `0` indicates a missing value, `1` indicates `I`, `2` indicates `O` and `3` indicates `B`. ~~int~~ |
| `ent_type` | <Abbr title="uint64_t">`attr_t`</Abbr> | Named entity type. | | `ent_type` | Named entity type. ~~attr_t (uint64_t)~~ |
| `ent_id` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | | `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~attr_t (uint64_t)~~ |
### Token.get_struct_attr {#token_get_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"} ### Token.get_struct_attr {#token_get_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"}
@ -52,11 +52,11 @@ Get the value of an attribute from the `TokenC` struct by attribute ID.
> is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA) > is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------- |
| `token` | `const TokenC*` | A pointer to a `TokenC` struct. | | `token` | A pointer to a `TokenC` struct. ~~const TokenC\*~~ |
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | | `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| **RETURNS** | <Abbr title="uint64_t">`attr_t`</Abbr> | The value of the attribute. | | **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ |
### Token.set_struct_attr {#token_set_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"} ### Token.set_struct_attr {#token_set_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"}
@ -72,11 +72,11 @@ Set the value of an attribute of the `TokenC` struct by attribute ID.
> Token.set_struct_attr(token, TAG, 0) > Token.set_struct_attr(token, TAG, 0)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------- |
| `token` | `const TokenC*` | A pointer to a `TokenC` struct. | | `token` | A pointer to a `TokenC` struct. ~~const TokenC\*~~ |
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | | `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| `value` | <Abbr title="uint64_t">`attr_t`</Abbr> | The value to set. | | `value` | The value to set. ~~attr_t (uint64_t)~~ |
### token_by_start {#token_by_start tag="function" source="spacy/tokens/doc.pxd"} ### token_by_start {#token_by_start tag="function" source="spacy/tokens/doc.pxd"}
@ -93,12 +93,12 @@ Find a token in a `TokenC*` array by the offset of its first character.
> assert token_by_start(doc.c, doc.length, 4) == -1 > assert token_by_start(doc.c, doc.length, 4) == -1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | --------------- | --------------------------------------------------------- | | ------------ | ----------------------------------------------------------------- |
| `tokens` | `const TokenC*` | A `TokenC*` array. | | `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
| `length` | `int` | The number of tokens in the array. | | `length` | The number of tokens in the array. ~~int~~ |
| `start_char` | `int` | The start index to search for. | | `start_char` | The start index to search for. ~~int~~ |
| **RETURNS** | `int` | The index of the token in the array or `-1` if not found. | | **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ |
### token_by_end {#token_by_end tag="function" source="spacy/tokens/doc.pxd"} ### token_by_end {#token_by_end tag="function" source="spacy/tokens/doc.pxd"}
@ -115,12 +115,12 @@ Find a token in a `TokenC*` array by the offset of its final character.
> assert token_by_end(doc.c, doc.length, 1) == -1 > assert token_by_end(doc.c, doc.length, 1) == -1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | --------------------------------------------------------- | | ----------- | ----------------------------------------------------------------- |
| `tokens` | `const TokenC*` | A `TokenC*` array. | | `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
| `length` | `int` | The number of tokens in the array. | | `length` | The number of tokens in the array. ~~int~~ |
| `end_char` | `int` | The end index to search for. | | `end_char` | The end index to search for. ~~int~~ |
| **RETURNS** | `int` | The index of the token in the array or `-1` if not found. | | **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ |
### set_children_from_heads {#set_children_from_heads tag="function" source="spacy/tokens/doc.pxd"} ### set_children_from_heads {#set_children_from_heads tag="function" source="spacy/tokens/doc.pxd"}
@ -143,10 +143,10 @@ attribute, in order to make the parse tree navigation consistent.
> assert doc.c[3].l_kids == 1 > assert doc.c[3].l_kids == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ---------------------------------- | | -------- | ------------------------------------------ |
| `tokens` | `const TokenC*` | A `TokenC*` array. | | `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
| `length` | `int` | The number of tokens in the array. | | `length` | The number of tokens in the array. ~~int~~ |
## LexemeC {#lexemec tag="C struct" source="spacy/structs.pxd"} ## LexemeC {#lexemec tag="C struct" source="spacy/structs.pxd"}
@ -160,17 +160,17 @@ struct.
> lex = doc.c[3].lex > lex = doc.c[3].lex
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- | | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `flags` | <Abbr title="uint64_t">`flags_t`</Abbr> | Bit-field for binary lexical flag values. | | `flags` | Bit-field for binary lexical flag values. ~~flags_t (uint64_t)~~ |
| `id` | <Abbr title="uint64_t">`attr_t`</Abbr> | Usually used to map lexemes to rows in a matrix, e.g. for word vectors. Does not need to be unique, so currently misnamed. | | `id` | Usually used to map lexemes to rows in a matrix, e.g. for word vectors. Does not need to be unique, so currently misnamed. ~~attr_t (uint64_t)~~ |
| `length` | <Abbr title="uint64_t">`attr_t`</Abbr> | Number of unicode characters in the lexeme. | | `length` | Number of unicode characters in the lexeme. ~~attr_t (uint64_t)~~ |
| `orth` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the verbatim text content. | | `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
| `lower` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the lowercase form of the lexeme. | | `lower` | ID of the lowercase form of the lexeme. ~~attr_t (uint64_t)~~ |
| `norm` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the lexeme's norm, i.e. a normalized form of the text. | | `norm` | ID of the lexeme's norm, i.e. a normalized form of the text. ~~attr_t (uint64_t)~~ |
| `shape` | <Abbr title="uint64_t">`attr_t`</Abbr> | Transform of the lexeme's string, to show orthographic features. | | `shape` | Transform of the lexeme's string, to show orthographic features. ~~attr_t (uint64_t)~~ |
| `prefix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the start of the lexeme. Defaults to `N=1`. | | `prefix` | Length-N substring from the start of the lexeme. Defaults to `N=1`. ~~attr_t (uint64_t)~~ |
| `suffix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the end of the lexeme. Defaults to `N=3`. | | `suffix` | Length-N substring from the end of the lexeme. Defaults to `N=3`. ~~attr_t (uint64_t)~~ |
### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} ### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
@ -186,11 +186,11 @@ Get the value of an attribute from the `LexemeC` struct by attribute ID.
> is_alpha = Lexeme.get_struct_attr(lexeme, IS_ALPHA) > is_alpha = Lexeme.get_struct_attr(lexeme, IS_ALPHA)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------- |
| `lex` | `const LexemeC*` | A pointer to a `LexemeC` struct. | | `lex` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | | `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| **RETURNS** | <Abbr title="uint64_t">`attr_t`</Abbr> | The value of the attribute. | | **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ |
### Lexeme.set_struct_attr {#lexeme_set_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"} ### Lexeme.set_struct_attr {#lexeme_set_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
@ -206,11 +206,11 @@ Set the value of an attribute of the `LexemeC` struct by attribute ID.
> Lexeme.set_struct_attr(lexeme, NORM, lexeme.lower) > Lexeme.set_struct_attr(lexeme, NORM, lexeme.lower)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------- |
| `lex` | `const LexemeC*` | A pointer to a `LexemeC` struct. | | `lex` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. | | `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| `value` | <Abbr title="uint64_t">`attr_t`</Abbr> | The value to set. | | `value` | The value to set. ~~attr_t (uint64_t)~~ |
### Lexeme.c_check_flag {#lexeme_c_check_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"} ### Lexeme.c_check_flag {#lexeme_c_check_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
@ -226,11 +226,11 @@ Check the value of a binary flag attribute.
> is_stop = Lexeme.c_check_flag(lexeme, IS_STOP) > is_stop = Lexeme.c_check_flag(lexeme, IS_STOP)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------- | ------------------------------------------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------- |
| `lexeme` | `const LexemeC*` | A pointer to a `LexemeC` struct. | | `lexeme` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
| `flag_id` | `attr_id_t` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. | | `flag_id` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| **RETURNS** | `bint` | The boolean value of the flag. | | **RETURNS** | The boolean value of the flag. ~~bint~~ |
### Lexeme.c_set_flag {#lexeme_c_set_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"} ### Lexeme.c_set_flag {#lexeme_c_set_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
@ -246,8 +246,8 @@ Set the value of a binary flag attribute.
> Lexeme.c_set_flag(lexeme, IS_STOP, 0) > Lexeme.c_set_flag(lexeme, IS_STOP, 0)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | ---------------- | ------------------------------------------------------------------------------- | | --------- | --------------------------------------------------------------------------------------------- |
| `lexeme` | `const LexemeC*` | A pointer to a `LexemeC` struct. | | `lexeme` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
| `flag_id` | `attr_id_t` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. | | `flag_id` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
| `value` | `bint` | The value to set. | | `value` | The value to set. ~~bint~~ |
View File
@ -73,15 +73,15 @@ your config and check that it's valid, you can run the
Defines the `nlp` object, its tokenizer and Defines the `nlp` object, its tokenizer and
[processing pipeline](/usage/processing-pipelines) component names. [processing pipeline](/usage/processing-pipelines) component names.
| Name | Type | Description | Default | | Name | Description | Default |
| ------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------- | | ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- |
| `lang` | str | The language code to use. | `null` | | `lang` | The language code to use. ~~str~~ | `null` |
| `pipeline` | `List[str]` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). | `[]` | | `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). ~~List[str]~~ | `[]` |
| `load_vocab_data` | bool | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. | `true` | | `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. ~~bool~~ | `true` |
| `before_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. | `null` | | `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | `null` |
| `after_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. | `null` | | `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. ~~Optional[Callable[[Language], Language]]~~ | `null` |
| `after_pipeline_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. | `null` | | `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. ~~Optional[Callable[[Language], Language]]~~ | `null` |
| `tokenizer` | callable | The tokenizer to use. | [`Tokenizer`](/api/tokenizer) | | `tokenizer` | The tokenizer to use. ~~Callable[[str], Doc]~~ | [`Tokenizer`](/api/tokenizer) |
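As a sketch, such a block can be written out and parsed with Thinc's `Config`; the pipeline name is illustrative, and the matching `[components.tagger]` section is omitted for brevity.

```python
from thinc.api import Config

# A minimal [nlp] block using the settings listed above.
nlp_block = """
[nlp]
lang = "en"
pipeline = ["tagger"]
load_vocab_data = true
before_creation = null
after_creation = null
after_pipeline_creation = null
"""
config = Config().from_str(nlp_block)
assert config["nlp"]["lang"] == "en"
```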
### components {#config-components tag="section"} ### components {#config-components tag="section"}
@ -128,24 +128,24 @@ process that are used when you run [`spacy train`](/api/cli#train).
<!-- TODO: complete --> <!-- TODO: complete -->
| Name | Type | Description | Default | | Name | Description | Default |
| --------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- | | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
| `seed` | int | The random seed. | `${system:seed}` | | `seed` | The random seed. ~~int~~ | `${system:seed}` |
| `dropout` | float | The dropout rate. | `0.1` | | `dropout` | The dropout rate. ~~float~~ | `0.1` |
| `accumulate_gradient` | int | The number of substeps to divide the batch up into for gradient accumulation. | `1` | | `accumulate_gradient` | The number of substeps to divide the batch up into for gradient accumulation. ~~int~~ | `1` |
| `init_tok2vec` | str | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). | `${paths:init_tok2vec}` | | `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). ~~Optional[str]~~ | `${paths:init_tok2vec}` |
| `raw_text` | str | | `${paths:raw}` | | `raw_text` | ~~Optional[str]~~ | `${paths:raw}` |
| `vectors` | str | | `null` | | `vectors` | ~~Optional[str]~~ | `null` |
| `patience` | int | How many steps to continue without improvement in evaluation score. | `1600` | | `patience` | How many steps to continue without improvement in evaluation score. ~~int~~ | `1600` |
| `max_epochs` | int | Maximum number of epochs to train for. | `0` | | `max_epochs` | Maximum number of epochs to train for. ~~int~~ | `0` |
| `max_steps` | int | Maximum number of update steps to train for. | `20000` | | `max_steps` | Maximum number of update steps to train for. ~~int~~ | `20000` |
| `eval_frequency` | int | How often to evaluate during training (steps). | `200` | | `eval_frequency` | How often to evaluate during training (steps). ~~int~~ | `200` |
| `score_weights` | `Dict[str, float]` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. | `{}` | | `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. ~~Dict[str, float]~~ | `{}` |
| `frozen_components` | `List[str]` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. | `[]` | | `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. ~~List[str]~~ | `[]` |
| `train_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) | | `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) |
| `dev_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) | | `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) |
| `batcher` | callable | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. | [`batch_by_words`](/api/top-level#batch_by_words) | | `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~ | [`batch_by_words`](/api/top-level#batch_by_words) |
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) | | `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
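As a rough sketch, the settings above map onto a `[training]` block like the one below. The values are illustrative only, and the corpus, batcher and optimizer sub-sections are normally filled in with registered functions; here only the optimizer reference is shown:

```python
from thinc.api import Config

# Illustrative [training] block; values are examples, not recommendations.
config_str = """
[training]
seed = 0
dropout = 0.1
accumulate_gradient = 1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
score_weights = {"dep_uas": 0.5, "dep_las": 0.5}
frozen_components = []

[training.optimizer]
@optimizers = "Adam.v1"
"""
config = Config().from_str(config_str)
print(config["training"]["max_steps"])  # 20000
```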
### pretraining {#config-pretraining tag="section,optional"} ### pretraining {#config-pretraining tag="section,optional"}
@ -153,19 +153,19 @@ This section is optional and defines settings and controls for
[language model pretraining](/usage/training#pretraining). It's used when you [language model pretraining](/usage/training#pretraining). It's used when you
run [`spacy pretrain`](/api/cli#pretrain). run [`spacy pretrain`](/api/cli#pretrain).
| Name | Type | Description | Default | | Name | Description | Default |
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- | | ---------------------------- | ----------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
| `max_epochs` | int | Maximum number of epochs. | `1000` | | `max_epochs` | Maximum number of epochs. ~~int~~ | `1000` |
| `min_length` | int | Minimum length of examples. | `5` | | `min_length` | Minimum length of examples. ~~int~~ | `5` |
| `max_length` | int | Maximum length of examples. | `500` | | `max_length` | Maximum length of examples. ~~int~~ | `500` |
| `dropout` | float | The dropout rate. | `0.2` | | `dropout` | The dropout rate. ~~float~~ | `0.2` |
| `n_save_every` | int | Saving frequency. | `null` | | `n_save_every` | Saving frequency. ~~int~~ | `null` |
| `batch_size` | int / `Sequence[int]` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). | `3000` | | `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). ~~Union[int, Sequence[int]]~~ | `3000` |
| `seed` | int | The random seed. | `${system.seed}` | | `seed` | The random seed. ~~int~~ | `${system.seed}` |
| `use_pytorch_for_gpu_memory` | bool | Allocate memory via PyTorch. | `${system:use_pytorch_for_gpu_memory}` | | `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. ~~bool~~ | `${system:use_pytorch_for_gpu_memory}` |
| `tok2vec_model` | str | tok2vec model section in the config. | `"components.tok2vec.model"` | | `tok2vec_model` | tok2vec model section in the config. ~~str~~ | `"components.tok2vec.model"` |
| `objective` | dict | The pretraining objective. | `{"type": "characters", "n_characters": 4}` | | `objective` | The pretraining objective. ~~Dict[str, Any]~~ | `{"type": "characters", "n_characters": 4}` |
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) | | `optimizer` | The optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
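Along the same lines, a `[pretraining]` block using these settings might look like the following sketch (values again illustrative):

```python
from thinc.api import Config

pretrain_str = """
[pretraining]
max_epochs = 1000
min_length = 5
max_length = 500
dropout = 0.2
batch_size = 3000
tok2vec_model = "components.tok2vec.model"

[pretraining.objective]
type = "characters"
n_characters = 4
"""
pretraining = Config().from_str(pretrain_str)["pretraining"]
print(pretraining["objective"])  # {'type': 'characters', 'n_characters': 4}
```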
## Training data {#training} ## Training data {#training}
@ -313,22 +313,22 @@ to keep track of your settings and hyperparameters and your own
> } > }
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `text` | str | Raw text. | | `text` | Raw text. ~~str~~ |
| `words` | `List[str]` | List of gold-standard tokens. | | `words` | List of gold-standard tokens. ~~List[str]~~ |
| `lemmas` | `List[str]` | List of lemmas. | | `lemmas` | List of lemmas. ~~List[str]~~ |
| `spaces` | `List[bool]` | List of boolean values indicating whether the corresponding token is followed by a space or not. | | `spaces` | List of boolean values indicating whether the corresponding token is followed by a space or not. ~~List[bool]~~ |
| `tags` | `List[str]` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). | | `tags` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). ~~List[str]~~ |
| `pos` | `List[str]` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). | | `pos` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). ~~List[str]~~ |
| `morphs` | `List[str]` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). | | `morphs` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). ~~List[str]~~ |
| `sent_starts` | `List[bool]` | List of boolean values indicating whether each token is the first of a sentence or not. | | `sent_starts` | List of boolean values indicating whether each token is the first of a sentence or not. ~~List[bool]~~ |
| `deps` | `List[str]` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. | | `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
| `heads` | `List[int]` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. | | `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
| `entities` | `List[str]` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. | | `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
| `entities` | `List[Tuple[int, int, str]]` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. | | `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
| `cats` | `Dict[str, float]` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. | | `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
| `links` | `Dict[(int, int), Dict]` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. | | `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
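In code, a dictionary in this format is typically converted to an [`Example`](/api/example) with `Example.from_dict`. A minimal sketch, assuming the v3 API (in earlier nightlies the class lives in `spacy.gold` rather than `spacy.training`):

```python
import spacy
from spacy.training import Example  # earlier v3 nightlies: from spacy.gold import Example

nlp = spacy.blank("en")
doc = nlp.make_doc("I like London.")
annotations = {
    "words": ["I", "like", "London", "."],
    "tags": ["PRP", "VBP", "NNP", "."],
    "entities": [(7, 13, "GPE")],  # option 2: (start, end, label) character offsets
    "cats": {"TRAVEL": 1.0},
}
example = Example.from_dict(doc, annotations)
```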
<Infobox title="Notes and caveats"> <Infobox title="Notes and caveats">
@ -390,10 +390,10 @@ provided.
> srsly.write_jsonl("/path/to/text.jsonl", data) > srsly.write_jsonl("/path/to/text.jsonl", data)
> ``` > ```
| Key | Type | Description | | Key | Description |
| -------- | ---- | ---------------------------------------------------------- | | -------- | ------------------------------------------------------------------ |
| `text` | str | The raw input text. Is not required if `tokens` available. | | `text` | The raw input text. Is not required if `tokens` available. ~~str~~ |
| `tokens` | list | Optional tokenization, one string per token. | | `tokens` | Optional tokenization, one string per token. ~~List[str]~~ |
```json ```json
### Example ### Example
@ -44,18 +44,18 @@ A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
with each dictionary describing a node to match. Each pattern should have the with each dictionary describing a node to match. Each pattern should have the
following top-level keys: following top-level keys:
| Name | Type | Description | | Name | Description |
| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). | | `PATTERN` | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). ~~Dict[str, Any]~~ |
| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. | | `SPEC` | The relationships of the nodes in the subtree that should be matched. ~~Dict[str, str]~~ |
The `SPEC` includes the following fields: The `SPEC` includes the following fields:
| Name | Type | Description | | Name | Description |
| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. | | `NODE_NAME` | A unique name for this node to refer to it in other specs. ~~str~~ |
| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. | | `NBOR_RELOP` | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. ~~str~~ |
| `NBOR_NAME` | str | The unique name of the node that this node is connected to. | | `NBOR_NAME` | The unique name of the node that this node is connected to. ~~str~~ |
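Putting these keys together, a two-node pattern looks like the short sketch below; the same pattern is used in the `__call__` example further down:

```python
# Anchor node: the token "founded". Second node: an nsubj child of that anchor.
pattern = [
    {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
    {
        "SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"},
        "PATTERN": {"DEP": "nsubj"},
    },
]
```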
## DependencyMatcher.\_\_init\_\_ {#init tag="method"} ## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
@ -68,9 +68,9 @@ Create a rule-based `DependencyMatcher`.
> matcher = DependencyMatcher(nlp.vocab) > matcher = DependencyMatcher(nlp.vocab)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------- | ------- | ------------------------------------------------------------------------------------------- | | ------- | ----------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
## DependencyMatcher.\_\_call\_\_ {#call tag="method"} ## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
@ -79,9 +79,9 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> #### Example > #### Example
> >
> ```python > ```python
> from spacy.matcher import Matcher > from spacy.matcher import DependencyMatcher
> >
> matcher = Matcher(nlp.vocab) > matcher = DependencyMatcher(nlp.vocab)
> pattern = [ > pattern = [
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}}, > {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}}, > {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
@ -91,10 +91,10 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. | | `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
## DependencyMatcher.\_\_len\_\_ {#len tag="method"} ## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
@ -115,9 +115,9 @@ number of individual patterns.
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------- | | ----------- | ---------------------------- |
| **RETURNS** | int | The number of rules. | | **RETURNS** | The number of rules. ~~int~~ |
## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"} ## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
@ -132,10 +132,10 @@ Check whether the matcher contains rules for a match ID.
> assert "Rule" in matcher > assert "Rule" in matcher
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------------------------- | | ----------- | -------------------------------------------------------------- |
| `key` | str | The match ID. | | `key` | The match ID. ~~str~~ |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | | **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
## DependencyMatcher.add {#add tag="method"} ## DependencyMatcher.add {#add tag="method"}
@ -151,16 +151,16 @@ will be overwritten.
> def on_match(matcher, doc, id, matches): > def on_match(matcher, doc, id, matches):
> print('Matched!', matches) > print('Matched!', matches)
> >
> matcher = Matcher(nlp.vocab) > matcher = DependencyMatcher(nlp.vocab)
> matcher.add("TEST_PATTERNS", patterns) > matcher.add("TEST_PATTERNS", patterns)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. | | `match_id` | An ID for the thing you're matching. ~~str~~ |
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | | `patterns` | Match pattern. A pattern consists of a list of dicts, where each dict describes a `"PATTERN"` and `"SPEC"`. ~~List[List[Dict[str, dict]]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ |
## DependencyMatcher.remove {#remove tag="method"} ## DependencyMatcher.remove {#remove tag="method"}
@ -176,9 +176,9 @@ exist.
> assert "Rule" not in matcher > assert "Rule" not in matcher
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----- | ---- | ------------------------- | | ----- | --------------------------------- |
| `key` | str | The ID of the match rule. | | `key` | The ID of the match rule. ~~str~~ |
## DependencyMatcher.get {#get tag="method"} ## DependencyMatcher.get {#get tag="method"}
@ -192,7 +192,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> on_match, patterns = matcher.get("Rule") > on_match, patterns = matcher.get("Rule")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------- |
| `key` | str | The ID of the match rule. | | `key` | The ID of the match rule. ~~str~~ |
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | | **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ |
@ -48,13 +48,13 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("parser", config=config) > nlp.add_pipe("parser", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` | | `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx
@ -81,16 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description | | Name | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | | `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | | `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ |
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | | `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |
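A minimal sketch of the `nlp.add_pipe` shortcut mentioned above; the direct construction is shown commented out because it assumes you have already built a `model` from a registered architecture:

```python
import spacy

nlp = spacy.blank("en")
# Usual shortcut: add the component by its registered string name. The model is
# created from the default config (TransitionBasedParser).
parser = nlp.add_pipe("parser")

# Direct construction (sketch, assumes an existing `model`):
# from spacy.pipeline import DependencyParser
# parser = DependencyParser(nlp.vocab, model, name="parser")
```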
## DependencyParser.\_\_call\_\_ {#call tag="method"} ## DependencyParser.\_\_call\_\_ {#call tag="method"}
@ -111,10 +111,10 @@ and all pipeline components are applied to the `Doc` in order. Both
> processed = parser(doc) > processed = parser(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## DependencyParser.pipe {#pipe tag="method"} ## DependencyParser.pipe {#pipe tag="method"}
@ -133,12 +133,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `docs` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## DependencyParser.begin_training {#begin_training tag="method"} ## DependencyParser.begin_training {#begin_training tag="method"}
@ -158,13 +158,13 @@ setting up the label scheme based on the data.
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## DependencyParser.predict {#predict tag="method"} ## DependencyParser.predict {#predict tag="method"}
@ -178,10 +178,10 @@ modifying them.
> scores = parser.predict([doc1, doc2]) > scores = parser.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | ---------------------------------------------- | | ----------- | ------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). | | **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ |
## DependencyParser.set_annotations {#set_annotations tag="method"} ## DependencyParser.set_annotations {#set_annotations tag="method"}
@ -195,10 +195,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> parser.set_annotations([doc1, doc2], scores) > parser.set_annotations([doc1, doc2], scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ------------------- | ---------------------------------------------------------- | | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. | | `scores` | The scores to set, produced by `DependencyParser.predict`, given as internal helper objects for the parse state. ~~List[StateClass]~~ |
## DependencyParser.update {#update tag="method"} ## DependencyParser.update {#update tag="method"}
@ -214,15 +214,15 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
> losses = parser.update(examples, sgd=optimizer) > losses = parser.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## DependencyParser.get_loss {#get_loss tag="method"} ## DependencyParser.get_loss {#get_loss tag="method"}
@ -237,11 +237,11 @@ predicted scores.
> loss, d_loss = parser.get_loss(examples, scores) > loss, d_loss = parser.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | `syntax.StateClass` | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## DependencyParser.score {#score tag="method" new="3"} ## DependencyParser.score {#score tag="method" new="3"}
@ -253,10 +253,10 @@ Score a batch of examples.
> scores = parser.score(examples) > scores = parser.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). | | **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## DependencyParser.create_optimizer {#create_optimizer tag="method"} ## DependencyParser.create_optimizer {#create_optimizer tag="method"}
@ -270,9 +270,9 @@ component.
> optimizer = parser.create_optimizer() > optimizer = parser.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## DependencyParser.use_params {#use_params tag="method, contextmanager"} ## DependencyParser.use_params {#use_params tag="method, contextmanager"}
@ -287,9 +287,9 @@ context, the original parameters are restored.
> parser.to_disk("/best_model") > parser.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## DependencyParser.add_label {#add_label tag="method"} ## DependencyParser.add_label {#add_label tag="method"}
@ -302,10 +302,10 @@ Add a new label to the pipe.
> parser.add_label("MY_LABEL") > parser.add_label("MY_LABEL")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------------- | | ----------- | ----------------------------------------------------------- |
| `label` | str | The label to add. | | `label` | The label to add. ~~str~~ |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
## DependencyParser.to_disk {#to_disk tag="method"} ## DependencyParser.to_disk {#to_disk tag="method"}
@ -318,11 +318,11 @@ Serialize the pipe to disk.
> parser.to_disk("/path/to/parser") > parser.to_disk("/path/to/parser")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## DependencyParser.from_disk {#from_disk tag="method"} ## DependencyParser.from_disk {#from_disk tag="method"}
@ -335,12 +335,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> parser.from_disk("/path/to/parser") > parser.from_disk("/path/to/parser")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------ | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | | **RETURNS** | The modified `DependencyParser` object. ~~DependencyParser~~ |
## DependencyParser.to_bytes {#to_bytes tag="method"} ## DependencyParser.to_bytes {#to_bytes tag="method"}
@ -353,11 +353,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. | | **RETURNS** | The serialized form of the `DependencyParser` object. ~~bytes~~ |
## DependencyParser.from_bytes {#from_bytes tag="method"} ## DependencyParser.from_bytes {#from_bytes tag="method"}
@ -371,12 +371,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> parser.from_bytes(parser_bytes) > parser.from_bytes(parser_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------ | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. | | **RETURNS** | The `DependencyParser` object. ~~DependencyParser~~ |
## DependencyParser.labels {#labels tag="property"} ## DependencyParser.labels {#labels tag="property"}
@ -389,9 +389,9 @@ The labels currently added to the component.
> assert "MY_LABEL" in parser.labels > assert "MY_LABEL" in parser.labels
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}
@ -30,11 +30,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
> doc = Doc(nlp.vocab, words=words, spaces=spaces) > doc = Doc(nlp.vocab, words=words, spaces=spaces)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `words` | iterable | A list of strings to add to the container. | | `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. | | `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
## Doc.\_\_getitem\_\_ {#getitem tag="method"} ## Doc.\_\_getitem\_\_ {#getitem tag="method"}
@ -52,10 +52,10 @@ Negative indexing is supported, and follows the usual Python semantics, i.e.
> assert span.text == "it back" > assert span.text == "it back"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | ----------------------- | | ----------- | -------------------------------- |
| `i` | int | The index of the token. | | `i` | The index of the token. ~~int~~ |
| **RETURNS** | `Token` | The token at `doc[i]`. | | **RETURNS** | The token at `doc[i]`. ~~Token~~ |
Get a [`Span`](/api/span) object, starting at position `start` (token index) and Get a [`Span`](/api/span) object, starting at position `start` (token index) and
ending at position `end` (token index). For instance, `doc[2:5]` produces a span ending at position `end` (token index). For instance, `doc[2:5]` produces a span
@ -64,10 +64,10 @@ are not supported, as `Span` objects must be contiguous (cannot have gaps). You
can use negative indices and open-ended ranges, which have their normal Python can use negative indices and open-ended ranges, which have their normal Python
semantics. semantics.
| Name | Type | Description | | Name | Description |
| ----------- | ------ | --------------------------------- | | ----------- | ----------------------------------------------------- |
| `start_end` | tuple | The slice of the document to get. | | `start_end` | The slice of the document to get. ~~Tuple[int, int]~~ |
| **RETURNS** | `Span` | The span at `doc[start:end]`. | | **RETURNS** | The span at `doc[start:end]`. ~~Span~~ |
## Doc.\_\_iter\_\_ {#iter tag="method"} ## Doc.\_\_iter\_\_ {#iter tag="method"}
@ -85,9 +85,9 @@ main way annotations are accessed from Python. If faster-than-Python speeds are
required, you can instead access the annotations as a numpy array, or access the required, you can instead access the annotations as a numpy array, or access the
underlying C data directly from Cython. underlying C data directly from Cython.
| Name | Type | Description | | Name | Description |
| ---------- | ------- | ----------------- | | ---------- | --------------------------- |
| **YIELDS** | `Token` | A `Token` object. | | **YIELDS** | A `Token` object. ~~Token~~ |
## Doc.\_\_len\_\_ {#len tag="method"} ## Doc.\_\_len\_\_ {#len tag="method"}
@ -100,9 +100,9 @@ Get the number of tokens in the document.
> assert len(doc) == 7 > assert len(doc) == 7
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------- | | ----------- | --------------------------------------------- |
| **RETURNS** | int | The number of tokens in the document. | | **RETURNS** | The number of tokens in the document. ~~int~~ |
## Doc.set_extension {#set_extension tag="classmethod" new="2"} ## Doc.set_extension {#set_extension tag="classmethod" new="2"}
@ -120,14 +120,14 @@ details, see the documentation on
> assert doc._.has_city > assert doc._.has_city
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. | | `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. ~~str~~ |
| `default` | - | Optional default value of the attribute if no getter or method is defined. | | `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ |
| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `method` | Set a custom method on the object, for example `doc._.compare(other_doc)`. ~~Optional[Callable[[Doc, ...], Any]]~~ |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | | `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Doc], Any]]~~ |
| `setter` | callable | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. | | `setter` | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. ~~Optional[Callable[[Doc, Any], None]]~~ |
| `force` | bool | Force overwriting existing attribute. | | `force` | Force overwriting existing attribute. ~~bool~~ |
## Doc.get_extension {#get_extension tag="classmethod" new="2"} ## Doc.get_extension {#get_extension tag="classmethod" new="2"}
@ -144,10 +144,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None) > assert extension == (False, None, None, None)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the extension. | | `name` | Name of the extension. ~~str~~ |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | | **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Doc.has_extension {#has_extension tag="classmethod" new="2"} ## Doc.has_extension {#has_extension tag="classmethod" new="2"}
@ -161,10 +161,10 @@ Check whether an extension has been registered on the `Doc` class.
> assert Doc.has_extension("has_city") > assert Doc.has_extension("has_city")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------ | | ----------- | --------------------------------------------------- |
| `name` | str | Name of the extension to check. | | `name` | Name of the extension to check. ~~str~~ |
| **RETURNS** | bool | Whether the extension has been registered. | | **RETURNS** | Whether the extension has been registered. ~~bool~~ |
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@ -179,10 +179,10 @@ Remove a previously registered extension.
> assert not Doc.has_extension("has_city") > assert not Doc.has_extension("has_city")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the extension. | | `name` | Name of the extension. ~~str~~ |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | | **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Doc.char_span {#char_span tag="method" new="2"} ## Doc.char_span {#char_span tag="method" new="2"}
@ -197,14 +197,14 @@ the character indices don't map to a valid span.
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- | | ------------------------------------ | ----------------------------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | int | The index of the last character after the span. | | `end` | The index of the last character after the span. ~~int~~ |
| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | `Span` | The newly constructed object or `None`. | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Doc.similarity {#similarity tag="method" model="vectors"} ## Doc.similarity {#similarity tag="method" model="vectors"}
@ -221,10 +221,10 @@ using an average of word vectors.
> assert apples_oranges == oranges_apples > assert apples_oranges == oranges_apples
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | | `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
| **RETURNS** | float | A scalar similarity score. Higher is more similar. | | **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
## Doc.count_by {#count_by tag="method"} ## Doc.count_by {#count_by tag="method"}
@ -237,15 +237,15 @@ attribute ID.
> ```python > ```python
> from spacy.attrs import ORTH > from spacy.attrs import ORTH
> doc = nlp("apple apple orange banana") > doc = nlp("apple apple orange banana")
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2} > assert doc.count_by(ORTH) == {7024: 1, 119552: 1, 2087: 2}
> doc.to_array([ORTH]) > doc.to_array([ORTH])
> # array([[11880], [11880], [7561], [12800]]) > # array([[11880], [11880], [7561], [12800]])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------------- | | ----------- | --------------------------------------------------------------------- |
| `attr_id` | int | The attribute ID | | `attr_id` | The attribute ID. ~~int~~ |
| **RETURNS** | dict | A dictionary mapping attributes to integer counts. | | **RETURNS** | A dictionary mapping attributes to integer counts. ~~Dict[int, int]~~ |
## Doc.get_lca_matrix {#get_lca_matrix tag="method"} ## Doc.get_lca_matrix {#get_lca_matrix tag="method"}
@ -261,9 +261,9 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32) > # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | ----------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------- |
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. | | **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
## Doc.to_array {#to_array tag="method"} ## Doc.to_array {#to_array tag="method"}
@ -288,10 +288,10 @@ Returns a 2D array with one row per token and one column per attribute (when
> np_array = doc.to_array("POS") > np_array = doc.to_array("POS")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `attr_ids` | list or int or string | A list of attributes (int IDs or string names) or a single attribute (int ID or string name) | | `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ |
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="uint64"]` or `numpy.ndarray[ndim=1, dtype="uint64"]` | The exported attributes as a numpy array. | | **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ |
## Doc.from_array {#from_array tag="method"} ## Doc.from_array {#from_array tag="method"}
@ -310,15 +310,17 @@ array of attributes.
> assert doc[0].pos_ == doc2[0].pos_ > assert doc[0].pos_ == doc2[0].pos_
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | ------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------- |
| `attrs` | list | A list of attribute ID ints. | | `attrs` | A list of attribute ID ints. ~~List[int]~~ |
| `array` | `numpy.ndarray[ndim=2, dtype="int32"]` | The attribute values to load. | | `array` | The attribute values to load. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Doc` | Itself. | | **RETURNS** | The `Doc` itself. ~~Doc~~ |
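A sketch of the `to_array`/`from_array` round trip, using a blank pipeline and an illustrative attribute selection:

```python
import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("Hello world.")

attrs = [LOWER, POS, ENT_TYPE, IS_ALPHA]
array = doc.to_array(attrs)  # shape (n_tokens, n_attrs), dtype uint64

# rebuild a Doc over the same words and load the exported values back in
doc2 = Doc(doc.vocab, words=[t.text for t in doc])
doc2.from_array(attrs, array)
assert doc[0].pos_ == doc2[0].pos_
```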
## Doc.from_docs {#from_docs tag="staticmethod"} ## Doc.from_docs {#from_docs tag="staticmethod"}
<!-- TODO: When was this added? -->
Concatenate multiple `Doc` objects to form a new one. Raises an error if the Concatenate multiple `Doc` objects to form a new one. Raises an error if the
`Doc` objects do not all share the same `Vocab`. `Doc` objects do not all share the same `Vocab`.
@ -337,12 +339,12 @@ Concatenate multiple `Doc` objects to form a new one. Raises an error if the
> [str(ent) for doc in docs for ent in doc.ents] > [str(ent) for doc in docs for ent in doc.ents]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- | | ------------------- | ----------------------------------------------------------------------------------------------------------------- |
| `docs` | list | A list of `Doc` objects. | | `docs` | A list of `Doc` objects. ~~List[Doc]~~ |
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. | | `ensure_whitespace` | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. ~~bool~~ |
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. | | `attrs` | Optional list of attribute ID ints or attribute name strings. ~~Optional[List[Union[str, int]]]~~ |
| **RETURNS** | `Doc` | The new `Doc` object that is containing the other docs or `None`, if `docs` is empty or `None`. | | **RETURNS** | The new `Doc` object containing the other docs, or `None` if `docs` is empty or `None`. ~~Optional[Doc]~~ |
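A minimal, runnable sketch of `from_docs` with two docs that share the same vocab (texts are illustrative):

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc1 = nlp("Hello world!")
doc2 = nlp("This is another doc.")

# both docs were created by the same nlp object, so they share one Vocab
combined = Doc.from_docs([doc1, doc2], ensure_whitespace=True)
print(combined.text)  # "Hello world! This is another doc."
```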
## Doc.to_disk {#to_disk tag="method" new="2"} ## Doc.to_disk {#to_disk tag="method" new="2"}
@ -354,11 +356,11 @@ Save the current state to a directory.
> doc.to_disk("/path/to/doc") > doc.to_disk("/path/to/doc")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Doc.from_disk {#from_disk tag="method" new="2"} ## Doc.from_disk {#from_disk tag="method" new="2"}
@ -372,12 +374,12 @@ Loads state from a directory. Modifies the object in place and returns it.
> doc = Doc(Vocab()).from_disk("/path/to/doc") > doc = Doc(Vocab()).from_disk("/path/to/doc")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Doc` | The modified `Doc` object. | | **RETURNS** | The modified `Doc` object. ~~Doc~~ |
## Doc.to_bytes {#to_bytes tag="method"} ## Doc.to_bytes {#to_bytes tag="method"}
@ -390,11 +392,11 @@ Serialize, i.e. export the document contents to a binary string.
> doc_bytes = doc.to_bytes() > doc_bytes = doc.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. | | **RETURNS** | A losslessly serialized copy of the `Doc`, including all annotations. ~~bytes~~ |
## Doc.from_bytes {#from_bytes tag="method"} ## Doc.from_bytes {#from_bytes tag="method"}
@ -410,12 +412,12 @@ Deserialize, i.e. import the document contents from a binary string.
> assert doc.text == doc2.text > assert doc.text == doc2.text
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `data` | bytes | The string to load from. | | `data` | The string to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Doc` | The `Doc` object. | | **RETURNS** | The `Doc` object. ~~Doc~~ |
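The bytes round trip in one piece, sketched with a blank pipeline; reusing the same `Vocab` keeps the string store consistent:

```python
import spacy
from spacy.tokens import Doc

nlp = spacy.blank("en")
doc = nlp("Give it back! He pleaded.")

doc_bytes = doc.to_bytes()
# deserializing needs a Vocab; reusing nlp.vocab keeps the string store consistent
doc2 = Doc(nlp.vocab).from_bytes(doc_bytes)
assert doc.text == doc2.text
```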
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"} ## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
@ -433,9 +435,9 @@ invalidated, although they may accidentally continue to work.
> retokenizer.merge(doc[0:2]) > retokenizer.merge(doc[0:2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------- | ---------------- | | ----------- | -------------------------------- |
| **RETURNS** | `Retokenizer` | The retokenizer. | | **RETURNS** | The retokenizer. ~~Retokenizer~~ |
### Retokenizer.merge {#retokenizer.merge tag="method"} ### Retokenizer.merge {#retokenizer.merge tag="method"}
@ -454,10 +456,10 @@ dictionary mapping attribute names to values as the `"_"` key.
> retokenizer.merge(doc[2:4], attrs=attrs) > retokenizer.merge(doc[2:4], attrs=attrs)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------- | ------ | -------------------------------------- | | ------- | --------------------------------------------------------------------- |
| `span` | `Span` | The span to merge. | | `span` | The span to merge. ~~Span~~ |
| `attrs` | dict | Attributes to set on the merged token. | | `attrs` | Attributes to set on the merged token. ~~Dict[Union[str, int], Any]~~ |
### Retokenizer.split {#retokenizer.split tag="method"} ### Retokenizer.split {#retokenizer.split tag="method"}
@ -488,33 +490,12 @@ underlying lexeme (if they're context-independent lexical attributes like
> retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs) > retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------- | ------- | ----------------------------------------------------------------------------------------------------------- | | ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
| `token` | `Token` | The token to split. | | `token` | The token to split. ~~Token~~ |
| `orths` | list | The verbatim text of the split tokens. Needs to match the text of the original token. | | `orths` | The verbatim text of the split tokens. Needs to match the text of the original token. ~~List[str]~~ |
| `heads` | list | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. | | `heads` | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. ~~List[Union[Token, Tuple[Token, int]]]~~ |
| `attrs` | dict | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. | | `attrs` | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. ~~Dict[Union[str, int], List[Any]]~~ |
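A runnable sketch of splitting a merged token, assuming a blank English pipeline (the head and attribute values are illustrative, not predictions):

```python
import spacy

nlp = spacy.blank("en")
doc = nlp("I live in NewYork")

with doc.retokenize() as retokenizer:
    # "New" attaches to the second subtoken ("York"); "York" attaches to "in"
    heads = [(doc[3], 1), doc[2]]
    attrs = {"POS": ["PROPN", "PROPN"], "DEP": ["compound", "pobj"]}
    retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)

assert [t.text for t in doc] == ["I", "live", "in", "New", "York"]
```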
## Doc.merge {#merge tag="method"}
Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
is merged into a single token. If `start_idx` and `end_idx` do not mark start
and end token boundaries, the document remains unchanged.
> #### Example
>
> ```python
> doc = nlp("Los Angeles start.")
> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE")
> assert [t.text for t in doc] == ["Los Angeles", "start", "."]
> ```
| Name | Type | Description |
| -------------- | ------- | ------------------------------------------------------------------------------------------------------------------------- |
| `start_idx` | int | The character index of the start of the slice to merge. |
| `end_idx` | int | The character index after the end of the slice to merge. |
| `**attributes` | - | Attributes to assign to the merged token. By default, attributes are inherited from the syntactic root token of the span. |
| **RETURNS** | `Token` | The newly merged token, or `None` if the start and end indices did not fall at token boundaries |
## Doc.ents {#ents tag="property" model="NER"} ## Doc.ents {#ents tag="property" model="NER"}
@ -531,9 +512,9 @@ objects, if the entity recognizer has been applied.
> assert ents[0].text == "Mr. Best" > assert ents[0].text == "Mr. Best"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------ | | ----------- | --------------------------------------------------------------------- |
| **RETURNS** | tuple | Entities in the document, one `Span` per entity. | | **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ |
## Doc.noun_chunks {#noun_chunks tag="property" model="parser"} ## Doc.noun_chunks {#noun_chunks tag="property" model="parser"}
@ -552,9 +533,9 @@ relative clauses.
> assert chunks[1].text == "another phrase" > assert chunks[1].text == "another phrase"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------ | ---------------------------- | | ---------- | ------------------------------------- |
| **YIELDS** | `Span` | Noun chunks in the document. | | **YIELDS** | Noun chunks in the document. ~~Span~~ |
## Doc.sents {#sents tag="property" model="parser"} ## Doc.sents {#sents tag="property" model="parser"}
@ -572,9 +553,9 @@ will be unavailable.
> assert [s.root.text for s in sents] == ["is", "'s"] > assert [s.root.text for s in sents] == ["is", "'s"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------ | -------------------------- | | ---------- | ----------------------------------- |
| **YIELDS** | `Span` | Sentences in the document. | | **YIELDS** | Sentences in the document. ~~Span~~ |
## Doc.has_vector {#has_vector tag="property" model="vectors"} ## Doc.has_vector {#has_vector tag="property" model="vectors"}
@ -587,9 +568,9 @@ A boolean value indicating whether a word vector is associated with the object.
> assert doc.has_vector > assert doc.has_vector
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------------ | | ----------- | --------------------------------------------------------- |
| **RETURNS** | bool | Whether the document has a vector data attached. | | **RETURNS** | Whether the document has vector data attached. ~~bool~~ |
## Doc.vector {#vector tag="property" model="vectors"} ## Doc.vector {#vector tag="property" model="vectors"}
@ -604,9 +585,9 @@ vectors.
> assert doc.vector.shape == (300,) > assert doc.vector.shape == (300,)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------------------------- | ------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------- |
| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the document's semantics. | | **RETURNS** | A 1-dimensional array representing the document's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Doc.vector_norm {#vector_norm tag="property" model="vectors"} ## Doc.vector_norm {#vector_norm tag="property" model="vectors"}
@ -622,32 +603,32 @@ The L2 norm of the document's vector representation.
> assert doc1.vector_norm != doc2.vector_norm > assert doc1.vector_norm != doc2.vector_norm
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ----------------------------------------- | | ----------- | --------------------------------------------------- |
| **RETURNS** | float | The L2 norm of the vector representation. | | **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | str | A string representation of the document text. | | `text` | A string representation of the document text. ~~str~~ |
| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
| `vocab` | `Vocab` | The store of lexical types. | | `vocab` | The store of lexical types. ~~Vocab~~ |
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. | | `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | | `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
| `user_data` | - | A generic storage area, for user custom data. | | `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. | | `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
| `lang_` <Tag variant="new">2.1</Tag> | str | Language of the document's vocabulary. | | `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | | `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | | `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | | `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ |
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. | | `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ |
| `sentiment` | float | The document's positivity/negativity score, if available. | | `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. | | `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. | | `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. | | `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
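Following up on the `_` attribute above: the user space is populated via `Doc.set_extension`. A small sketch with a made-up attribute name:

```python
import spacy
from spacy.tokens import Doc

# "my_flag" is a made-up extension name used purely for illustration
Doc.set_extension("my_flag", default=False)

nlp = spacy.blank("en")
doc = nlp("Hello world")
doc._.my_flag = True
assert doc._.my_flag
```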
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations.
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"]) > doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. | | `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | | `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~ |
| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. | | `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ |
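A minimal sketch of building a `DocBin`, serializing it to bytes and reading the docs back (a blank pipeline is assumed):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], store_user_data=False)
doc_bin.add(nlp("Hello world"))
doc_bin.add(nlp("This is a sentence."))

data = doc_bin.to_bytes()
# restore on the receiving side, e.g. in another process
doc_bin2 = DocBin().from_bytes(data)
docs = list(doc_bin2.get_docs(nlp.vocab))
assert len(docs) == 2
```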
## DocBin.\_\_len\_\_ {#len tag="method"} ## DocBin.\_\_len\_\_ {#len tag="method"}
@ -63,9 +63,9 @@ Get the number of `Doc` objects that were added to the `DocBin`.
> assert len(doc_bin) == 1 > assert len(doc_bin) == 1
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| ----------- | ---- | ------------------------------------------- | | ----------- | --------------------------------------------------- |
| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. | | **RETURNS** | The number of `Doc`s added to the `DocBin`. ~~int~~ |
## DocBin.add {#add tag="method"} ## DocBin.add {#add tag="method"}
@ -79,9 +79,9 @@ Add a `Doc`'s annotations to the `DocBin` for serialization.
> doc_bin.add(doc) > doc_bin.add(doc)
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| -------- | ----- | ------------------------ | | -------- | -------------------------------- |
| `doc` | `Doc` | The `Doc` object to add. | | `doc` | The `Doc` object to add. ~~Doc~~ |
## DocBin.get_docs {#get_docs tag="method"} ## DocBin.get_docs {#get_docs tag="method"}
@ -93,15 +93,15 @@ Recover `Doc` objects from the annotations, using the given vocab.
> docs = list(doc_bin.get_docs(nlp.vocab)) > docs = list(doc_bin.get_docs(nlp.vocab))
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| ---------- | ------- | ------------------ | | ---------- | --------------------------- |
| `vocab` | `Vocab` | The shared vocab. | | `vocab` | The shared vocab. ~~Vocab~~ |
| **YIELDS** | `Doc` | The `Doc` objects. | | **YIELDS** | The `Doc` objects. ~~Doc~~ |
## DocBin.merge {#merge tag="method"} ## DocBin.merge {#merge tag="method"}
Extend the annotations of this `DocBin` with the annotations from another. Will Extend the annotations of this `DocBin` with the annotations from another. Will
raise an error if the pre-defined attrs of the two `DocBin`s don't match. raise an error if the pre-defined `attrs` of the two `DocBin`s don't match.
> #### Example > #### Example
> >
@ -114,9 +114,9 @@ raise an error if the pre-defined attrs of the two `DocBin`s don't match.
> assert len(doc_bin1) == 2 > assert len(doc_bin1) == 2
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| -------- | -------- | ------------------------------------------- | | -------- | ------------------------------------------------------ |
| `other` | `DocBin` | The `DocBin` to merge into the current bin. | | `other` | The `DocBin` to merge into the current bin. ~~DocBin~~ |
## DocBin.to_bytes {#to_bytes tag="method"} ## DocBin.to_bytes {#to_bytes tag="method"}
@ -130,9 +130,9 @@ Serialize the `DocBin`'s annotations to a bytestring.
> doc_bin_bytes = doc_bin.to_bytes() > doc_bin_bytes = doc_bin.to_bytes()
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| ----------- | ----- | ------------------------ | | ----------- | ---------------------------------- |
| **RETURNS** | bytes | The serialized `DocBin`. | | **RETURNS** | The serialized `DocBin`. ~~bytes~~ |
## DocBin.from_bytes {#from_bytes tag="method"} ## DocBin.from_bytes {#from_bytes tag="method"}
@ -145,10 +145,10 @@ Deserialize the `DocBin`'s annotations from a bytestring.
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes) > new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| ------------ | -------- | ---------------------- | | ------------ | -------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| **RETURNS** | `DocBin` | The loaded `DocBin`. | | **RETURNS** | The loaded `DocBin`. ~~DocBin~~ |
## DocBin.to_disk {#to_disk tag="method" new="3"} ## DocBin.to_disk {#to_disk tag="method" new="3"}
@ -164,9 +164,9 @@ and the result can be used as the input data for
> doc_bin.to_disk("./data.spacy") > doc_bin.to_disk("./data.spacy")
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| -------- | ------------ | ----------------------------------------------------- | | -------- | -------------------------------------------------------------------------- |
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. | | `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ |
## DocBin.from_disk {#from_disk tag="method" new="3"} ## DocBin.from_disk {#from_disk tag="method" new="3"}
@ -178,7 +178,7 @@ Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
> doc_bin = DocBin().from_disk("./data.spacy") > doc_bin = DocBin().from_disk("./data.spacy")
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| ----------- | ------------ | ----------------------------------------------------- | | ----------- | -------------------------------------------------------------------------- |
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. | | `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ |
| **RETURNS** | `DocBin` | The loaded `DocBin`. | | **RETURNS** | The loaded `DocBin`. ~~DocBin~~ |
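Putting `to_disk` and `from_disk` together, a sketch of the typical `.spacy` corpus round trip (the path is a placeholder):

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
doc_bin = DocBin(docs=[nlp("Hello world"), nlp("Another doc.")])
doc_bin.to_disk("./data.spacy")  # placeholder path

# later, e.g. to inspect the serialized corpus
doc_bin = DocBin().from_disk("./data.spacy")
docs = list(doc_bin.get_docs(nlp.vocab))
```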

View File

@ -40,14 +40,13 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("entity_linker", config=config) > nlp.add_pipe("entity_linker", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ---------------- | -------------------------------------------------------- | --------------------------------------------------------------------------- | ------------------------------------------------------ | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | | `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | | `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | An empty KnowledgeBase with `entity_vector_length` 64. | | `kb` | The [`KnowledgeBase`](/api/kb). Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~KnowledgeBase~~ |
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. |
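For reference, the settings above can be overridden with a plain config dict when adding the component; a sketch using the documented defaults:

```python
import spacy

nlp = spacy.blank("en")
# values are the documented defaults; "model" and "kb" are omitted so the
# registered defaults (the EntityLinker model and EmptyKB) are used
config = {"labels_discard": [], "incl_prior": True, "incl_context": True}
nlp.add_pipe("entity_linker", config=config)
```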
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
@ -66,7 +65,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
> entity_linker = nlp.add_pipe("entity_linker", config=config) > entity_linker = nlp.add_pipe("entity_linker", config=config)
> >
> # Construction via add_pipe with custom KB and candidate generation > # Construction via add_pipe with custom KB and candidate generation
> config = {"kb_loader": {"@assets": "my_kb.v1"}, "get_candidates": {"@assets": "my_candidates.v1"},} > config = {"kb": {"@assets": "my_kb.v1"}}
> entity_linker = nlp.add_pipe("entity_linker", config=config) > entity_linker = nlp.add_pipe("entity_linker", config=config)
> >
> # Construction from class > # Construction from class
@ -76,22 +75,20 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
Create a new pipeline instance. In your application, you would normally use a Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal
`KnowledgeBase` as well as the Candidate generator can be customized by
providing custom registered functions.
Note that both the internal KB as well as the Candidate generator can be | Name | Description |
customized by providing custom registered functions. | ---------------- | --------------------------------------------------------------------------------------------------- |
| `vocab` | The shared vocabulary. ~~Vocab~~ |
| Name | Type | Description | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------- | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `vocab` | `Vocab` | The shared vocabulary. | | _keyword-only_ | |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `kb` | The [`KnowledgeBase`](/api/kb). ~~KnowledgeBase~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
| _keyword-only_ | | | | `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | | `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. |
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. |
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. |
| `incl_context` | bool | Whether or not to include the local context in the model. |
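As a sketch of what a tiny `KnowledgeBase` might look like before it is handed to the component (the entity ID, frequency and vector are placeholders; wiring it in goes through the `kb` setting or a registered function, as noted above):

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
# entity ID, frequency and vector are made up for illustration
kb.add_entity(entity="Q42", freq=12, entity_vector=[0.0] * 64)
kb.add_alias(alias="Douglas Adams", entities=["Q42"], probabilities=[1.0])
```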
## EntityLinker.\_\_call\_\_ {#call tag="method"} ## EntityLinker.\_\_call\_\_ {#call tag="method"}
@ -111,10 +108,10 @@ delegate to the [`predict`](/api/entitylinker#predict) and
> processed = entity_linker(doc) > processed = entity_linker(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## EntityLinker.pipe {#pipe tag="method"} ## EntityLinker.pipe {#pipe tag="method"}
@ -133,12 +130,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityLinker.begin_training {#begin_training tag="method"} ## EntityLinker.begin_training {#begin_training tag="method"}
@ -158,13 +155,13 @@ setting up the label scheme based on the data.
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## EntityLinker.predict {#predict tag="method"} ## EntityLinker.predict {#predict tag="method"}
@ -179,10 +176,10 @@ if there is no prediction.
> kb_ids = entity_linker.predict([doc1, doc2]) > kb_ids = entity_linker.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ------------------------------------------------------------ | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. | | **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ |
## EntityLinker.set_annotations {#set_annotations tag="method"} ## EntityLinker.set_annotations {#set_annotations tag="method"}
@ -197,10 +194,10 @@ entities.
> entity_linker.set_annotations([doc1, doc2], kb_ids) > entity_linker.set_annotations([doc1, doc2], kb_ids)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ------------------------------------------------------------------------------------------------- | | -------- | --------------------------------------------------------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `kb_ids` | `List[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. | | `kb_ids` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. ~~List[str]~~ |
## EntityLinker.update {#update tag="method"} ## EntityLinker.update {#update tag="method"}
@ -216,15 +213,15 @@ pipe's entity linking model and context encoder. Delegates to
> losses = entity_linker.update(examples, sgd=optimizer) > losses = entity_linker.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## EntityLinker.create_optimizer {#create_optimizer tag="method"} ## EntityLinker.create_optimizer {#create_optimizer tag="method"}
@ -237,9 +234,9 @@ Create an optimizer for the pipeline component.
> optimizer = entity_linker.create_optimizer() > optimizer = entity_linker.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## EntityLinker.use_params {#use_params tag="method, contextmanager"} ## EntityLinker.use_params {#use_params tag="method, contextmanager"}
@ -254,9 +251,9 @@ context, the original parameters are restored.
> entity_linker.to_disk("/best_model") > entity_linker.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## EntityLinker.to_disk {#to_disk tag="method"} ## EntityLinker.to_disk {#to_disk tag="method"}
@ -269,11 +266,11 @@ Serialize the pipe to disk.
> entity_linker.to_disk("/path/to/entity_linker") > entity_linker.to_disk("/path/to/entity_linker")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## EntityLinker.from_disk {#from_disk tag="method"} ## EntityLinker.from_disk {#from_disk tag="method"}
@ -286,12 +283,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> entity_linker.from_disk("/path/to/entity_linker") > entity_linker.from_disk("/path/to/entity_linker")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | | **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -41,11 +41,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("ner", config=config) > nlp.add_pipe("ner", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | | `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx
@ -72,14 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description | | Name | Description |
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | | `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. | | `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
## EntityRecognizer.\_\_call\_\_ {#call tag="method"} ## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
@ -100,10 +100,10 @@ and all pipeline components are applied to the `Doc` in order. Both
> processed = ner(doc) > processed = ner(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## EntityRecognizer.pipe {#pipe tag="method"} ## EntityRecognizer.pipe {#pipe tag="method"}
@ -122,12 +122,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | A stream of documents. | | `docs` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## EntityRecognizer.begin_training {#begin_training tag="method"} ## EntityRecognizer.begin_training {#begin_training tag="method"}
@ -147,13 +147,13 @@ setting up the label scheme based on the data.
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## EntityRecognizer.predict {#predict tag="method"} ## EntityRecognizer.predict {#predict tag="method"}
@ -167,10 +167,10 @@ modifying them.
> scores = ner.predict([doc1, doc2]) > scores = ner.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). | | **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ |
## EntityRecognizer.set_annotations {#set_annotations tag="method"} ## EntityRecognizer.set_annotations {#set_annotations tag="method"}
@ -184,10 +184,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> ner.set_annotations([doc1, doc2], scores) > ner.set_annotations([doc1, doc2], scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ------------------ | ---------------------------------------------------------- | | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. | | `scores` | The scores to set, produced by `EntityRecognizer.predict`. `StateClass` is an internal helper class for the parse state. ~~List[StateClass]~~ |
## EntityRecognizer.update {#update tag="method"} ## EntityRecognizer.update {#update tag="method"}
@ -203,15 +203,15 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
> losses = ner.update(examples, sgd=optimizer) > losses = ner.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
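A single training step sketched end to end; the annotation is made up, and the `Example` import path reflects spaCy v3 and may differ in older nightlies:

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("ORG")
optimizer = nlp.begin_training()

doc = nlp.make_doc("Apple is looking at buying a U.K. startup")
example = Example.from_dict(doc, {"entities": [(0, 5, "ORG")]})

losses = {}
ner.update([example], drop=0.2, sgd=optimizer, losses=losses)
print(losses)  # {"ner": ...}
```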
## EntityRecognizer.get_loss {#get_loss tag="method"} ## EntityRecognizer.get_loss {#get_loss tag="method"}
@ -226,11 +226,11 @@ predicted scores.
> loss, d_loss = ner.get_loss(examples, scores) > loss, d_loss = ner.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | `List[StateClass]` | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. ~~StateClass~~ |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## EntityRecognizer.score {#score tag="method" new="3"} ## EntityRecognizer.score {#score tag="method" new="3"}
@ -242,10 +242,10 @@ Score a batch of examples.
> scores = ner.score(examples) > scores = ner.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | ------------------------------------------------------------------------ | | ----------- | ---------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). | | **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"} ## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
@ -258,9 +258,9 @@ Create an optimizer for the pipeline component.
> optimizer = ner.create_optimizer() > optimizer = ner.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"} ## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
@ -275,9 +275,9 @@ context, the original parameters are restored.
> ner.to_disk("/best_model") > ner.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## EntityRecognizer.add_label {#add_label tag="method"} ## EntityRecognizer.add_label {#add_label tag="method"}
@ -290,10 +290,10 @@ Add a new label to the pipe.
> ner.add_label("MY_LABEL") > ner.add_label("MY_LABEL")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------------- | | ----------- | ----------------------------------------------------------- |
| `label` | str | The label to add. | | `label` | The label to add. ~~str~~ |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
## EntityRecognizer.to_disk {#to_disk tag="method"} ## EntityRecognizer.to_disk {#to_disk tag="method"}
@ -306,11 +306,11 @@ Serialize the pipe to disk.
> ner.to_disk("/path/to/ner") > ner.to_disk("/path/to/ner")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## EntityRecognizer.from_disk {#from_disk tag="method"} ## EntityRecognizer.from_disk {#from_disk tag="method"}
@ -323,12 +323,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> ner.from_disk("/path/to/ner") > ner.from_disk("/path/to/ner")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------ | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | | **RETURNS** | The modified `EntityRecognizer` object. ~~EntityRecognizer~~ |
## EntityRecognizer.to_bytes {#to_bytes tag="method"} ## EntityRecognizer.to_bytes {#to_bytes tag="method"}
@ -341,11 +341,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. | | **RETURNS** | The serialized form of the `EntityRecognizer` object. ~~bytes~~ |
## EntityRecognizer.from_bytes {#from_bytes tag="method"} ## EntityRecognizer.from_bytes {#from_bytes tag="method"}
@ -359,12 +359,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ner.from_bytes(ner_bytes) > ner.from_bytes(ner_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------ | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. | | **RETURNS** | The `EntityRecognizer` object. ~~EntityRecognizer~~ |
## EntityRecognizer.labels {#labels tag="property"} ## EntityRecognizer.labels {#labels tag="property"}
@ -377,9 +377,9 @@ The labels currently added to the component.
> assert "MY_LABEL" in ner.labels > assert "MY_LABEL" in ner.labels
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -34,12 +34,12 @@ how the component should be configured. You can override its settings via the
> nlp.add_pipe("entity_ruler", config=config) > nlp.add_pipe("entity_ruler", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `phrase_matcher_attr` | str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` | | `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` | | `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py
@ -63,16 +63,16 @@ be a token pattern (list) or a phrase pattern (string). For example:
> ruler = EntityRuler(nlp, overwrite_ents=True) > ruler = EntityRuler(nlp, overwrite_ents=True)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. | | `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
| `name` <Tag variant="new">3</Tag> | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. | | `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `phrase_matcher_attr` | int / str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. | | `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. | | `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. | | `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
| `ent_id_sep` | str | Separator used internally for entity IDs. Defaults to `"||"`. | | `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
| `patterns` | iterable | Optional patterns to load in on initialization. | | `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
## EntityRuler.\_\_len\_\_ {#len tag="method"} ## EntityRuler.\_\_len\_\_ {#len tag="method"}
@ -87,9 +87,9 @@ The number of all patterns added to the entity ruler.
> assert len(ruler) == 1 > assert len(ruler) == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------- | | ----------- | ------------------------------- |
| **RETURNS** | int | The number of patterns. | | **RETURNS** | The number of patterns. ~~int~~ |
## EntityRuler.\_\_contains\_\_ {#contains tag="method"} ## EntityRuler.\_\_contains\_\_ {#contains tag="method"}
@ -104,10 +104,10 @@ Whether a label is present in the patterns.
> assert not "PERSON" in ruler > assert not "PERSON" in ruler
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------- | | ----------- | ----------------------------------------------------- |
| `label` | str | The label to check. | | `label` | The label to check. ~~str~~ |
| **RETURNS** | bool | Whether the entity ruler contains the label. | | **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
## EntityRuler.\_\_call\_\_ {#call tag="method"} ## EntityRuler.\_\_call\_\_ {#call tag="method"}
@ -130,10 +130,10 @@ is chosen.
> assert ents == [("Apple", "ORG")] > assert ents == [("Apple", "ORG")]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------ | | ----------- | -------------------------------------------------------------------- |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| **RETURNS** | `Doc` | The modified `Doc` with added entities, if available. | | **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~ |
## EntityRuler.add_patterns {#add_patterns tag="method"} ## EntityRuler.add_patterns {#add_patterns tag="method"}
@ -152,9 +152,9 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on
> ruler.add_patterns(patterns) > ruler.add_patterns(patterns)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---- | -------------------- | | ---------- | ---------------------------------------------------------------- |
| `patterns` | list | The patterns to add. | | `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
## EntityRuler.to_disk {#to_disk tag="method"} ## EntityRuler.to_disk {#to_disk tag="method"}
@ -171,9 +171,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config > ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- | | ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## EntityRuler.from_disk {#from_disk tag="method"} ## EntityRuler.from_disk {#from_disk tag="method"}
@ -190,10 +190,10 @@ configuration.
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config > ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------- | ---------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | | **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ |
## EntityRuler.to_bytes {#to_bytes tag="method"} ## EntityRuler.to_bytes {#to_bytes tag="method"}
@ -206,9 +206,9 @@ Serialize the entity ruler patterns to a bytestring.
> ruler_bytes = ruler.to_bytes() > ruler_bytes = ruler.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | ---------------------------------- |
| **RETURNS** | bytes | The serialized patterns. | | **RETURNS** | The serialized patterns. ~~bytes~~ |
## EntityRuler.from_bytes {#from_bytes tag="method"} ## EntityRuler.from_bytes {#from_bytes tag="method"}
@ -222,40 +222,40 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> ruler.from_bytes(ruler_bytes) > ruler.from_bytes(ruler_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ------------- | ---------------------------------- | | ------------ | -------------------------------------------------- |
| `bytes_data` | bytes | The bytestring to load. | | `bytes_data` | The bytestring to load. ~~bytes~~ |
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | | **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ |
## EntityRuler.labels {#labels tag="property"} ## EntityRuler.labels {#labels tag="property"}
All labels present in the match patterns. All labels present in the match patterns.
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------ | | ----------- | -------------------------------------- |
| **RETURNS** | tuple | The string labels. | | **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
## EntityRuler.ent_ids {#labels tag="property" new="2.2.2"} ## EntityRuler.ent_ids {#labels tag="property" new="2.2.2"}
All entity ids present in the match patterns `id` properties. All entity IDs present in the `id` properties of the match patterns.
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------- | | ----------- | ----------------------------------- |
| **RETURNS** | tuple | The string ent_ids. | | **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
## EntityRuler.patterns {#patterns tag="property"} ## EntityRuler.patterns {#patterns tag="property"}
Get all patterns that were added to the entity ruler. Get all patterns that were added to the entity ruler.
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------- |
| **RETURNS** | list | The original patterns, one dictionary per pattern. | | **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
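To see how these three properties relate, here is a minimal sketch. It assumes a blank English pipeline and two made-up patterns with `id` values; the labels and IDs are purely illustrative.

```python
from spacy.lang.en import English

nlp = English()
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([
    {"label": "ORG", "pattern": "Apple", "id": "apple"},
    {"label": "GPE", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"},
])
assert "ORG" in ruler.labels        # labels from the match patterns
assert "apple" in ruler.ent_ids     # IDs from the patterns' "id" properties
assert len(ruler.patterns) == 2     # the original patterns, one dict per pattern
```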
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| ----------------- | ------------------------------------- | ---------------------------------------------------------------- | | ----------------- | --------------------------------------------------------------------------------------------------------------------- |
| `matcher` | [`Matcher`](/api/matcher) | The underlying matcher used to process token patterns. | | `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
| `phrase_matcher` | [`PhraseMatcher`](/api/phrasematcher) | The underlying phrase matcher, used to process phrase patterns. | | `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ |
| `token_patterns` | dict | The token patterns present in the entity ruler, keyed by label. | | `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]]~~ |
| `phrase_patterns` | dict | The phrase patterns present in the entity ruler, keyed by label. | | `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

View File

@ -8,9 +8,9 @@ new: 3.0
An `Example` holds the information for one training instance. It stores two An `Example` holds the information for one training instance. It stores two
`Doc` objects: one for holding the gold-standard reference data, and one for `Doc` objects: one for holding the gold-standard reference data, and one for
holding the predictions of the pipeline. An [`Alignment`](#alignment-object) holding the predictions of the pipeline. An
object stores the alignment between these two documents, as they can differ in [`Alignment`](/api/example#alignment-object) object stores the alignment between
tokenization. these two documents, as they can differ in tokenization.
## Example.\_\_init\_\_ {#init tag="method"} ## Example.\_\_init\_\_ {#init tag="method"}
@ -31,12 +31,12 @@ both documents.
> example = Example(predicted, reference) > example = Example(predicted, reference)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ----------- | ------------------------------------------------------------------------------------------------ | | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. | | `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
| `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. | | `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. | | `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
## Example.from_dict {#from_dict tag="classmethod"} ## Example.from_dict {#from_dict tag="classmethod"}
@ -56,11 +56,11 @@ see the [training format documentation](/api/data-formats#dict-input).
> example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref}) > example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---------------- | ----------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------- |
| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. | | `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. | | `example_dict` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ |
| **RETURNS** | `Example` | The newly constructed object. | | **RETURNS** | The newly constructed object. ~~Example~~ |
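As a self-contained sketch of the typical construction flow, assuming a blank English pipeline and made-up tokens and tags; the import path for `Example` is an assumption and may be `spacy.gold` rather than `spacy.training` depending on the version.

```python
import spacy
from spacy.training import Example  # import path may differ by version

nlp = spacy.blank("en")
predicted = nlp.make_doc("She ate the pizza")
token_ref = ["She", "ate", "the", "pizza"]
tags_ref = ["PRON", "VERB", "DET", "NOUN"]
example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
# The reference doc now carries the gold-standard tags
assert example.reference[1].tag_ == "VERB"
```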
## Example.text {#text tag="property"} ## Example.text {#text tag="property"}
@ -72,12 +72,14 @@ The text of the `predicted` document in this `Example`.
> raw_text = example.text > raw_text = example.text
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------- | | ----------- | --------------------------------------------- |
| **RETURNS** | str | The text of the `predicted` document. | | **RETURNS** | The text of the `predicted` document. ~~str~~ |
## Example.predicted {#predicted tag="property"} ## Example.predicted {#predicted tag="property"}
The `Doc` holding the predictions. Occasionally also referred to as `example.x`.
> #### Example > #### Example
> >
> ```python > ```python
@ -86,14 +88,15 @@ The text of the `predicted` document in this `Example`.
> set_annotations(docs, predictions) > set_annotations(docs, predictions)
> ``` > ```
The `Doc` holding the predictions. Occassionally also refered to as `example.x`. | Name | Description |
| ----------- | ------------------------------------------------------ |
| Name | Type | Description | | **RETURNS** | The document containing (partial) predictions. ~~Doc~~ |
| ----------- | ----- | ---------------------------------------------- |
| **RETURNS** | `Doc` | The document containing (partial) predictions. |
## Example.reference {#reference tag="property"} ## Example.reference {#reference tag="property"}
The `Doc` holding the gold-standard annotations. Occasionally also referred to
as `example.y`.
> #### Example > #### Example
> >
> ```python > ```python
@ -102,15 +105,15 @@ The `Doc` holding the predictions. Occassionally also refered to as `example.x`.
> gold_labels[i][j] = eg.reference.cats.get(label, 0.0) > gold_labels[i][j] = eg.reference.cats.get(label, 0.0)
> ``` > ```
The `Doc` holding the gold-standard annotations. Occassionally also refered to | Name | Description |
as `example.y`. | ----------- | ---------------------------------------------------------- |
| **RETURNS** | The document containing gold-standard annotations. ~~Doc~~ |
| Name | Type | Description |
| ----------- | ----- | -------------------------------------------------- |
| **RETURNS** | `Doc` | The document containing gold-standard annotations. |
## Example.alignment {#alignment tag="property"} ## Example.alignment {#alignment tag="property"}
The [`Alignment`](/api/example#alignment-object) object mapping the tokens of
the `predicted` document to those of the `reference` document.
> #### Example > #### Example
> >
> ```python > ```python
@ -122,15 +125,15 @@ as `example.y`.
> assert list(alignment.y2x.data) == [[0], [1], [2], [2]] > assert list(alignment.y2x.data) == [[0], [1], [2], [2]]
> ``` > ```
The `Alignment` object mapping the tokens of the `predicted` document to those | Name | Description |
of the `reference` document. | ----------- | ---------------------------------------------------------------- |
| **RETURNS** | The `Alignment` object between the `reference` and `predicted` documents. ~~Alignment~~ |
| Name | Type | Description |
| ----------- | ----------- | -------------------------------------------------- |
| **RETURNS** | `Alignment` | The document containing gold-standard annotations. |
## Example.get_aligned {#get_aligned tag="method"} ## Example.get_aligned {#get_aligned tag="method"}
Get the aligned view of a certain token attribute, denoted by its int ID or
string name.
> #### Example > #### Example
> >
> ```python > ```python
@ -141,17 +144,18 @@ of the `reference` document.
> assert example.get_aligned("TAG", as_string=True) == ["VERB", "DET", "NOUN"] > assert example.get_aligned("TAG", as_string=True) == ["VERB", "DET", "NOUN"]
> ``` > ```
Get the aligned view of a certain token attribute, denoted by its int ID or | Name | Description |
string name. | ----------- | -------------------------------------------------------------------------------------------------- |
| `field` | Attribute ID or string name. ~~Union[int, str]~~ |
| Name | Type | Description | Default | | `as_string` | Whether or not to return the list of values as strings. Defaults to `False`. ~~bool~~ |
| ----------- | -------------------------- | ------------------------------------------------------------------ | ------- | | **RETURNS** | List of integer values, or string values if `as_string` is `True`. ~~Union[List[int], List[str]]~~ |
| `field` | int or str | Attribute ID or string name | |
| `as_string` | bool | Whether or not to return the list of values as strings. | `False` |
| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | |
## Example.get_aligned_parse {#get_aligned_parse tag="method"} ## Example.get_aligned_parse {#get_aligned_parse tag="method"}
Get the aligned view of the dependency parse. If `projectivize` is set to
`True`, non-projective dependency trees are made projective through the
Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
> #### Example > #### Example
> >
> ```python > ```python
@ -161,17 +165,16 @@ string name.
> assert proj_heads == [3, 2, 3, 0, 3] > assert proj_heads == [3, 2, 3, 0, 3]
> ``` > ```
Get the aligned view of the dependency parse. If `projectivize` is set to | Name | Description |
`True`, non-projective dependency trees are made projective through the | -------------- | -------------------------------------------------------------------------------------------------- |
Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005). | `projectivize` | Whether or not to projectivize the dependency trees. Defaults to `True`. ~~bool~~ |
| **RETURNS** | List of integer values, or string values if `as_string` is `True`. ~~Union[List[int], List[str]]~~ |
| Name | Type | Description | Default |
| -------------- | -------------------------- | ------------------------------------------------------------------ | ------- |
| `projectivize` | bool | Whether or not to projectivize the dependency trees | `True` |
| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | |
## Example.get_aligned_ner {#get_aligned_ner tag="method"} ## Example.get_aligned_ner {#get_aligned_ner tag="method"}
Get the aligned view of the NER
[BILUO](/usage/linguistic-features#accessing-ner) tags.
> #### Example > #### Example
> >
> ```python > ```python
@ -184,15 +187,16 @@ Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
> assert ner_tags == ["B-PERSON", "L-PERSON", "O", "O", "U-LOC"] > assert ner_tags == ["B-PERSON", "L-PERSON", "O", "O", "U-LOC"]
> ``` > ```
Get the aligned view of the NER | Name | Description |
[BILUO](/usage/linguistic-features#accessing-ner) tags. | ----------- | ------------------------------------------------------------------------------------------------- |
| **RETURNS** | List of BILUO values, denoting whether tokens are part of an NER annotation or not. ~~List[str]~~ |
| Name | Type | Description |
| ----------- | ----------- | ----------------------------------------------------------------------------------- |
| **RETURNS** | `List[str]` | List of BILUO values, denoting whether tokens are part of an NER annotation or not. |
## Example.get_aligned_spans_y2x {#get_aligned_spans_y2x tag="method"} ## Example.get_aligned_spans_y2x {#get_aligned_spans_y2x tag="method"}
Get the aligned view of any set of [`Span`](/api/span) objects defined over
[`Example.reference`](/api/example#reference). The resulting span indices will
align to the tokenization in [`Example.predicted`](/api/example#predicted).
> #### Example > #### Example
> >
> ```python > ```python
@ -207,17 +211,19 @@ Get the aligned view of the NER
> assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)] > assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)]
> ``` > ```
Get the aligned view of any set of [`Span`](/api/span) objects defined over | Name | Description |
`example.reference`. The resulting span indices will align to the tokenization | ----------- | ----------------------------------------------------------------------------- |
in `example.predicted`. | `y_spans` | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~ |
| **RETURNS** | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~ |
| Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------- |
| `y_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. |
| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. |
## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"} ## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"}
Get the aligned view of any set of [`Span`](/api/span) objects defined over
[`Example.predicted`](/api/example#predicted). The resulting span indices will
align to the tokenization in [`Example.reference`](/api/example#reference). This
method is particularly useful to assess the accuracy of predicted entities
against the original gold-standard annotation.
> #### Example > #### Example
> >
> ```python > ```python
@ -232,15 +238,10 @@ in `example.predicted`.
> assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)] > assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)]
> ``` > ```
Get the aligned view of any set of [`Span`](/api/span) objects defined over | Name | Description |
`example.predicted`. The resulting span indices will align to the tokenization | ----------- | ----------------------------------------------------------------------------- |
in `example.reference`. This method is particularly useful to assess the | `x_spans` | `Span` objects aligned to the tokenization of `predicted`. ~~Iterable[Span]~~ |
accuracy of predicted entities against the original gold-standard annotation. | **RETURNS** | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~ |
| Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------- |
| `x_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. |
| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. |
## Example.to_dict {#to_dict tag="method"} ## Example.to_dict {#to_dict tag="method"}
@ -253,12 +254,14 @@ reference annotation contained in this `Example`.
> eg_dict = example.to_dict() > eg_dict = example.to_dict()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------- | ------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------- |
| **RETURNS** | `Dict[str, Any]` | Dictionary representation of the reference annotation. | | **RETURNS** | Dictionary representation of the reference annotation. ~~Dict[str, Any]~~ |
## Example.split_sents {#split_sents tag="method"} ## Example.split_sents {#split_sents tag="method"}
Split one `Example` into multiple `Example` objects, one for each sentence.
> #### Example > #### Example
> >
> ```python > ```python
@ -271,11 +274,9 @@ reference annotation contained in this `Example`.
> assert split_examples[1].text == "had lots of fun" > assert split_examples[1].text == "had lots of fun"
> ``` > ```
Split one `Example` into multiple `Example` objects, one for each sentence. | Name | Description |
| ----------- | ---------------------------------------------------------------------------- |
| Name | Type | Description | | **RETURNS** | List of `Example` objects, one for each original sentence. ~~List[Example]~~ |
| ----------- | --------------- | ---------------------------------------------------------- |
| **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. |
## Alignment {#alignment-object new="3"} ## Alignment {#alignment-object new="3"}
@ -283,10 +284,10 @@ Calculate alignment tables between two tokenizations.
### Alignment attributes {#alignment-attributes} ### Alignment attributes {#alignment-attributes}
| Name | Type | Description | | Name | Description |
| ----- | -------------------------------------------------- | ---------------------------------------------------------- | | ----- | --------------------------------------------------------------------- |
| `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. | | `x2y` | The `Ragged` object holding the alignment from `x` to `y`. ~~Ragged~~ |
| `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. | | `y2x` | The `Ragged` object holding the alignment from `y` to `x`. ~~Ragged~~ |
<Infobox title="Important note" variant="warning"> <Infobox title="Important note" variant="warning">
@ -314,8 +315,8 @@ tokenizations add up to the same string. For example, you'll be able to align
### Alignment.from_strings {#classmethod tag="function"} ### Alignment.from_strings {#classmethod tag="function"}
| Name | Type | Description | | Name | Description |
| ----------- | ----------- | ----------------------------------------------- | | ----------- | ------------------------------------------------------------- |
| `A` | list | String values of candidate tokens to align. | | `A` | String values of candidate tokens to align. ~~List[str]~~ |
| `B` | list | String values of reference tokens to align. | | `B` | String values of reference tokens to align. ~~List[str]~~ |
| **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. | | **RETURNS** | An `Alignment` object describing the alignment. ~~Alignment~~ |
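A small usage sketch for `from_strings`, using two tokenizations of the same underlying string. The import path is an assumption and may be `spacy.gold` rather than `spacy.training` depending on the version.

```python
from spacy.training import Alignment  # import path may differ by version

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
align = Alignment.from_strings(other_tokens, spacy_tokens)
# x2y maps tokens of the first list onto tokens of the second, y2x the reverse:
# "'" and "s" both align to "'s"
print(align.x2y.data, align.x2y.lengths)
print(align.y2x.data, align.y2x.lengths)
```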

View File

@ -9,7 +9,7 @@ new: 2.2
--- ---
The `KnowledgeBase` object provides a method to generate The `KnowledgeBase` object provides a method to generate
[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external [`Candidate`](/api/kb/#candidate) objects, which are plausible external
identifiers given a certain textual mention. Each such `Candidate` holds identifiers given a certain textual mention. Each such `Candidate` holds
information from the relevant KB entities, such as its frequency in text and information from the relevant KB entities, such as its frequency in text and
possible aliases. Each entity in the knowledge base also has a pretrained entity possible aliases. Each entity in the knowledge base also has a pretrained entity
@ -27,18 +27,18 @@ Create the knowledge base.
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64) > kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------------------- | ------- | ---------------------------------------- | | ---------------------- | ------------------------------------------------ |
| `vocab` | `Vocab` | A `Vocab` object. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `entity_vector_length` | int | Length of the fixed-size entity vectors. | | `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
The length of the fixed-size entity vectors in the knowledge base. The length of the fixed-size entity vectors in the knowledge base.
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------------------------------- | | ----------- | ------------------------------------------------ |
| **RETURNS** | int | Length of the fixed-size entity vectors. | | **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
## KnowledgeBase.add_entity {#add_entity tag="method"} ## KnowledgeBase.add_entity {#add_entity tag="method"}
@ -53,11 +53,11 @@ vector, which should be of length
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2) > kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ------ | ----------------------------------------------- | | --------------- | ---------------------------------------------------------- |
| `entity` | str | The unique entity identifier | | `entity` | The unique entity identifier. ~~str~~ |
| `freq` | float | The frequency of the entity in a typical corpus | | `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
| `entity_vector` | vector | The pretrained vector of the entity | | `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
## KnowledgeBase.set_entities {#set_entities tag="method"} ## KnowledgeBase.set_entities {#set_entities tag="method"}
@ -70,11 +70,11 @@ frequency and entity vector for each entity.
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2]) > kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | -------- | --------------------------------- | | ------------- | ---------------------------------------------------------------- |
| `entity_list` | iterable | List of unique entity identifiers | | `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
| `freq_list` | iterable | List of entity frequencies | | `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
| `vector_list` | iterable | List of entity vectors | | `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
## KnowledgeBase.add_alias {#add_alias tag="method"} ## KnowledgeBase.add_alias {#add_alias tag="method"}
@ -90,11 +90,11 @@ should not exceed 1.
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3]) > kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | -------- | -------------------------------------------------- | | --------------- | --------------------------------------------------------------------------------- |
| `alias` | str | The textual mention or alias | | `alias` | The textual mention or alias. ~~str~~ |
| `entities` | iterable | The potential entities that the alias may refer to | | `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
| `probabilities` | iterable | The prior probabilities of each entity | | `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
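Putting `add_entity`, `add_alias` and candidate lookup together, here is a minimal end-to-end sketch; the entity IDs, frequencies, probabilities and 3-dimensional vectors are illustrative values only, and the loop also shows the `Candidate` attributes documented further down.

```python
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

kb = KnowledgeBase(vocab=Vocab(), entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 2.0, 3.0])
kb.add_entity(entity="Q463035", freq=111, entity_vector=[4.0, 5.0, 6.0])
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
# Each returned Candidate pairs the alias with one plausible entity
for candidate in kb.get_candidates("Douglas"):
    print(candidate.entity_, candidate.prior_prob)
```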
## KnowledgeBase.\_\_len\_\_ {#len tag="method"} ## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
@ -106,9 +106,9 @@ Get the total number of entities in the knowledge base.
> total_entities = len(kb) > total_entities = len(kb)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------- | | ----------- | ----------------------------------------------------- |
| **RETURNS** | int | The number of entities in the knowledge base. | | **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"} ## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
@ -120,9 +120,9 @@ Get a list of all entity IDs in the knowledge base.
> all_entities = kb.get_entity_strings() > all_entities = kb.get_entity_strings()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------- | | ----------- | --------------------------------------------------------- |
| **RETURNS** | list | The list of entities in the knowledge base. | | **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
@ -134,9 +134,9 @@ Get the total number of aliases in the knowledge base.
> total_aliases = kb.get_size_aliases() > total_aliases = kb.get_size_aliases()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------- | | ----------- | ---------------------------------------------------- |
| **RETURNS** | int | The number of aliases in the knowledge base. | | **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
@ -148,14 +148,14 @@ Get a list of all aliases in the knowledge base.
> all_aliases = kb.get_alias_strings() > all_aliases = kb.get_alias_strings()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------ | | ----------- | -------------------------------------------------------- |
| **RETURNS** | list | The list of aliases in the knowledge base. | | **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
## KnowledgeBase.get_candidates {#get_candidates tag="method"} ## KnowledgeBase.get_candidates {#get_candidates tag="method"}
Given a certain textual mention as input, retrieve a list of candidate entities Given a certain textual mention as input, retrieve a list of candidate entities
of type [`Candidate`](/api/kb/#candidate_init). of type [`Candidate`](/api/kb/#candidate).
> #### Example > #### Example
> >
@ -163,10 +163,10 @@ of type [`Candidate`](/api/kb/#candidate_init).
> candidates = kb.get_candidates("Douglas") > candidates = kb.get_candidates("Douglas")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | ---------------------------------------- | | ----------- | ------------------------------------- |
| `alias` | str | The textual mention or alias | | `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | iterable | The list of relevant `Candidate` objects | | **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
## KnowledgeBase.get_vector {#get_vector tag="method"} ## KnowledgeBase.get_vector {#get_vector tag="method"}
@ -178,10 +178,10 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> vector = kb.get_vector("Q42") > vector = kb.get_vector("Q42")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------ | ----------------- | | ----------- | ------------------------------------ |
| `entity` | str | The entity ID | | `entity` | The entity ID. ~~str~~ |
| **RETURNS** | vector | The entity vector | | **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
@ -194,11 +194,11 @@ probability of the fact that the mention links to the entity ID.
> probability = kb.get_prior_prob("Q42", "Douglas") > probability = kb.get_prior_prob("Q42", "Douglas")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------- |
| `entity` | str | The entity ID | | `entity` | The entity ID. ~~str~~ |
| `alias` | str | The textual mention or alias | | `alias` | The textual mention or alias. ~~str~~ |
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | | **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## KnowledgeBase.dump {#dump tag="method"} ## KnowledgeBase.dump {#dump tag="method"}
@ -210,9 +210,9 @@ Save the current state of the knowledge base to a directory.
> kb.dump(loc) > kb.dump(loc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- | | ----- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## KnowledgeBase.load_bulk {#load_bulk tag="method"} ## KnowledgeBase.load_bulk {#load_bulk tag="method"}
@ -229,12 +229,20 @@ Restore the state of the knowledge base from a given directory. Note that the
> kb.load_bulk("/path/to/kb") > kb.load_bulk("/path/to/kb")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | -------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | | **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
## Candidate.\_\_init\_\_ {#candidate_init tag="method"} ## Candidate {#candidate tag="class"}
A `Candidate` object refers to a textual mention (alias) that may or may not be
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
for the entity linking algorithm which will disambiguate the various candidates
to the correct one. Each candidate `(alias, entity)` pair is assigned to a
certain prior probability.
### Candidate.\_\_init\_\_ {#candidate-init tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly, Construct a `Candidate` object. Usually this constructor is not called directly,
but instead these objects are returned by the but instead these objects are returned by the
@ -247,22 +255,22 @@ but instead these objects are returned by the
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob) > candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | --------------- | -------------------------------------------------------------- | | ------------- | ------------------------------------------------------------------------- |
| `kb` | `KnowledgeBase` | The knowledge base that defined this candidate. | | `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
| `entity_hash` | int | The hash of the entity's KB ID. | | `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
| `entity_freq` | float | The entity frequency as recorded in the KB. | | `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
| `alias_hash` | int | The hash of the textual mention or alias. | | `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` | | `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
## Candidate attributes {#candidate_attributes} ## Candidate attributes {#candidate-attributes}
| Name | Type | Description | | Name | Description |
| --------------- | ------ | -------------------------------------------------------------- | | --------------- | ------------------------------------------------------------------------ |
| `entity` | int | The entity's unique KB identifier | | `entity` | The entity's unique KB identifier. ~~int~~ |
| `entity_` | str | The entity's unique KB identifier | | `entity_` | The entity's unique KB identifier. ~~str~~ |
| `alias` | int | The alias or textual mention | | `alias` | The alias or textual mention. ~~int~~ |
| `alias_` | str | The alias or textual mention | | `alias_` | The alias or textual mention. ~~str~~ |
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | | `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~long~~ |
| `entity_freq` | long | The frequency of the entity in a typical corpus | | `entity_freq` | The frequency of the entity in a typical corpus. ~~long~~ |
| `entity_vector` | vector | The pretrained vector of the entity | | `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |

View File

@ -32,13 +32,13 @@ Initialize a `Language` object.
> nlp = Language(Vocab()) > nlp = Language(Vocab())
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------ | ----------- | ------------------------------------------------------------------------------------------ | | ------------------ | ------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. | | `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. | | `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. | | `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
| `create_tokenizer` |  `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. | | `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
## Language.from_config {#from_config tag="classmethod"} ## Language.from_config {#from_config tag="classmethod"}
@ -58,14 +58,14 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config).
> nlp = Language.from_config(config) > nlp = Language.from_config(config)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. | | `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
| _keyword-only_ | | | _keyword-only_ | |
| `disable` | `Iterable[str]` | List of pipeline component names to disable. | | `disable` | List of pipeline component names to disable. ~~Iterable[str]~~ |
| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. | | `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | `Language` | The initialized object. | | **RETURNS** | The initialized object. ~~Language~~ |
## Language.component {#component tag="classmethod" new="3"} ## Language.component {#component tag="classmethod" new="3"}
@ -94,16 +94,14 @@ decorator. For more details and examples, see the
> Language.component("my_component2", func=my_component) > Language.component("my_component2", func=my_component)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | str | The name of the component factory. | | `name` | The name of the component factory. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).. | | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
| `func` | `Optional[Callable]` | Optional function if not used a a decorator. |
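As a minimal sketch of the decorator form, where the component name `print_doc_length` is made up for the example:

```python
from spacy.language import Language
from spacy.tokens import Doc

@Language.component("print_doc_length")
def print_doc_length(doc: Doc) -> Doc:
    # A stateless component receives the Doc and must return it
    print("Doc length:", len(doc))
    return doc

# The registered name can then be added to a pipeline via nlp.add_pipe("print_doc_length")
```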
## Language.factory {#factory tag="classmethod"} ## Language.factory {#factory tag="classmethod"}
@ -141,17 +139,17 @@ examples, see the
> ) > )
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | str | The name of the component factory. | | `name` | The name of the component factory. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. | | `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `scores` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
| `func` | `Optional[Callable]` | Optional function if not used as a decorator. | | `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
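As a rough sketch of a stateful factory (the factory name `token_counter` and the `verbose` setting are invented for illustration), the factory function receives the `nlp` object, the instance `name` and the config values, and returns the component callable:

```python
import spacy
from spacy.language import Language
from spacy.tokens import Doc

@Language.factory(
    "token_counter",                    # invented factory name
    default_config={"verbose": False},  # default values for the factory arguments
)
def create_token_counter(nlp: Language, name: str, verbose: bool):
    # The factory returns the actual component: a callable taking and returning a Doc.
    def token_counter(doc: Doc) -> Doc:
        if verbose:
            print(f"{name}: {len(doc)} tokens")
        return doc
    return token_counter

nlp = spacy.blank("en")
nlp.add_pipe("token_counter", config={"verbose": True})
doc = nlp("One two three")
```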
## Language.\_\_call\_\_ {#call tag="method"} ## Language.\_\_call\_\_ {#call tag="method"}
@ -165,13 +163,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN") > assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `text` | str | The text to be processed. | | `text` | The text to be processed. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. | | **RETURNS** | A container for accessing the annotations. ~~Doc~~ |
## Language.pipe {#pipe tag="method"} ## Language.pipe {#pipe tag="method"}
@ -186,17 +184,17 @@ more efficient than processing texts one-by-one.
> assert doc.is_parsed > assert doc.is_parsed
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `texts` | `Iterable[str]` | A sequence of strings. | | `texts` | A sequence of strings. ~~Iterable[str]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | | `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
| `batch_size` | int | The number of texts to buffer. | | `batch_size` | The number of texts to buffer. ~~int~~ |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. | | `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. | | `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
| **YIELDS** | `Doc` | Documents in the order of the original text. | | **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
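A small sketch of the `as_tuples` behavior described above, which lets arbitrary context metadata travel alongside each `Doc`:

```python
import spacy

nlp = spacy.blank("en")
data = [("A first text.", {"id": 1}), ("A second text.", {"id": 2})]

# With as_tuples=True, each input is a (text, context) tuple and each output a
# (doc, context) tuple, so the context is passed through unchanged.
for doc, context in nlp.pipe(data, as_tuples=True, batch_size=50):
    print(context["id"], len(doc))
```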
## Language.begin_training {#begin_training tag="method"} ## Language.begin_training {#begin_training tag="method"}
@ -225,12 +223,12 @@ tuples of `Doc` and `GoldParse` objects.
> optimizer = nlp.begin_training(get_examples) > optimizer = nlp.begin_training(get_examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Optional[Callable[[], Iterable[Example]]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Language.resume_training {#resume_training tag="method,experimental" new="3"} ## Language.resume_training {#resume_training tag="method,experimental" new="3"}
@ -248,11 +246,11 @@ a batch of [Example](/api/example) objects.
> nlp.rehearse(examples, sgd=optimizer) > nlp.rehearse(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Language.update {#update tag="method"} ## Language.update {#update tag="method"}
@ -282,15 +280,15 @@ and custom registered functions if needed. See the
> nlp.update([example], sgd=optimizer) > nlp.update([example], sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. | | `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
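A rough sketch of how `nlp.update` is typically called in a simple training loop, assuming the `Example` class is importable from `spacy.training` (earlier nightlies exposed it elsewhere); the `GADGET` label and training data are invented for illustration:

```python
import random
import spacy
from spacy.training import Example  # assumed import path for the Example class

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("GADGET")
train_data = [("I bought a phone", {"entities": [(11, 16, "GADGET")]})]

optimizer = nlp.begin_training()
for epoch in range(10):
    random.shuffle(train_data)
    losses = {}
    for text, annotations in train_data:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.2, sgd=optimizer, losses=losses)
    print(losses)
```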
## Language.rehearse {#rehearse tag="method,experimental" new="3"} ## Language.rehearse {#rehearse tag="method,experimental" new="3"}
@ -305,14 +303,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
> losses = nlp.rehearse(examples, sgd=optimizer) > losses = nlp.rehearse(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Language.evaluate {#evaluate tag="method"} ## Language.evaluate {#evaluate tag="method"}
@ -328,20 +326,19 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
> #### Example > #### Example
> >
> ```python > ```python
> scores = nlp.evaluate(examples, verbose=True) > scores = nlp.evaluate(examples)
> print(scores) > print(scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ | | --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to evaluate on. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `verbose` | bool | Print debugging information. | | `batch_size` | The batch size to use. ~~int~~ |
| `batch_size` | int | The batch size to use. | | `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. | | `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. | | `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
| `scorer_cfg` | `Dict[str, Any]` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. | | **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
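A minimal sketch of building `Example` objects from annotated dev data and scoring a pipeline, assuming the `en_core_web_sm` pipeline is installed and that `Example` is importable from `spacy.training`:

```python
import spacy
from spacy.training import Example  # assumed import path for the Example class

nlp = spacy.load("en_core_web_sm")  # assumes this trained pipeline is installed
dev_data = [
    ("Apple is looking at U.K. startups", {"entities": [(0, 5, "ORG"), (20, 24, "GPE")]}),
]
examples = [Example.from_dict(nlp.make_doc(text), annots) for text, annots in dev_data]
scores = nlp.evaluate(examples, batch_size=32)
print(scores["token_acc"], scores["ents_f"])
```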
## Language.use_params {#use_params tag="contextmanager, method"} ## Language.use_params {#use_params tag="contextmanager, method"}
@ -356,9 +353,9 @@ their original weights after the block.
> nlp.to_disk("/tmp/checkpoint") > nlp.to_disk("/tmp/checkpoint")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | --------------------------------------------- | | -------- | ------------------------------------------------------ |
| `params` | dict | A dictionary of parameters keyed by model ID. | | `params` | A dictionary of parameters keyed by model ID. ~~dict~~ |
## Language.create_pipe {#create_pipe tag="method" new="2"} ## Language.create_pipe {#create_pipe tag="method" new="2"}
@ -380,14 +377,14 @@ To create a component and add it to the pipeline, you should always use
> parser = nlp.create_pipe("parser") > parser = nlp.create_pipe("parser")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | str | Name of the registered component factory. | | `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | | `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | | `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** | callable | The pipeline component. | | **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.add_pipe {#add_pipe tag="method" new="2"} ## Language.add_pipe {#add_pipe tag="method" new="2"}
@ -423,19 +420,19 @@ component, adds it to the pipeline and returns it.
> nlp.add_pipe("ner", source=source_nlp) > nlp.add_pipe("ner", source=source_nlp)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `factory_name` | str | Name of the registered component factory. | | `factory_name` | Name of the registered component factory. ~~str~~ |
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. | | `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `before` | str / int | Component name or index to insert component directly before. | | `before` | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~ |
| `after` | str / int | Component name or index to insert component directly after: | | `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ |
| `first` | bool | Insert component first / not first in the pipeline. | | `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
| `last` | bool | Insert component last / not last in the pipeline. | | `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. | | `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `source` <Tag variant="new">3</Tag> | `Language` | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. | | `source` <Tag variant="new">3</Tag> | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. ~~Optional[Language]~~ |
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
| **RETURNS** <Tag variant="new">3</Tag> | callable | The pipeline component. | | **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
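As a brief sketch of the positioning and config arguments (assuming the `en_core_web_sm` pipeline is installed), a built-in component can be added by its factory name, placed relative to an existing component, and configured in one call:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this trained pipeline is installed
# Add a built-in component by factory name, positioned before the parser and
# with part of the factory's default config overridden.
nlp.add_pipe("sentencizer", before="parser", config={"punct_chars": [".", "!", "?"]})
print(nlp.pipe_names)
```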
## Language.has_factory {#has_factory tag="classmethod" new="3"} ## Language.has_factory {#has_factory tag="classmethod" new="3"}
@ -459,10 +456,10 @@ the `Language` base class, available to all subclasses.
> assert not Language.has_factory("component") > assert not Language.has_factory("component")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------------------------------------------------- | | ----------- | ------------------------------------------------------------------- |
| `name` | str | Name of the pipeline factory to check. | | `name` | Name of the pipeline factory to check. ~~str~~ |
| **RETURNS** | bool | Whether a factory of that name is registered on the class. | | **RETURNS** | Whether a factory of that name is registered on the class. ~~bool~~ |
## Language.has_pipe {#has_pipe tag="method" new="2"} ## Language.has_pipe {#has_pipe tag="method" new="2"}
@ -481,10 +478,10 @@ Check whether a component is present in the pipeline. Equivalent to
> assert nlp.has_pipe("my_component") > assert nlp.has_pipe("my_component")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------------------- | | ----------- | ----------------------------------------------------------------- |
| `name` | str | Name of the pipeline component to check. | | `name` | Name of the pipeline component to check. ~~str~~ |
| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | | **RETURNS** | Whether a component of that name exists in the pipeline. ~~bool~~ |
## Language.get_pipe {#get_pipe tag="method" new="2"} ## Language.get_pipe {#get_pipe tag="method" new="2"}
@ -497,28 +494,37 @@ Get a pipeline component for a given component name.
> custom_component = nlp.get_pipe("custom_component") > custom_component = nlp.get_pipe("custom_component")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | -------------------------------------- | | ----------- | ------------------------------------------------ |
| `name` | str | Name of the pipeline component to get. | | `name` | Name of the pipeline component to get. ~~str~~ |
| **RETURNS** | callable | The pipeline component. | | **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
## Language.replace_pipe {#replace_pipe tag="method" new="2"} ## Language.replace_pipe {#replace_pipe tag="method" new="2"}
Replace a component in the pipeline. Replace a component in the pipeline.
<Infobox title="Changed in v3.0" variant="warning">
As of v3.0, the `Language.replace_pipe` method doesn't take callables anymore
and instead expects the **name of a component factory** registered using
[`@Language.component`](/api/language#component) or
[`@Language.factory`](/api/language#factory).
</Infobox>
> #### Example > #### Example
> >
> ```python > ```python
> nlp.replace_pipe("parser", my_custom_parser) > nlp.replace_pipe("parser", "my_custom_parser")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | str | Name of the component to replace. | | `name` | Name of the component to replace. ~~str~~ |
| `component` | callable | The pipeline component to insert. | | `component` | The factory name of the component to insert. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. | | `config` <Tag variant="new">3</Tag> | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. | | `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
## Language.rename_pipe {#rename_pipe tag="method" new="2"} ## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@ -533,10 +539,10 @@ added to the pipeline, you can also use the `name` argument on
> nlp.rename_pipe("parser", "spacy_parser") > nlp.rename_pipe("parser", "spacy_parser")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---- | -------------------------------- | | ---------- | ---------------------------------------- |
| `old_name` | str | Name of the component to rename. | | `old_name` | Name of the component to rename. ~~str~~ |
| `new_name` | str | New name of the component. | | `new_name` | New name of the component. ~~str~~ |
## Language.remove_pipe {#remove_pipe tag="method" new="2"} ## Language.remove_pipe {#remove_pipe tag="method" new="2"}
@ -550,10 +556,10 @@ component function.
> assert name == "parser" > assert name == "parser"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ----------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------ |
| `name` | str | Name of the component to remove. | | `name` | Name of the component to remove. ~~str~~ |
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | | **RETURNS** | A `(name, component)` tuple of the removed component. ~~Tuple[str, Callable[[Doc], Doc]]~~ |
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
@ -589,12 +595,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
</Infobox> </Infobox>
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------------------ | | -------------- | ------------------------------------------------------------------------------------------------------ |
| _keyword-only_ | | | | _keyword-only_ | |
| `disable` | str / list | Name(s) of pipeline components to disable. | | `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
| `enable` | str / list | Name(s) of pipeline components that will not be disabled. | | `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | | **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
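A short sketch of both usage patterns described above, assuming the `en_core_web_sm` pipeline is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this trained pipeline is installed

# As a context manager: everything except the tagger is disabled inside the
# block and restored automatically afterwards.
with nlp.select_pipes(enable="tagger"):
    doc = nlp("Only the tagger runs here.")

# As a method: the returned DisabledPipes object restores the components on demand.
disabled = nlp.select_pipes(disable=["ner", "parser"])
doc = nlp("No entities or dependency parse are predicted here.")
disabled.restore()
```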
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"} ## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
@ -613,10 +619,10 @@ information about the component and its default provided by the
> print(factory_meta.default_config) > print(factory_meta.default_config)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------------- | ------------------ | | ----------- | --------------------------------- |
| `name` | str | The factory name. | | `name` | The factory name. ~~str~~ |
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. | | **RETURNS** | The factory meta. ~~FactoryMeta~~ |
## Language.get_pipe_meta {#get_pipe_meta tag="method" new="3"} ## Language.get_pipe_meta {#get_pipe_meta tag="method" new="3"}
@ -636,10 +642,10 @@ contains the information about the component and its default provided by the
> print(factory_meta.default_config) > print(factory_meta.default_config)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------------- | ---------------------------- | | ----------- | ------------------------------------ |
| `name` | str | The pipeline component name. | | `name` | The pipeline component name. ~~str~~ |
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. | | **RETURNS** | The factory meta. ~~FactoryMeta~~ |
## Language.analyze_pipes {#analyze_pipes tag="method" new="3"} ## Language.analyze_pipes {#analyze_pipes tag="method" new="3"}
@ -725,12 +731,12 @@ token.ent_iob, token.ent_type
</Accordion> </Accordion>
| Name | Type | Description | | Name | Description |
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. | | `keys` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. ~~List[str]~~ |
| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. | | `pretty` | Pretty-print the results as a table. Defaults to `False`. ~~bool~~ |
| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). | | **RETURNS** | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). ~~Optional[Dict[str, Any]]~~ |
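A minimal sketch of running the analysis on a deliberately inconsistent pipeline, where a component requires annotations that nothing earlier in the pipeline sets:

```python
import spacy

nlp = spacy.blank("en")
nlp.add_pipe("tagger")
nlp.add_pipe("entity_linker")  # needs entities that nothing earlier in this pipeline sets

analysis = nlp.analyze_pipes(pretty=True)  # prints a table and returns the analysis dict
print(analysis["problems"])                # unmet requirements, keyed by component name
```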
## Language.meta {#meta tag="property"} ## Language.meta {#meta tag="property"}
@ -744,9 +750,9 @@ data of the model. The `Language.meta` is also what's serialized as the
> print(nlp.meta) > print(nlp.meta)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------- | | ----------- | --------------------------------- |
| **RETURNS** | dict | The meta data. | | **RETURNS** | The meta data. ~~Dict[str, Any]~~ |
## Language.config {#config tag="property" new="3"} ## Language.config {#config tag="property" new="3"}
@ -765,9 +771,9 @@ subclass of the built-in `dict`. It supports the additional methods `to_disk`
> print(nlp.config.to_str()) > print(nlp.config.to_str())
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | ----------- | | ----------- | ---------------------- |
| **RETURNS** | [`Config`](https://thinc.ai/docs/api-config#config) | The config. | | **RETURNS** | The config. ~~Config~~ |
## Language.to_disk {#to_disk tag="method" new="2"} ## Language.to_disk {#to_disk tag="method" new="2"}
@ -780,11 +786,11 @@ the model**.
> nlp.to_disk("/path/to/models") > nlp.to_disk("/path/to/models")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Language.from_disk {#from_disk tag="method" new="2"} ## Language.from_disk {#from_disk tag="method" new="2"}
@ -806,12 +812,12 @@ loaded object.
> nlp = English().from_disk("/path/to/en_model") > nlp = English().from_disk("/path/to/en_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Language` | The modified `Language` object. | | **RETURNS** | The modified `Language` object. ~~Language~~ |
## Language.to_bytes {#to_bytes tag="method"} ## Language.to_bytes {#to_bytes tag="method"}
@ -823,11 +829,11 @@ Serialize the current state to a binary string.
> nlp_bytes = nlp.to_bytes() > nlp_bytes = nlp.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Language` object. | | **RETURNS** | The serialized form of the `Language` object. ~~bytes~~ |
## Language.from_bytes {#from_bytes tag="method"} ## Language.from_bytes {#from_bytes tag="method"}
@ -845,35 +851,35 @@ available to the loaded object.
> nlp2.from_bytes(nlp_bytes) > nlp2.from_bytes(nlp_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Language` | The `Language` object. | | **RETURNS** | The `Language` object. ~~Language~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| --------------------------------------------- | ---------------------- | ---------------------------------------------------------------------------------------- | | --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A container for the lexical types. | | `vocab` | A container for the lexical types. ~~Vocab~~ |
| `tokenizer` | `Tokenizer` | The tokenizer. | | `tokenizer` | The tokenizer. ~~Tokenizer~~ |
| `make_doc` | `Callable` | Callable that takes a string and returns a `Doc`. | | `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ |
| `pipeline` | `List[str, Callable]` | List of `(name, component)` tuples describing the current processing pipeline, in order. | | `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
| `pipe_names` <Tag variant="new">2</Tag> | `List[str]` | List of pipeline component names, in order. | | `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ |
| `pipe_labels` <Tag variant="new">2.2</Tag> | `Dict[str, List[str]]` | List of labels set by the pipeline components, if available, keyed by component name. | | `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ |
| `pipe_factories` <Tag variant="new">2.2</Tag> | `Dict[str, str]` | Dictionary of pipeline component names, mapped to their factory names. | | `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ |
| `factories` | `Dict[str, Callable]` | All available factory functions, keyed by name. | | `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ |
| `factory_names` <Tag variant="new">3</Tag> | `List[str]` | List of all available factory names. | | `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ |
| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. | | `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
## Class attributes {#class-attributes} ## Class attributes {#class-attributes}
| Name | Type | Description | | Name | Description |
| ---------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | | `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | | `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
| `default_config` | dict | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). | | `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). ~~Config~~ |
## Defaults {#defaults} ## Defaults {#defaults}
@ -906,17 +912,17 @@ customize the default language data:
> config = Config().from_str(DEFAULT_CONFIG) > config = Config().from_str(DEFAULT_CONFIG)
> ``` > ```
| Name | Description | | Name | Description |
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `stop_words` | List of stop words, used for `Token.is_stop`.<br />**Example:** [`stop_words.py`][stop_words.py] | | `stop_words` | List of stop words, used for `Token.is_stop`.<br />**Example:** [`stop_words.py`][stop_words.py] ~~Set[str]~~ |
| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.<br />**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] | | `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.<br />**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] ~~Dict[str, List[dict]]~~ |
| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.<br />**Example:** [`punctuation.py`][punctuation.py] | | `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.<br />**Example:** [`punctuation.py`][punctuation.py] ~~Optional[List[Union[str, Pattern]]]~~ |
| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.<br />**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] | | `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.<br />**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] ~~Optional[Pattern]~~ |
| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.<br />**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] | | `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.<br />**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] ~~Optional[Pattern]~~ |
| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.<br />**Example:** [`lex_attrs.py`][lex_attrs.py] | | `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.<br />**Example:** [`lex_attrs.py`][lex_attrs.py] ~~Dict[int, Callable[[str], Any]]~~ |
| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).<br />**Example:** [`syntax_iterators.py`][syntax_iterators.py]. | | `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).<br />**Example:** [`syntax_iterators.py`][syntax_iterators.py]. ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to `{"direction": "ltr", "has_case": True, "has_letters": True}`.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] | | `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to `{"direction": "ltr", "has_case": True, "has_letters": True}`.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Dict[str, Any]~~ |
| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] | | `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Config~~ |
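As a sketch of how these defaults are typically customized, a language subclass can override individual attributes on a `Defaults` subclass while inheriting everything else; the stop word `customword` below is invented for illustration:

```python
from spacy.lang.en import English

class CustomEnglishDefaults(English.Defaults):
    # Override a single piece of the language data; everything else is inherited.
    stop_words = English.Defaults.stop_words | {"customword"}

class CustomEnglish(English):
    Defaults = CustomEnglishDefaults

nlp = CustomEnglish()
doc = nlp("This sentence contains customword.")
print([(token.text, token.is_stop) for token in doc])
```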
[stop_words.py]: [stop_words.py]:
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py
@ -963,12 +969,12 @@ provided by the [`@Language.component`](/api/language#component) or
component is defined and stored on the `Language` class for each component component is defined and stored on the `Language` class for each component
instance and factory instance. instance and factory instance.
| Name | Type | Description | | Name | Description |
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `factory` | str | The name of the registered component factory. | | `factory` | The name of the registered component factory. ~~str~~ |
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. | | `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `scores` | `Iterable[str]` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). | | `scores` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. | | `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
View File
@ -36,11 +36,9 @@ tags is available in the pipeline and runs _before_ the lemmatizer.
The default config is defined by the pipeline component factory and describes The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your `config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). [`config.cfg` for training](/usage/training#config). For examples of the lookups
data formats used by the lookup and rule-based lemmatizers, see
For examples of the lookups data formats used by the lookup and rule-based [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
lemmatizers, see the
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
> #### Example > #### Example
> >
@ -49,12 +47,12 @@ lemmatizers, see the
> nlp.add_pipe("lemmatizer", config=config) > nlp.add_pipe("lemmatizer", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` | | `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` | | `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` | | `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` | | `model` | **Not yet implemented:** the model to use. ~~Model~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py
@ -77,15 +75,15 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description | | Name | Description |
| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | [`Vocab`](/api/vocab) | The vocab. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). | | `model` | **Not yet implemented:** The model to use. ~~Model~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. | | `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
| `lookups` | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. | | `lookups` | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
| `overwrite` | bool | Whether to overwrite existing lemmas. | | `overwrite` | Whether to overwrite existing lemmas. ~~bool~~ |
## Lemmatizer.\_\_call\_\_ {#call tag="method"} ## Lemmatizer.\_\_call\_\_ {#call tag="method"}
@ -102,10 +100,10 @@ and all pipeline components are applied to the `Doc` in order.
> processed = lemmatizer(doc) > processed = lemmatizer(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## Lemmatizer.pipe {#pipe tag="method"} ## Lemmatizer.pipe {#pipe tag="method"}
@ -121,12 +119,12 @@ applied to the `Doc` in order.
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"} ## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
@ -134,39 +132,39 @@ Lemmatize a token using a lookup-based approach. If no lemma is found, the
original string is returned. Languages can provide a original string is returned. Languages can provide a
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`. [lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | ------------------------------------- | | ----------- | --------------------------------------------------- |
| `token` | [`Token`](/api/token) | The token to lemmatize. | | `token` | The token to lemmatize. ~~Token~~ |
| **RETURNS** | `List[str]` | A list containing one or more lemmas. | | **RETURNS** | A list containing one or more lemmas. ~~List[str]~~ |
## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"} ## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"}
Lemmatize a token using a rule-based approach. Typically relies on POS tags. Lemmatize a token using a rule-based approach. Typically relies on POS tags.
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | ------------------------------------- | | ----------- | --------------------------------------------------- |
| `token` | [`Token`](/api/token) | The token to lemmatize. | | `token` | The token to lemmatize. ~~Token~~ |
| **RETURNS** | `List[str]` | A list containing one or more lemmas. | | **RETURNS** | A list containing one or more lemmas. ~~List[str]~~ |
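A comparable sketch for the rule-based path; it assumes an `nlp` pipeline that assigns POS tags before the lemmatizer runs, since the rules are selected by part of speech:

```python
lemmatizer = nlp.get_pipe("lemmatizer")
doc = nlp("She was reading the papers")
token = doc[2]  # "reading", tagged as a verb by an earlier component
# Applies the rule and exception tables for the token's POS tag.
lemmas = lemmatizer.rule_lemmatize(token)
assert isinstance(lemmas, list)
```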
## Lemmatizer.is_base_form {#is_base_form tag="method"} ## Lemmatizer.is_base_form {#is_base_form tag="method"}
Check whether we're dealing with an uninflected paradigm, so we can avoid Check whether we're dealing with an uninflected paradigm, so we can avoid
lemmatization entirely. lemmatization entirely.
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------- |
| `token` | [`Token`](/api/token) | The token to analyze. | | `token` | The token to analyze. ~~Token~~ |
| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. | | **RETURNS** | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. ~~bool~~ |
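For illustration, a hedged sketch of how a caller might use this check to skip lemmatization, assuming the token already carries a POS tag and morphological features:

```python
lemmatizer = nlp.get_pipe("lemmatizer")
token = nlp("walk")[0]
if lemmatizer.is_base_form(token):
    # Uninflected paradigm: the text can be used as the lemma directly.
    lemma = token.text
else:
    lemma = lemmatizer.rule_lemmatize(token)[0]
```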
## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"} ## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"}
Returns the lookups configuration settings for a given mode for use in Returns the lookups configuration settings for a given mode for use in
[`Lemmatizer.load_lookups`](#load_lookups). [`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups).
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `mode` | str | The lemmatizer mode. | | `mode` | The lemmatizer mode. ~~str~~ |
| **RETURNS** | dict | The lookups configuration settings for this mode. | | **RETURNS** | The lookups configuration settings for this mode. Includes the keys `"required_tables"` and `"optional_tables"`, mapped to a list of table string names. ~~Dict[str, List[str]]~~ |
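A small sketch of inspecting the returned settings, assuming `Lemmatizer` is importable from `spacy.pipeline`; the concrete table names depend on the language and mode, so the values in the comment are only indicative:

```python
from spacy.pipeline import Lemmatizer

config = Lemmatizer.get_lookups_config("rule")
# e.g. {"required_tables": ["lemma_rules"], "optional_tables": ["lemma_exc", "lemma_index"]}
required = config.get("required_tables", [])
optional = config.get("optional_tables", [])
```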
## Lemmatizer.load_lookups {#load_lookups tag="classmethod"} ## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
@ -174,12 +172,12 @@ Load and validate lookups tables. If the provided lookups is `None`, load the
default lookups tables according to the language and mode settings. Confirm that default lookups tables according to the language and mode settings. Confirm that
all required tables for the language and mode are present. all required tables for the language and mode are present.
| Name | Type | Description | | Name | Description |
| ----------- | ------------------------- | ---------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------- |
| `lang` | str | The language. | | `lang` | The language. ~~str~~ |
| `mode` | str | The lemmatizer mode. | | `mode` | The lemmatizer mode. ~~str~~ |
| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. | | `lookups` | The provided lookups, may be `None` if the default lookups should be loaded. ~~Optional[Lookups]~~ |
| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. | | **RETURNS** | The lookups. ~~Lookups~~ |
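A sketch of loading the default tables per the signature above; passing `None` for `lookups` loads and validates the defaults for the language and mode, which requires the corresponding lookups data to be installed (the language code and mode here are example values):

```python
from spacy.pipeline import Lemmatizer

lookups = Lemmatizer.load_lookups("en", "rule", None)
# The returned Lookups object exposes the validated tables.
print(lookups.tables)
```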
## Lemmatizer.to_disk {#to_disk tag="method"} ## Lemmatizer.to_disk {#to_disk tag="method"}
@ -192,11 +190,11 @@ Serialize the pipe to disk.
> lemmatizer.to_disk("/path/to/lemmatizer") > lemmatizer.to_disk("/path/to/lemmatizer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Lemmatizer.from_disk {#from_disk tag="method"} ## Lemmatizer.from_disk {#from_disk tag="method"}
@ -209,12 +207,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> lemmatizer.from_disk("/path/to/lemmatizer") > lemmatizer.from_disk("/path/to/lemmatizer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. | | **RETURNS** | The modified `Lemmatizer` object. ~~Lemmatizer~~ |
## Lemmatizer.to_bytes {#to_bytes tag="method"} ## Lemmatizer.to_bytes {#to_bytes tag="method"}
@ -227,11 +225,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. | | **RETURNS** | The serialized form of the `Lemmatizer` object. ~~bytes~~ |
## Lemmatizer.from_bytes {#from_bytes tag="method"} ## Lemmatizer.from_bytes {#from_bytes tag="method"}
@ -245,27 +243,20 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> lemmatizer.from_bytes(lemmatizer_bytes) > lemmatizer.from_bytes(lemmatizer_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. | | **RETURNS** | The `Lemmatizer` object. ~~Lemmatizer~~ |
## Lemmatizer.mode {#mode tag="property"}
The lemmatizer mode.
| Name | Type | Description |
| ----------- | ----- | -------------------- |
| **RETURNS** | `str` | The lemmatizer mode. |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| --------- | --------------------------------- | ------------------- | | --------- | ------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). | | `vocab` | The shared [`Vocab`](/api/vocab). ~~Vocab~~ |
| `lookups` | [`Lookups`](/api/lookups) | The lookups object. | | `lookups` | The lookups object. ~~Lookups~~ |
| `mode` | The lemmatizer mode. ~~str~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}
View File
@ -13,10 +13,10 @@ lemmatization depends on the part-of-speech tag).
Create a `Lexeme` object. Create a `Lexeme` object.
| Name | Type | Description | | Name | Description |
| ------- | ------- | -------------------------- | | ------- | ---------------------------------- |
| `vocab` | `Vocab` | The parent vocabulary. | | `vocab` | The parent vocabulary. ~~Vocab~~ |
| `orth` | int | The orth id of the lexeme. | | `orth` | The orth id of the lexeme. ~~int~~ |
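A hedged sketch of constructing a lexeme with the documented arguments, assuming an existing `nlp` object; in practice you would usually retrieve a lexeme via `nlp.vocab["apple"]` instead:

```python
from spacy.lexeme import Lexeme

orth = nlp.vocab.strings["apple"]  # the orth id (hash) of the string
lex = Lexeme(nlp.vocab, orth)
assert lex.text == "apple"
```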
## Lexeme.set_flag {#set_flag tag="method"} ## Lexeme.set_flag {#set_flag tag="method"}
@ -29,10 +29,10 @@ Change the value of a boolean flag.
> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True) > nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | ---- | ------------------------------------ | | --------- | -------------------------------------------- |
| `flag_id` | int | The attribute ID of the flag to set. | | `flag_id` | The attribute ID of the flag to set. ~~int~~ |
| `value` | bool | The new value of the flag. | | `value` | The new value of the flag. ~~bool~~ |
## Lexeme.check_flag {#check_flag tag="method"} ## Lexeme.check_flag {#check_flag tag="method"}
@ -46,10 +46,10 @@ Check the value of a boolean flag.
> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True > assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------- | | ----------- | ---------------------------------------------- |
| `flag_id` | int | The attribute ID of the flag to query. | | `flag_id` | The attribute ID of the flag to query. ~~int~~ |
| **RETURNS** | bool | The value of the flag. | | **RETURNS** | The value of the flag. ~~bool~~ |
## Lexeme.similarity {#similarity tag="method" model="vectors"} ## Lexeme.similarity {#similarity tag="method" model="vectors"}
@ -65,10 +65,10 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> assert apple_orange == orange_apple > assert apple_orange == orange_apple
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | | `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
| **RETURNS** | float | A scalar similarity score. Higher is more similar. | | **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
## Lexeme.has_vector {#has_vector tag="property" model="vectors"} ## Lexeme.has_vector {#has_vector tag="property" model="vectors"}
@ -81,9 +81,9 @@ A boolean value indicating whether a word vector is associated with the lexeme.
> assert apple.has_vector > assert apple.has_vector
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------------------------------------- | | ----------- | ------------------------------------------------------- |
| **RETURNS** | bool | Whether the lexeme has a vector data attached. | | **RETURNS** | Whether the lexeme has a vector data attached. ~~bool~~ |
## Lexeme.vector {#vector tag="property" model="vectors"} ## Lexeme.vector {#vector tag="property" model="vectors"}
@ -97,9 +97,9 @@ A real-valued meaning representation.
> assert apple.vector.shape == (300,) > assert apple.vector.shape == (300,)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------------------------- | ----------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------ |
| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the lexeme's semantics. | | **RETURNS** | A 1-dimensional array representing the lexeme's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"} ## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"}
@ -115,50 +115,50 @@ The L2 norm of the lexeme's vector representation.
> assert apple.vector_norm != pasta.vector_norm > assert apple.vector_norm != pasta.vector_norm
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ----------------------------------------- | | ----------- | --------------------------------------------------- |
| **RETURNS** | float | The L2 norm of the vector representation. | | **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The lexeme's vocabulary. | | `vocab` | The lexeme's vocabulary. ~~Vocab~~ |
| `text` | str | Verbatim text content. | | `text` | Verbatim text content. ~~str~~ |
| `orth` | int | ID of the verbatim text content. | | `orth` | ID of the verbatim text content. ~~int~~ |
| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | | `orth_` | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. | | `rank` | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `flags` | int | Container of the lexeme's binary flags. | | `flags` | Container of the lexeme's binary flags. ~~int~~ |
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. | | `norm` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~ |
| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. | | `norm_` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~ |
| `lower` | int | Lowercase form of the word. | | `lower` | Lowercase form of the word. ~~int~~ |
| `lower_` | str | Lowercase form of the word. | | `lower_` | Lowercase form of the word. ~~str~~ |
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `shape` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | | `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. | | `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | | `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. | | `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | | `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. | | `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. ~~bool~~ |
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | | `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
| `is_lower` | bool | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. | | `is_lower` | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. ~~bool~~ |
| `is_upper` | bool | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. | | `is_upper` | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. ~~bool~~ |
| `is_title` | bool | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. | | `is_title` | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. ~~bool~~ |
| `is_punct` | bool | Is the lexeme punctuation? | | `is_punct` | Is the lexeme punctuation? ~~bool~~ |
| `is_left_punct` | bool | Is the lexeme a left punctuation mark, e.g. `(`? | | `is_left_punct` | Is the lexeme a left punctuation mark, e.g. `(`? ~~bool~~ |
| `is_right_punct` | bool | Is the lexeme a right punctuation mark, e.g. `)`? | | `is_right_punct` | Is the lexeme a right punctuation mark, e.g. `)`? ~~bool~~ |
| `is_space` | bool | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. | | `is_space` | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. ~~bool~~ |
| `is_bracket` | bool | Is the lexeme a bracket? | | `is_bracket` | Is the lexeme a bracket? ~~bool~~ |
| `is_quote` | bool | Is the lexeme a quotation mark? | | `is_quote` | Is the lexeme a quotation mark? ~~bool~~ |
| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the lexeme a currency symbol? | | `is_currency` <Tag variant="new">2.0.8</Tag> | Is the lexeme a currency symbol? ~~bool~~ |
| `like_url` | bool | Does the lexeme resemble a URL? | | `like_url` | Does the lexeme resemble a URL? ~~bool~~ |
| `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. | | `like_num` | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
| `like_email` | bool | Does the lexeme resemble an email address? | | `like_email` | Does the lexeme resemble an email address? ~~bool~~ |
| `is_oov` | bool | Is the lexeme out-of-vocabulary, i.e. does it not have a word vector? | | `is_oov` | Is the lexeme out-of-vocabulary, i.e. does it not have a word vector? ~~bool~~ |
| `is_stop` | bool | Is the lexeme part of a "stop list"? | | `is_stop` | Is the lexeme part of a "stop list"? ~~bool~~ |
| `lang` | int | Language of the parent vocabulary. | | `lang` | Language of the parent vocabulary. ~~int~~ |
| `lang_` | str | Language of the parent vocabulary. | | `lang_` | Language of the parent vocabulary. ~~str~~ |
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | | `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ |
| `cluster` | int | Brown cluster ID. | | `cluster` | Brown cluster ID. ~~int~~ |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | | `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ |
View File
@ -24,10 +24,6 @@ Create a `Lookups` object.
> lookups = Lookups() > lookups = Lookups()
> ``` > ```
| Name | Type | Description |
| ----------- | --------- | ----------------------------- |
| **RETURNS** | `Lookups` | The newly constructed object. |
## Lookups.\_\_len\_\_ {#len tag="method"} ## Lookups.\_\_len\_\_ {#len tag="method"}
Get the current number of tables in the lookups. Get the current number of tables in the lookups.
@ -39,9 +35,9 @@ Get the current number of tables in the lookups.
> assert len(lookups) == 0 > assert len(lookups) == 0
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------ | | ----------- | -------------------------------------------- |
| **RETURNS** | int | The number of tables in the lookups. | | **RETURNS** | The number of tables in the lookups. ~~int~~ |
## Lookups.\_\_contains\_\_ {#contains tag="method"} ## Lookups.\_\_contains\_\_ {#contains tag="method"}
@ -56,10 +52,10 @@ Check if the lookups contain a table of a given name. Delegates to
> assert "some_table" in lookups > assert "some_table" in lookups
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------------------- | | ----------- | -------------------------------------------------------- |
| `name` | str | Name of the table. | | `name` | Name of the table. ~~str~~ |
| **RETURNS** | bool | Whether a table of that name is in the lookups. | | **RETURNS** | Whether a table of that name is in the lookups. ~~bool~~ |
## Lookups.tables {#tables tag="property"} ## Lookups.tables {#tables tag="property"}
@ -73,9 +69,9 @@ Get the names of all tables in the lookups.
> assert lookups.tables == ["some_table"] > assert lookups.tables == ["some_table"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------- | | ----------- | ------------------------------------------------- |
| **RETURNS** | list | Names of the tables in the lookups. | | **RETURNS** | Names of the tables in the lookups. ~~List[str]~~ |
## Lookups.add_table {#add_table tag="method"} ## Lookups.add_table {#add_table tag="method"}
@ -89,11 +85,11 @@ exists.
> lookups.add_table("some_table", {"foo": "bar"}) > lookups.add_table("some_table", {"foo": "bar"})
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------------- | ---------------------------------- | | ----------- | ------------------------------------------- |
| `name` | str | Unique name of the table. | | `name` | Unique name of the table. ~~str~~ |
| `data` | dict | Optional data to add to the table. | | `data` | Optional data to add to the table. ~~dict~~ |
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | | **RETURNS** | The newly added table. ~~Table~~ |
## Lookups.get_table {#get_table tag="method"} ## Lookups.get_table {#get_table tag="method"}
@ -108,10 +104,10 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
> assert table["foo"] == "bar" > assert table["foo"] == "bar"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------------- | ------------------ | | ----------- | -------------------------- |
| `name` | str | Name of the table. | | `name` | Name of the table. ~~str~~ |
| **RETURNS** | [`Table`](/api/lookups#table) | The table. | | **RETURNS** | The table. ~~Table~~ |
## Lookups.remove_table {#remove_table tag="method"} ## Lookups.remove_table {#remove_table tag="method"}
@ -126,10 +122,10 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
> assert "some_table" not in lookups > assert "some_table" not in lookups
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------------- | ---------------------------- | | ----------- | ------------------------------------ |
| `name` | str | Name of the table to remove. | | `name` | Name of the table to remove. ~~str~~ |
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | | **RETURNS** | The removed table. ~~Table~~ |
## Lookups.has_table {#has_table tag="method"} ## Lookups.has_table {#has_table tag="method"}
@ -144,10 +140,10 @@ Check if the lookups contain a table of a given name. Equivalent to
> assert lookups.has_table("some_table") > assert lookups.has_table("some_table")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------------------- | | ----------- | -------------------------------------------------------- |
| `name` | str | Name of the table. | | `name` | Name of the table. ~~str~~ |
| **RETURNS** | bool | Whether a table of that name is in the lookups. | | **RETURNS** | Whether a table of that name is in the lookups. ~~bool~~ |
## Lookups.to_bytes {#to_bytes tag="method"} ## Lookups.to_bytes {#to_bytes tag="method"}
@ -159,9 +155,9 @@ Serialize the lookups to a bytestring.
> lookup_bytes = lookups.to_bytes() > lookup_bytes = lookups.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ----------------------- | | ----------- | --------------------------------- |
| **RETURNS** | bytes | The serialized lookups. | | **RETURNS** | The serialized lookups. ~~bytes~~ |
## Lookups.from_bytes {#from_bytes tag="method"} ## Lookups.from_bytes {#from_bytes tag="method"}
@ -175,10 +171,10 @@ Load the lookups from a bytestring.
> lookups.from_bytes(lookup_bytes) > lookups.from_bytes(lookup_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | --------- | ---------------------- | | ------------ | -------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| **RETURNS** | `Lookups` | The loaded lookups. | | **RETURNS** | The loaded lookups. ~~Lookups~~ |
## Lookups.to_disk {#to_disk tag="method"} ## Lookups.to_disk {#to_disk tag="method"}
@ -191,9 +187,9 @@ which will be created if it doesn't exist.
> lookups.to_disk("/path/to/lookups") > lookups.to_disk("/path/to/lookups")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## Lookups.from_disk {#from_disk tag="method"} ## Lookups.from_disk {#from_disk tag="method"}
@ -208,10 +204,10 @@ the file doesn't exist.
> lookups.from_disk("/path/to/lookups") > lookups.from_disk("/path/to/lookups")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | -------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | `Lookups` | The loaded lookups. | | **RETURNS** | The loaded lookups. ~~Lookups~~ |
## Table {#table tag="class, ordereddict"} ## Table {#table tag="class, ordereddict"}
@ -236,9 +232,9 @@ Initialize a new table.
> assert table["foo"] == "bar" > assert table["foo"] == "bar"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------ | ---- | ---------------------------------- | | ------ | ------------------------------------------ |
| `name` | str | Optional table name for reference. | | `name` | Optional table name for reference. ~~str~~ |
### Table.from_dict {#table.from_dict tag="classmethod"} ### Table.from_dict {#table.from_dict tag="classmethod"}
@ -252,11 +248,11 @@ Initialize a new table from a dict.
> table = Table.from_dict(data, name="some_table") > table = Table.from_dict(data, name="some_table")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | ---------------------------------- | | ----------- | ------------------------------------------ |
| `data` | dict | The dictionary. | | `data` | The dictionary. ~~dict~~ |
| `name` | str | Optional table name for reference. | | `name` | Optional table name for reference. ~~str~~ |
| **RETURNS** | `Table` | The newly constructed object. | | **RETURNS** | The newly constructed object. ~~Table~~ |
### Table.set {#table.set tag="method"} ### Table.set {#table.set tag="method"}
@ -272,10 +268,10 @@ Set a new key / value pair. String keys will be hashed. Same as
> assert table["foo"] == "bar" > assert table["foo"] == "bar"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------- | --------- | ----------- | | ------- | ---------------------------- |
| `key` | str / int | The key. | | `key` | The key. ~~Union[str, int]~~ |
| `value` | - | The value. | | `value` | The value. |
### Table.to_bytes {#table.to_bytes tag="method"} ### Table.to_bytes {#table.to_bytes tag="method"}
@ -287,9 +283,9 @@ Serialize the table to a bytestring.
> table_bytes = table.to_bytes() > table_bytes = table.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------- | | ----------- | ------------------------------- |
| **RETURNS** | bytes | The serialized table. | | **RETURNS** | The serialized table. ~~bytes~~ |
### Table.from_bytes {#table.from_bytes tag="method"} ### Table.from_bytes {#table.from_bytes tag="method"}
@ -303,15 +299,15 @@ Load a table from a bytestring.
> table.from_bytes(table_bytes) > table.from_bytes(table_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ------- | ----------------- | | ------------ | --------------------------- |
| `bytes_data` | bytes | The data to load. | | `bytes_data` | The data to load. ~~bytes~~ |
| **RETURNS** | `Table` | The loaded table. | | **RETURNS** | The loaded table. ~~Table~~ |
### Attributes {#table-attributes} ### Attributes {#table-attributes}
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `name` | str | Table name. | | `name` | Table name. ~~str~~ |
| `default_size` | int | Default size of bloom filters if no data is provided. | | `default_size` | Default size of bloom filters if no data is provided. ~~int~~ |
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. | | `bloom` | The bloom filters. ~~preshed.BloomFilter~~ |
View File
@ -30,20 +30,20 @@ pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for [`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are: rule-based matching are:
| Attribute | Type |  Description | | Attribute |  Description |
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | str | The exact verbatim text of a token. | | `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. | | `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | str | The lowercase form of the token text. | | `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | int | The length of the token text. | |  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | |  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | str | The token's entity label. | | `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | str | Operator or quantifier to determine how often to match a token pattern. | | `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
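For instance, a small pattern sketch combining a few of these keys; the rule name and texts are arbitrary, and an `nlp` object with a shared vocab is assumed:

```python
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
pattern = [
    {"LOWER": "hello"},
    {"IS_PUNCT": True, "OP": "?"},  # optionally match one punctuation token
    {"LOWER": "world"},
]
matcher.add("HelloWorld", [pattern])
matches = matcher(nlp("Hello, world!"))
```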
Operators and quantifiers define **how often** a token pattern should be Operators and quantifiers define **how often** a token pattern should be
matched: matched:
@ -75,11 +75,11 @@ it compares to another value.
> ] > ]
> ``` > ```
| Attribute | Type | Description | | Attribute | Description |
| -------------------------- | ---------- | --------------------------------------------------------------------------------- | | -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | any | Attribute value is member of a list. | | `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | any | Attribute value is _not_ member of a list. | | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
## Matcher.\_\_init\_\_ {#init tag="method"} ## Matcher.\_\_init\_\_ {#init tag="method"}
@ -95,10 +95,10 @@ string where an integer is expected) or unexpected property names.
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- | | --------------------------------------- | ----------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate all patterns added to this matcher. | | `validate` <Tag variant="new">2.1</Tag> | Validate all patterns added to this matcher. ~~bool~~ |
## Matcher.\_\_call\_\_ {#call tag="method"} ## Matcher.\_\_call\_\_ {#call tag="method"}
@ -116,10 +116,10 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. | | `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
## Matcher.pipe {#pipe tag="method"} ## Matcher.pipe {#pipe tag="method"}
@ -134,13 +134,13 @@ Match a stream of documents, yielding them in turn.
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A stream of documents or spans. | | `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
| `batch_size` | int | The number of documents to accumulate into a working set. | | `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
| `return_matches` <Tag variant="new">2.1</Tag> | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. | | `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. | | `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
| **YIELDS** | `Doc` | Documents, in order. | | **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
## Matcher.\_\_len\_\_ {#len tag="method" new="2"} ## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
@ -157,9 +157,9 @@ patterns.
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------- | | ----------- | ---------------------------- |
| **RETURNS** | int | The number of rules. | | **RETURNS** | The number of rules. ~~int~~ |
## Matcher.\_\_contains\_\_ {#contains tag="method" new="2"} ## Matcher.\_\_contains\_\_ {#contains tag="method" new="2"}
@ -174,10 +174,10 @@ Check whether the matcher contains rules for a match ID.
> assert "Rule" in matcher > assert "Rule" in matcher
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------------------------- | | ----------- | -------------------------------------------------------------- |
| `key` | str | The match ID. | | `key` | The match ID. ~~str~~ |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | | **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
## Matcher.add {#add tag="method" new="2"} ## Matcher.add {#add tag="method" new="2"}
@ -217,13 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
</Infobox> </Infobox>
| Name | Type | Description | | Name | Description |
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. | | `match_id` | An ID for the thing you're matching. ~~str~~ |
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | | `patterns` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. ~~List[List[Dict[str, Any]]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ |
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. | | `greedy` <Tag variant="new">3</Tag> | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. ~~Optional[str]~~ |
## Matcher.remove {#remove tag="method" new="2"} ## Matcher.remove {#remove tag="method" new="2"}
@ -239,9 +239,9 @@ exist.
> assert "Rule" not in matcher > assert "Rule" not in matcher
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----- | ---- | ------------------------- | | ----- | --------------------------------- |
| `key` | str | The ID of the match rule. | | `key` | The ID of the match rule. ~~str~~ |
## Matcher.get {#get tag="method" new="2"} ## Matcher.get {#get tag="method" new="2"}
@ -255,7 +255,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> on_match, patterns = matcher.get("Rule") > on_match, patterns = matcher.get("Rule")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------- |
| `key` | str | The ID of the match rule. | | `key` | The ID of the match rule. ~~str~~ |
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | | **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ |
View File
@ -1,142 +0,0 @@
---
title: MorphAnalysis
tag: class
source: spacy/tokens/morphanalysis.pyx
---
Stores a single morphological analysis.
## MorphAnalysis.\_\_init\_\_ {#init tag="method"}
Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of
morphological features.
> #### Example
>
> ```python
> from spacy.tokens import MorphAnalysis
>
> feats = "Feat1=Val1|Feat2=Val2"
> m = MorphAnalysis(nlp.vocab, feats)
> ```
| Name | Type | Description |
| ---------- | ------------------ | --------------------------- |
| `vocab` | `Vocab` | The vocab. |
| `features` | `Union[Dict, str]` | The morphological features. |
## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"}
Whether a feature/value pair is in the analysis.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert "Feat1=Val1" in morph
> ```
| Name | Type | Description |
| ----------- | ----- | ------------------------------------- |
| **RETURNS** | `str` | A feature/value pair in the analysis. |
## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"}
Iterate over the feature/value pairs in the analysis.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val3|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert list(morph) == ["Feat1=Va1", "Feat1=Val3", "Feat2=Val2"]
> ```
| Name | Type | Description |
| ---------- | ----- | ------------------------------------- |
| **YIELDS** | `str` | A feature/value pair in the analysis. |
## MorphAnalysis.\_\_len\_\_ {#len tag="method"}
Returns the number of features in the analysis.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert len(morph) == 3
> ```
| Name | Type | Description |
| ----------- | ----- | --------------------------------------- |
| **RETURNS** | `int` | The number of features in the analysis. |
## MorphAnalysis.\_\_str\_\_ {#str tag="method"}
Returns the morphological analysis in the UD FEATS string format.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert str(morph) == feats
> ```
| Name | Type | Description |
| ----------- | ----- | -------------------------------- |
| **RETURNS** | `str` | The analysis in UD FEATS format. |
## MorphAnalysis.get {#get tag="method"}
Retrieve values for a feature by field.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
| Name | Type | Description |
| ----------- | ------ | ---------------------------------- |
| `field` | `str` | The field to retrieve. |
| **RETURNS** | `list` | A list of the individual features. |
## MorphAnalysis.to_dict {#to_dict tag="method"}
Produce a dict representation of the analysis, in the same format as the tag
map.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"}
> ```
| Name | Type | Description |
| ----------- | ------ | ---------------------------------------- |
| **RETURNS** | `dict` | The dict representation of the analysis. |
## MorphAnalysis.from_id {#from_id tag="classmethod"}
Create a morphological analysis from a given hash ID.
> #### Example
>
> ```python
> feats = "Feat1=Val1|Feat2=Val2"
> hash = nlp.vocab.strings[feats]
> morph = MorphAnalysis.from_id(nlp.vocab, hash)
> assert str(morph) == feats
> ```
| Name | Type | Description |
| ------- | ------- | -------------------------------- |
| `vocab` | `Vocab` | The vocab. |
| `key` | `int` | The hash of the features string. |
View File
@ -32,9 +32,9 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("morphologizer", config=config) > nlp.add_pipe("morphologizer", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------- | | ------- | ------------------------------------------------------------------------------------------------------- |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) | | `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx
@ -42,7 +42,9 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx
## Morphologizer.\_\_init\_\_ {#init tag="method"} ## Morphologizer.\_\_init\_\_ {#init tag="method"}
Initialize the morphologizer. Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe).
> #### Example > #### Example
> >
@ -59,18 +61,14 @@ Initialize the morphologizer.
> morphologizer = Morphologizer(nlp.vocab, model) > morphologizer = Morphologizer(nlp.vocab, model)
> ``` > ```
Create a new pipeline instance. In your application, you would normally use a | Name | Description |
shortcut for this and instantiate the component using its string name and | -------------- | -------------------------------------------------------------------------------------------------------------------- |
[`nlp.add_pipe`](/api/language#add_pipe). | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| Name | Type | Description | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| -------------- | ------- | ------------------------------------------------------------------------------------------- | | _keyword-only_ | |
| `vocab` | `Vocab` | The shared vocabulary. | | `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~ |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `labels_pos` | Mapping of morph + POS tags to POS tags. ~~Dict[str, str]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
| _keyword-only_ | | |
| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. |
| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. |
## Morphologizer.\_\_call\_\_ {#call tag="method"} ## Morphologizer.\_\_call\_\_ {#call tag="method"}
@ -90,10 +88,10 @@ delegate to the [`predict`](/api/morphologizer#predict) and
> processed = morphologizer(doc) > processed = morphologizer(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## Morphologizer.pipe {#pipe tag="method"} ## Morphologizer.pipe {#pipe tag="method"}
@ -112,12 +110,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Morphologizer.begin_training {#begin_training tag="method"} ## Morphologizer.begin_training {#begin_training tag="method"}
@ -138,13 +136,13 @@ setting up the label scheme based on the data.
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Morphologizer.predict {#predict tag="method"} ## Morphologizer.predict {#predict tag="method"}
@ -158,10 +156,10 @@ modifying them.
> scores = morphologizer.predict([doc1, doc2]) > scores = morphologizer.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. |
## Morphologizer.set_annotations {#set_annotations tag="method"} ## Morphologizer.set_annotations {#set_annotations tag="method"}
@ -175,10 +173,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> morphologizer.set_annotations([doc1, doc2], scores) > morphologizer.set_annotations([doc1, doc2], scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ------------------------------------------------------- | | -------- | ------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `Morphologizer.predict`. | | `scores` | The scores to set, produced by `Morphologizer.predict`. |
## Morphologizer.update {#update tag="method"} ## Morphologizer.update {#update tag="method"}
@ -195,15 +193,15 @@ Delegates to [`predict`](/api/morphologizer#predict) and
> losses = morphologizer.update(examples, sgd=optimizer) > losses = morphologizer.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Morphologizer.get_loss {#get_loss tag="method"} ## Morphologizer.get_loss {#get_loss tag="method"}
@ -218,11 +216,11 @@ predicted scores.
> loss, d_loss = morphologizer.get_loss(examples, scores) > loss, d_loss = morphologizer.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | - | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## Morphologizer.create_optimizer {#create_optimizer tag="method"} ## Morphologizer.create_optimizer {#create_optimizer tag="method"}
@ -235,9 +233,9 @@ Create an optimizer for the pipeline component.
> optimizer = morphologizer.create_optimizer() > optimizer = morphologizer.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Morphologizer.use_params {#use_params tag="method, contextmanager"} ## Morphologizer.use_params {#use_params tag="method, contextmanager"}
@ -252,9 +250,9 @@ context, the original parameters are restored.
> morphologizer.to_disk("/best_model") > morphologizer.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## Morphologizer.add_label {#add_label tag="method"} ## Morphologizer.add_label {#add_label tag="method"}
@ -268,10 +266,10 @@ both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
> morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin") > morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------------- | | ----------- | ----------------------------------------------------------- |
| `label` | str | The label to add. | | `label` | The label to add. ~~str~~ |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
## Morphologizer.to_disk {#to_disk tag="method"} ## Morphologizer.to_disk {#to_disk tag="method"}
@ -284,11 +282,11 @@ Serialize the pipe to disk.
> morphologizer.to_disk("/path/to/morphologizer") > morphologizer.to_disk("/path/to/morphologizer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Morphologizer.from_disk {#from_disk tag="method"} ## Morphologizer.from_disk {#from_disk tag="method"}
@ -301,12 +299,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> morphologizer.from_disk("/path/to/morphologizer") > morphologizer.from_disk("/path/to/morphologizer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. | | **RETURNS** | The modified `Morphologizer` object. ~~Morphologizer~~ |
## Morphologizer.to_bytes {#to_bytes tag="method"} ## Morphologizer.to_bytes {#to_bytes tag="method"}
@ -319,11 +317,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. | | **RETURNS** | The serialized form of the `Morphologizer` object. ~~bytes~~ |
## Morphologizer.from_bytes {#from_bytes tag="method"} ## Morphologizer.from_bytes {#from_bytes tag="method"}
@ -337,19 +335,19 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> morphologizer.from_bytes(morphologizer_bytes) > morphologizer.from_bytes(morphologizer_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Morphologizer` | The `Morphologizer` object. | | **RETURNS** | The `Morphologizer` object. ~~Morphologizer~~ |
## Morphologizer.labels {#labels tag="property"} ## Morphologizer.labels {#labels tag="property"}
The labels currently added to the component in Universal Dependencies The labels currently added to the component in the Universal Dependencies
[FEATS format](https://universaldependencies.org/format.html#morphological-annotation). [FEATS](https://universaldependencies.org/format.html#morphological-annotation)
Note that even for a blank component, this will always include the internal format. Note that even for a blank component, this will always include the
empty label `_`. If POS features are used, the labels will include the internal empty label `_`. If POS features are used, the labels will include the
coarse-grained POS as the feature `POS`. coarse-grained POS as the feature `POS`.
> #### Example > #### Example
@ -359,9 +357,9 @@ coarse-grained POS as the feature `POS`.
> assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels > assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -7,7 +7,8 @@ source: spacy/morphology.pyx
Store the possible morphological analyses for a language, and index them by Store the possible morphological analyses for a language, and index them by
hash. To save space on each token, tokens only know the hash of their hash. To save space on each token, tokens only know the hash of their
morphological analysis, so queries of morphological attributes are delegated to morphological analysis, so queries of morphological attributes are delegated to
this class. this class. See [`MorphAnalysis`](/api/morphology#morphanalysis) for the
container storing a single morphological analysis.
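
As a rough illustration, here is a minimal sketch (assuming a blank English pipeline) of how an analysis is stored once and only its hash travels with the tokens:

```python
import spacy

nlp = spacy.blank("en")
feats = "Case=Nom|Number=Sing"
# add() stores the analysis in the table and returns its hash
key = nlp.vocab.morphology.add(feats)
assert key == nlp.vocab.strings[feats]
# get() resolves the hash back to the FEATS string
assert nlp.vocab.morphology.get(key) == feats
```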
## Morphology.\_\_init\_\_ {#init tag="method"} ## Morphology.\_\_init\_\_ {#init tag="method"}
@ -21,15 +22,17 @@ Create a Morphology object.
> morphology = Morphology(strings) > morphology = Morphology(strings)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | ------------- | ----------------- | | --------- | --------------------------------- |
| `strings` | `StringStore` | The string store. | | `strings` | The string store. ~~StringStore~~ |
## Morphology.add {#add tag="method"} ## Morphology.add {#add tag="method"}
Insert a morphological analysis in the morphology table, if not already present. Insert a morphological analysis in the morphology table, if not already present.
The morphological analysis may be provided in the UD FEATS format as a string or The morphological analysis may be provided in the Universal Dependencies
in the tag map dictionary format. Returns the hash of the new analysis. [FEATS](https://universaldependencies.org/format.html#morphological-annotation)
format as a string or in the tag map dictionary format. Returns the hash of the
new analysis.
> #### Example > #### Example
> >
@ -39,9 +42,9 @@ in the tag map dictionary format. Returns the hash of the new analysis.
> assert hash == nlp.vocab.strings[feats] > assert hash == nlp.vocab.strings[feats]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------------------ | --------------------------- | | ---------- | ------------------------------------------------ |
| `features` | `Union[Dict, str]` | The morphological features. | | `features` | The morphological features. ~~Union[Dict, str]~~ |
## Morphology.get {#get tag="method"} ## Morphology.get {#get tag="method"}
@ -53,16 +56,20 @@ in the tag map dictionary format. Returns the hash of the new analysis.
> assert nlp.vocab.morphology.get(hash) == feats > assert nlp.vocab.morphology.get(hash) == feats
> ``` > ```
Get the FEATS string for the hash of the morphological analysis. Get the
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
string for the hash of the morphological analysis.
| Name | Type | Description | | Name | Description |
| ------- | ---- | --------------------------------------- | | ------- | ----------------------------------------------- |
| `morph` | int | The hash of the morphological analysis. | | `morph` | The hash of the morphological analysis. ~~int~~ |
## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"} ## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}
Convert a string FEATS representation to a dictionary of features and values in Convert a string
the same format as the tag map. [FEATS](https://universaldependencies.org/format.html#morphological-annotation)
representation to a dictionary of features and values in the same format as the
tag map.
> #### Example > #### Example
> >
@ -72,14 +79,16 @@ the same format as the tag map.
> assert d == {"Feat1": "Val1", "Feat2": "Val2"} > assert d == {"Feat1": "Val1", "Feat2": "Val2"}
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------------------------------ | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
| `feats` | str | The morphological features in Universal Dependencies FEATS format. | | `feats` | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
| **RETURNS** | dict | The morphological features as a dictionary. | | **RETURNS** | The morphological features as a dictionary. ~~Dict[str, str]~~ |
## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"} ## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"}
Convert a dictionary of features and values to a string FEATS representation. Convert a dictionary of features and values to a string
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
representation.
> #### Example > #### Example
> >
@ -89,15 +98,157 @@ Convert a dictionary of features and values to a string FEATS representation.
> assert f == "Feat1=Val1|Feat2=Val2" > assert f == "Feat1=Val1|Feat2=Val2"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ----------------- | --------------------------------------------------------------------- | | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. | | `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
| **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. | | **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| ------------- | ----- | -------------------------------------------- | | ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `FEATURE_SEP` | `str` | The FEATS feature separator. Default is `|`. | | `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `|`. ~~str~~ |
| `FIELD_SEP` | `str` | The FEATS field separator. Default is `=`. | | `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
| `VALUE_SEP` | `str` | The FEATS value separator. Default is `,`. | | `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |
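
As a quick illustration of how the separators combine, here is a small sketch using the static conversion helpers documented above:

```python
from spacy.morphology import Morphology

# FIELD_SEP ("=") joins a feature with its value, FEATURE_SEP ("|") joins the
# fields; multiple values of one field are joined with VALUE_SEP (","), e.g.
# "Feat1=Val1,Val2|Feat2=Val2".
feats_dict = {"Feat1": "Val1", "Feat2": "Val2"}
feats = Morphology.dict_to_feats(feats_dict)
assert feats == "Feat1=Val1|Feat2=Val2"
assert Morphology.feats_to_dict(feats) == feats_dict
```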
## MorphAnalysis {#morphanalysis tag="class" source="spacy/tokens/morphanalysis.pyx"}
Stores a single morphological analysis.
### MorphAnalysis.\_\_init\_\_ {#morphanalysis-init tag="method"}
Initialize a MorphAnalysis object from a Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
string or a dictionary of morphological features.
> #### Example
>
> ```python
> from spacy.tokens import MorphAnalysis
>
> feats = "Feat1=Val1|Feat2=Val2"
> m = MorphAnalysis(nlp.vocab, feats)
> ```
| Name | Description |
| ---------- | ---------------------------------------------------------- |
| `vocab` | The vocab. ~~Vocab~~ |
| `features` | The morphological features. ~~Union[Dict[str, str], str]~~ |
### MorphAnalysis.\_\_contains\_\_ {#morphanalysis-contains tag="method"}
Whether a feature/value pair is in the analysis.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert "Feat1=Val1" in morph
> ```
| Name | Description |
| ----------- | --------------------------------------------- |
| **RETURNS** | Whether the feature/value pair is in the analysis. ~~bool~~ |
### MorphAnalysis.\_\_iter\_\_ {#morphanalysis-iter tag="method"}
Iterate over the feature/value pairs in the analysis.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val3|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert list(morph) == ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]
> ```
| Name | Description |
| ---------- | --------------------------------------------- |
| **YIELDS** | A feature/value pair in the analysis. ~~str~~ |
### MorphAnalysis.\_\_len\_\_ {#morphanalysis-len tag="method"}
Returns the number of features in the analysis.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert len(morph) == 3
> ```
| Name | Description |
| ----------- | ----------------------------------------------- |
| **RETURNS** | The number of features in the analysis. ~~int~~ |
### MorphAnalysis.\_\_str\_\_ {#morphanalysis-str tag="method"}
Returns the morphological analysis in the Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
string format.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert str(morph) == feats
> ```
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| **RETURNS** | The analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
### MorphAnalysis.get {#morphanalysis-get tag="method"}
Retrieve values for a feature by field.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert morph.get("Feat1") == ["Val1", "Val2"]
> ```
| Name | Description |
| ----------- | ------------------------------------------------ |
| `field` | The field to retrieve. ~~str~~ |
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
### MorphAnalysis.to_dict {#morphanalysis-to_dict tag="method"}
Produce a dict representation of the analysis, in the same format as the tag
map.
> #### Example
>
> ```python
> feats = "Feat1=Val1,Val2|Feat2=Val2"
> morph = MorphAnalysis(nlp.vocab, feats)
> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"}
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------- |
| **RETURNS** | The dict representation of the analysis. ~~Dict[str, str]~~ |
### MorphAnalysis.from_id {#morphanalysis-from_id tag="classmethod"}
Create a morphological analysis from a given hash ID.
> #### Example
>
> ```python
> feats = "Feat1=Val1|Feat2=Val2"
> hash = nlp.vocab.strings[feats]
> morph = MorphAnalysis.from_id(nlp.vocab, hash)
> assert str(morph) == feats
> ```
| Name | Description |
| ------- | ---------------------------------------- |
| `vocab` | The vocab. ~~Vocab~~ |
| `key` | The hash of the features string. ~~int~~ |

View File

@ -36,11 +36,11 @@ be shown.
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- | | --------------------------------------- | ------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. | | `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
| `attr` <Tag variant="new">2.1</Tag> | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. | | `attr` <Tag variant="new">2.1</Tag> | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. ~~Union[int, str]~~ |
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. | | `validate` <Tag variant="new">2.1</Tag> | Validate patterns added to the matcher. ~~bool~~ |
## PhraseMatcher.\_\_call\_\_ {#call tag="method"} ## PhraseMatcher.\_\_call\_\_ {#call tag="method"}
@ -57,10 +57,10 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | ----------------------------------- |
| `doc` | `Doc` | The document to match over. | | `doc` | The document to match over. ~~Doc~~ |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. | | **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
<Infobox title="Note on retrieving the string representation of the match_id" variant="warning"> <Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
@ -87,11 +87,13 @@ Match a stream of documents, yielding them in turn.
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | -------- | --------------------------------------------------------- | | --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | iterable | A stream of documents. | | `docs` | A stream of documents. ~~Iterable[Doc]~~ |
| `batch_size` | int | The number of documents to accumulate into a working set. | | `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
| **YIELDS** | `Doc` | Documents, in order. | | `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
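
For example, a minimal sketch of streaming matches (assuming an `nlp` pipeline and a `matcher` with patterns already added):

```python
texts = ["Barack Obama urges Congress", "Angela Merkel visits Paris"]
docs = (nlp(text) for text in texts)
# return_matches=True yields (doc, matches) tuples instead of bare docs
for doc, matches in matcher.pipe(docs, return_matches=True):
    for match_id, start, end in matches:
        print(nlp.vocab.strings[match_id], doc[start:end].text)
```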
## PhraseMatcher.\_\_len\_\_ {#len tag="method"} ## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
@ -108,9 +110,9 @@ patterns.
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------- | | ----------- | ---------------------------- |
| **RETURNS** | int | The number of rules. | | **RETURNS** | The number of rules. ~~int~~ |
## PhraseMatcher.\_\_contains\_\_ {#contains tag="method"} ## PhraseMatcher.\_\_contains\_\_ {#contains tag="method"}
@ -125,10 +127,10 @@ Check whether the matcher contains rules for a match ID.
> assert "OBAMA" in matcher > assert "OBAMA" in matcher
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------------------------- | | ----------- | -------------------------------------------------------------- |
| `key` | str | The match ID. | | `key` | The match ID. ~~str~~ |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | | **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
## PhraseMatcher.add {#add tag="method"} ## PhraseMatcher.add {#add tag="method"}
@ -165,12 +167,12 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
</Infobox> </Infobox>
| Name | Type | Description | | Name | Description |
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- | | -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. | | `match_id` | An ID for the thing you're matching. ~~str~~ |
| `docs` | list | `Doc` objects of the phrases to match. | | `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ |
## PhraseMatcher.remove {#remove tag="method" new="2.2"} ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
@ -187,6 +189,6 @@ does not exist.
> assert "OBAMA" not in matcher > assert "OBAMA" not in matcher
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----- | ---- | ------------------------- | | ----- | --------------------------------- |
| `key` | str | The ID of the match rule. | | `key` | The ID of the match rule. ~~str~~ |

View File

@ -45,12 +45,12 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe). [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Type | Description | | Name | Description |
| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------- | | ------- | ------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `**cfg` | | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. | | `**cfg` | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
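
For instance, a hedged sketch contrasting the shortcut with direct construction (assuming the built-in `tagger` factory; the low-level call would also need a Thinc `model`):

```python
import spacy

nlp = spacy.blank("en")
# Preferred: construct the component via its registered string name
tagger = nlp.add_pipe("tagger")
# Direct construction is rarely needed and requires a Thinc model, e.g.:
# from spacy.pipeline import Tagger
# tagger = Tagger(nlp.vocab, model, name="tagger")
```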
## Pipe.\_\_call\_\_ {#call tag="method"} ## Pipe.\_\_call\_\_ {#call tag="method"}
@ -70,10 +70,10 @@ and all pipeline components are applied to the `Doc` in order. Both
> processed = pipe(doc) > processed = pipe(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## Pipe.pipe {#pipe tag="method"} ## Pipe.pipe {#pipe tag="method"}
@ -91,12 +91,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | The processed documents in order. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Pipe.begin_training {#begin_training tag="method"} ## Pipe.begin_training {#begin_training tag="method"}
@ -116,13 +116,13 @@ setting up the label scheme based on the data.
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/pipe#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Pipe.predict {#predict tag="method"} ## Pipe.predict {#predict tag="method"}
@ -142,10 +142,10 @@ This method needs to be overwritten with your own custom `predict` method.
> scores = pipe.predict([doc1, doc2]) > scores = pipe.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. |
## Pipe.set_annotations {#set_annotations tag="method"} ## Pipe.set_annotations {#set_annotations tag="method"}
@ -166,10 +166,10 @@ method.
> pipe.set_annotations(docs, scores) > pipe.set_annotations(docs, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ---------------------------------------------- | | -------- | ------------------------------------------------ |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `Pipe.predict`. | | `scores` | The scores to set, produced by `Pipe.predict`. |
## Pipe.update {#update tag="method"} ## Pipe.update {#update tag="method"}
@ -184,15 +184,15 @@ predictions and gold-standard annotations, and update the component's model.
> losses = pipe.update(examples, sgd=optimizer) > losses = pipe.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/pipe#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Pipe.rehearse {#rehearse tag="method,experimental" new="3"} ## Pipe.rehearse {#rehearse tag="method,experimental" new="3"}
@ -208,14 +208,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
> losses = pipe.rehearse(examples, sgd=optimizer) > losses = pipe.rehearse(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Pipe.get_loss {#get_loss tag="method"} ## Pipe.get_loss {#get_loss tag="method"}
@ -230,11 +230,11 @@ predicted scores.
> loss, d_loss = ner.get_loss(examples, scores) > loss, d_loss = ner.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## Pipe.score {#score tag="method" new="3"} ## Pipe.score {#score tag="method" new="3"}
@ -246,10 +246,10 @@ Score a batch of examples.
> scores = pipe.score(examples) > scores = pipe.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | --------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, e.g. produced by the [`Scorer`](/api/scorer). | | **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Pipe.create_optimizer {#create_optimizer tag="method"} ## Pipe.create_optimizer {#create_optimizer tag="method"}
@ -263,26 +263,9 @@ Create an optimizer for the pipeline component. Defaults to
> optimizer = pipe.create_optimizer() > optimizer = pipe.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Pipe.add_label {#add_label tag="method"}
Add a new label to the pipe. It's possible to extend pretrained models with new
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> pipe.add_label("MY_LABEL")
> ```
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------------- |
| `label` | str | The label to add. |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## Pipe.use_params {#use_params tag="method, contextmanager"} ## Pipe.use_params {#use_params tag="method, contextmanager"}
@ -297,9 +280,26 @@ context, the original parameters are restored.
> pipe.to_disk("/best_model") > pipe.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## Pipe.add_label {#add_label tag="method"}
Add a new label to the pipe. It's possible to extend pretrained models with new
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
> #### Example
>
> ```python
> pipe = nlp.add_pipe("your_custom_pipe")
> pipe.add_label("MY_LABEL")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------- |
| `label` | The label to add. ~~str~~ |
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
## Pipe.to_disk {#to_disk tag="method"} ## Pipe.to_disk {#to_disk tag="method"}
@ -312,11 +312,11 @@ Serialize the pipe to disk.
> pipe.to_disk("/path/to/pipe") > pipe.to_disk("/path/to/pipe")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Pipe.from_disk {#from_disk tag="method"} ## Pipe.from_disk {#from_disk tag="method"}
@ -329,12 +329,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> pipe.from_disk("/path/to/pipe") > pipe.from_disk("/path/to/pipe")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Pipe` | The modified pipe. | | **RETURNS** | The modified pipe. ~~Pipe~~ |
## Pipe.to_bytes {#to_bytes tag="method"} ## Pipe.to_bytes {#to_bytes tag="method"}
@ -347,11 +347,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the pipe. | | **RETURNS** | The serialized form of the pipe. ~~bytes~~ |
## Pipe.from_bytes {#from_bytes tag="method"} ## Pipe.from_bytes {#from_bytes tag="method"}
@ -365,21 +365,21 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> pipe.from_bytes(pipe_bytes) > pipe.from_bytes(pipe_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Pipe` | The pipe. | | **RETURNS** | The pipe. ~~Pipe~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| ------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------- | | ------- | ------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary that's passed in on initialization. | | `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model powering the component. | | `model` | The model powering the component. ~~Model[List[Doc], Any]~~ |
| `name` | str | The name of the component instance in the pipeline. Can be used in the losses. | | `name` | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~ |
| `cfg` | dict | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. | | `cfg` | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -33,10 +33,10 @@ all other components.
</Infobox> </Infobox>
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------ | | ----------- | -------------------------------------------------------------------- |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| **RETURNS** | `Doc` | The modified `Doc` with merged noun chunks. | | **RETURNS** | The modified `Doc` with merged noun chunks. ~~Doc~~ |
## merge_entities {#merge_entities tag="function"} ## merge_entities {#merge_entities tag="function"}
@ -63,10 +63,10 @@ components to the end of the pipeline and after all other components.
</Infobox> </Infobox>
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------ | | ----------- | -------------------------------------------------------------------- |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| **RETURNS** | `Doc` | The modified `Doc` with merged entities. | | **RETURNS** | The modified `Doc` with merged entities. ~~Doc~~ |
## merge_subtokens {#merge_subtokens tag="function" new="2.1"} ## merge_subtokens {#merge_subtokens tag="function" new="2.1"}
@ -102,8 +102,8 @@ end of the pipeline and after all other components.
</Infobox> </Infobox>
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------ | | ----------- | -------------------------------------------------------------------- |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. | | `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ |
| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | | **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ |
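
As a brief illustration, a sketch of adding one of these components to a pipeline (assuming the `en_core_web_sm` package is installed and its NER model tags the full name as one entity):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
# merge_entities retokenizes the Doc so each entity becomes a single token
nlp.add_pipe("merge_entities")
doc = nlp("Barack Obama was born in Hawaii")
print([token.text for token in doc])
# e.g. ['Barack Obama', 'was', 'born', 'in', 'Hawaii']
```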

View File

@ -27,9 +27,9 @@ Create a new `Scorer`.
> scorer = Scorer(nlp) > scorer = Scorer(nlp)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. | | `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ |
## Scorer.score {#score tag="method"} ## Scorer.score {#score tag="method"}
@ -55,10 +55,10 @@ attribute being scored:
> scores = scorer.score(examples) > scores = scorer.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict` | A dictionary of scores. | | **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"} ## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"}
@ -74,10 +74,10 @@ Scores the tokenization:
> scores = Scorer.score_tokenization(examples) > scores = Scorer.score_tokenization(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. | | **RETURNS** | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]~~ |
## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"} ## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}
@ -90,18 +90,19 @@ Scores a single token attribute.
> print(scores["pos_acc"]) > print(scores["pos_acc"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | `str` | The attribute to score. | | `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | | `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. | | **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"} ## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}
Scores a single token attribute per feature for a token attribute in Scores a single token attribute per feature for a token attribute in the
[UFEATS](https://universaldependencies.org/format.html#morphological-annotation) Universal Dependencies
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
format. format.
> #### Example > #### Example
@ -111,13 +112,13 @@ format.
> print(scores["morph_per_feat"]) > print(scores["morph_per_feat"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | `str` | The attribute to score. | | `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | | `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. | | **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"} ## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
@ -130,13 +131,13 @@ Returns PRF scores for labeled or unlabeled spans.
> print(scores["ents_f"]) > print(scores["ents_f"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | `str` | The attribute to score. | | `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. | | `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. | | **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
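A minimal, self-contained sketch of scoring entity spans by hand, assuming spaCy v3 with `Example` importable from `spacy.training`; the sentence, character offsets and labels are invented for illustration:

```python
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.blank("en")
text = "Facebook was founded by Mark Zuckerberg."
# Pretend prediction: only the person was recognized
pred = nlp.make_doc(text)
pred.ents = [pred.char_span(24, 39, label="PERSON")]
# Gold standard: both the organization and the person
example = Example.from_dict(pred, {"entities": [(0, 8, "ORG"), (24, 39, "PERSON")]})
scores = Scorer.score_spans([example], "ents")
print(scores["ents_p"], scores["ents_r"], scores["ents_f"])
print(scores["ents_per_type"])
```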
## Scorer.score_deps {#score_deps tag="staticmethod" new="3"} ## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
@ -159,16 +160,16 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
> print(scores["dep_uas"], scores["dep_las"]) > print(scores["dep_uas"], scores["dep_las"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- | | --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | `str` | The attribute containing the dependency label. | | `attr` | The attribute containing the dependency label. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. | | `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
| `head_attr` | `str` | The attribute containing the head token. | | `head_attr` | The attribute containing the head token. ~~str~~ |
| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. | | `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~ |
| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). | | `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
| **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. | | **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"} ## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
@ -195,13 +196,13 @@ depends on the scorer settings:
> print(scores["cats_macro_auc"]) > print(scores["cats_macro_auc"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- | | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. | | `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
| `attr` | `str` | The attribute to score. | | `attr` | The attribute to score. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. | | `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. | | `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. | | `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. | | `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. | | **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
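A minimal sketch of scoring exclusive text categories by hand, assuming spaCy v3; the labels and the predicted probabilities are made up for illustration:

```python
import spacy
from spacy.scorer import Scorer
from spacy.training import Example

nlp = spacy.blank("en")
pred = nlp.make_doc("This is great")
pred.cats = {"POSITIVE": 0.9, "NEGATIVE": 0.1}  # pretend model output
example = Example.from_dict(pred, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}})
scores = Scorer.score_cats(
    [example],
    "cats",
    labels=["POSITIVE", "NEGATIVE"],
    multi_label=False,
    positive_label="POSITIVE",
)
# Scores that don't apply to this configuration are None
print({key: value for key, value in scores.items() if value is not None})
```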
View File
@ -29,9 +29,9 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("senter", config=config) > nlp.add_pipe("senter", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ------- | ------------------------------------------ | ----------------- | ----------------------------------- | | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx
@ -60,11 +60,11 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description | | Name | Description |
| ------- | ------- | ------------------------------------------------------------------------------------------- | | ------- | -------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
## SentenceRecognizer.\_\_call\_\_ {#call tag="method"} ## SentenceRecognizer.\_\_call\_\_ {#call tag="method"}
@ -85,10 +85,10 @@ and all pipeline components are applied to the `Doc` in order. Both
> processed = senter(doc) > processed = senter(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## SentenceRecognizer.pipe {#pipe tag="method"} ## SentenceRecognizer.pipe {#pipe tag="method"}
@ -107,12 +107,12 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## SentenceRecognizer.begin_training {#begin_training tag="method"} ## SentenceRecognizer.begin_training {#begin_training tag="method"}
@ -132,13 +132,13 @@ setting up the label scheme based on the data.
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
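As a rough sketch of how `begin_training` fits into a training loop with toy data (assuming spaCy v3 and the v3 `Example` API; the example sentence, the sentence-start flags and the number of iterations are arbitrary):

```python
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
nlp.add_pipe("senter")
# One toy example: token-level sentence-start flags for
# "I like cats. They purr." -> 7 tokens
train_data = [
    ("I like cats. They purr.", {"sent_starts": [1, 0, 0, 0, 1, 0, 0]}),
]
optimizer = nlp.begin_training()
for i in range(20):
    losses = {}
    for text, annots in train_data:
        example = Example.from_dict(nlp.make_doc(text), annots)
        nlp.update([example], sgd=optimizer, losses=losses)
    print(i, losses["senter"])
```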
## SentenceRecognizer.predict {#predict tag="method"} ## SentenceRecognizer.predict {#predict tag="method"}
@ -152,10 +152,10 @@ modifying them.
> scores = senter.predict([doc1, doc2]) > scores = senter.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. |
## SentenceRecognizer.set_annotations {#set_annotations tag="method"} ## SentenceRecognizer.set_annotations {#set_annotations tag="method"}
@ -169,10 +169,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> senter.set_annotations([doc1, doc2], scores) > senter.set_annotations([doc1, doc2], scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ------------------------------------------------------------ | | -------- | ------------------------------------------------------------ |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `SentenceRecognizer.predict`. | | `scores` | The scores to set, produced by `SentenceRecognizer.predict`. |
## SentenceRecognizer.update {#update tag="method"} ## SentenceRecognizer.update {#update tag="method"}
@ -189,15 +189,15 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
> losses = senter.update(examples, sgd=optimizer) > losses = senter.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"} ## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
@ -213,14 +213,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
> losses = senter.rehearse(examples, sgd=optimizer) > losses = senter.rehearse(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## SentenceRecognizer.get_loss {#get_loss tag="method"} ## SentenceRecognizer.get_loss {#get_loss tag="method"}
@ -235,11 +235,11 @@ predicted scores.
> loss, d_loss = senter.get_loss(examples, scores) > loss, d_loss = senter.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | - | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## SentenceRecognizer.score {#score tag="method" new="3"} ## SentenceRecognizer.score {#score tag="method" new="3"}
@ -251,10 +251,10 @@ Score a batch of examples.
> scores = senter.score(examples) > scores = senter.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | ------------------------------------------------------------------------ | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). | | **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) for the attribute `"sents"`. ~~Dict[str, float]~~ |
## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"} ## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"}
@ -267,9 +267,9 @@ Create an optimizer for the pipeline component.
> optimizer = senter.create_optimizer() > optimizer = senter.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## SentenceRecognizer.use_params {#use_params tag="method, contextmanager"} ## SentenceRecognizer.use_params {#use_params tag="method, contextmanager"}
@ -284,9 +284,9 @@ context, the original parameters are restored.
> senter.to_disk("/best_model") > senter.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## SentenceRecognizer.to_disk {#to_disk tag="method"} ## SentenceRecognizer.to_disk {#to_disk tag="method"}
@ -299,11 +299,11 @@ Serialize the pipe to disk.
> senter.to_disk("/path/to/senter") > senter.to_disk("/path/to/senter")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## SentenceRecognizer.from_disk {#from_disk tag="method"} ## SentenceRecognizer.from_disk {#from_disk tag="method"}
@ -316,12 +316,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> senter.from_disk("/path/to/senter") > senter.from_disk("/path/to/senter")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | -------------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. | | **RETURNS** | The modified `SentenceRecognizer` object. ~~SentenceRecognizer~~ |
## SentenceRecognizer.to_bytes {#to_bytes tag="method"} ## SentenceRecognizer.to_bytes {#to_bytes tag="method"}
@ -334,11 +334,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. | | **RETURNS** | The serialized form of the `SentenceRecognizer` object. ~~bytes~~ |
## SentenceRecognizer.from_bytes {#from_bytes tag="method"} ## SentenceRecognizer.from_bytes {#from_bytes tag="method"}
@ -352,12 +352,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> senter.from_bytes(senter_bytes) > senter.from_bytes(senter_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | -------------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. | | **RETURNS** | The `SentenceRecognizer` object. ~~SentenceRecognizer~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}
View File
@ -28,9 +28,9 @@ how the component should be configured. You can override its settings via the
> nlp.add_pipe("entity_ruler", config=config) > nlp.add_pipe("entity_ruler", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ------------- | ----------- | ---------------------------------------------------------------------------------------------------------- | ------- | | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. | `None` | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx
@ -51,10 +51,10 @@ Initialize the sentencizer.
> sentencizer = Sentencizer() > sentencizer = Sentencizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ----------- | ----------------------------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. | | `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ |
```python ```python
### punct_chars defaults ### punct_chars defaults
@ -87,10 +87,10 @@ the component has been added to the pipeline using
> assert len(list(doc.sents)) == 2 > assert len(list(doc.sents)) == 2
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------ | | ----------- | -------------------------------------------------------------------- |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. | | **RETURNS** | The modified `Doc` with added sentence boundaries. ~~Doc~~ |
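A small end-to-end sketch of the rule-based sentencizer with custom punctuation, assuming spaCy v3; using `"|"` as a sentence boundary marker is purely illustrative:

```python
import spacy

nlp = spacy.blank("en")
# Override the default punctuation via the component config
nlp.add_pipe("sentencizer", config={"punct_chars": ["|", ".", "!", "?"]})
doc = nlp("First entry | second entry. Third entry!")
print([sent.text for sent in doc.sents])
```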
## Sentencizer.pipe {#pipe tag="method"} ## Sentencizer.pipe {#pipe tag="method"}
@ -106,12 +106,12 @@ applied to the `Doc` in order.
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | The processed documents in order. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Sentencizer.score {#score tag="method" new="3"} ## Sentencizer.score {#score tag="method" new="3"}
@ -123,10 +123,10 @@ Score a batch of examples.
> scores = sentencizer.score(examples) > scores = sentencizer.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | ------------------------------------------------------------------------ | | ----------- | --------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). | | **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## Sentencizer.to_disk {#to_disk tag="method"} ## Sentencizer.to_disk {#to_disk tag="method"}
@ -142,9 +142,9 @@ a file `sentencizer.json`. This also happens automatically when you save an
> sentencizer.to_disk("/path/to/sentencizer.json") > sentencizer.to_disk("/path/to/sentencizer.json")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## Sentencizer.from_disk {#from_disk tag="method"} ## Sentencizer.from_disk {#from_disk tag="method"}
@ -159,10 +159,10 @@ added to its pipeline.
> sentencizer.from_disk("/path/to/sentencizer.json") > sentencizer.from_disk("/path/to/sentencizer.json")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------- | -------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | | `path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | | **RETURNS** | The modified `Sentencizer` object. ~~Sentencizer~~ |
## Sentencizer.to_bytes {#to_bytes tag="method"} ## Sentencizer.to_bytes {#to_bytes tag="method"}
@ -176,9 +176,9 @@ Serialize the sentencizer settings to a bytestring.
> sentencizer_bytes = sentencizer.to_bytes() > sentencizer_bytes = sentencizer.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------- | | ----------- | ------------------------------ |
| **RETURNS** | bytes | The serialized data. | | **RETURNS** | The serialized data. ~~bytes~~ |
## Sentencizer.from_bytes {#from_bytes tag="method"} ## Sentencizer.from_bytes {#from_bytes tag="method"}
@ -192,7 +192,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> sentencizer.from_bytes(sentencizer_bytes) > sentencizer.from_bytes(sentencizer_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ------------- | ---------------------------------- | | ------------ | -------------------------------------------------- |
| `bytes_data` | bytes | The bytestring to load. | | `bytes_data` | The bytestring to load. ~~bytes~~ |
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | | **RETURNS** | The modified `Sentencizer` object. ~~Sentencizer~~ |
View File
@ -18,14 +18,14 @@ Create a Span object from the slice `doc[start : end]`.
> assert [t.text for t in span] == ["it", "back", "!"] > assert [t.text for t in span] == ["it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- | | -------- | --------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
| `start` | int | The index of the first token of the span. | | `start` | The index of the first token of the span. ~~int~~ |
| `end` | int | The index of the first token after the span. | | `end` | The index of the first token after the span. ~~int~~ |
| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. | | `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
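A minimal sketch of constructing a `Span` directly with a label, assuming a blank English pipeline; the label name is arbitrary:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Give it back! He pleaded.")
span = Span(doc, 4, 6, label="SPEAKER")  # tokens 4-5: "He pleaded"
print(span.text, span.label_)
```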
## Span.\_\_getitem\_\_ {#getitem tag="method"} ## Span.\_\_getitem\_\_ {#getitem tag="method"}
@ -39,10 +39,10 @@ Get a `Token` object.
> assert span[1].text == "back" > assert span[1].text == "back"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | --------------------------------------- | | ----------- | ----------------------------------------------- |
| `i` | int | The index of the token within the span. | | `i` | The index of the token within the span. ~~int~~ |
| **RETURNS** | `Token` | The token at `span[i]`. | | **RETURNS** | The token at `span[i]`. ~~Token~~ |
Get a `Span` object. Get a `Span` object.
@ -54,10 +54,10 @@ Get a `Span` object.
> assert span[1:3].text == "back!" > assert span[1:3].text == "back!"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------ | -------------------------------- | | ----------- | ------------------------------------------------- |
| `start_end` | tuple | The slice of the span to get. | | `start_end` | The slice of the span to get. ~~Tuple[int, int]~~ |
| **RETURNS** | `Span` | The span at `span[start : end]`. | | **RETURNS** | The span at `span[start : end]`. ~~Span~~ |
## Span.\_\_iter\_\_ {#iter tag="method"} ## Span.\_\_iter\_\_ {#iter tag="method"}
@ -71,9 +71,9 @@ Iterate over `Token` objects.
> assert [t.text for t in span] == ["it", "back", "!"] > assert [t.text for t in span] == ["it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | ----------------- | | ---------- | --------------------------- |
| **YIELDS** | `Token` | A `Token` object. | | **YIELDS** | A `Token` object. ~~Token~~ |
## Span.\_\_len\_\_ {#len tag="method"} ## Span.\_\_len\_\_ {#len tag="method"}
@ -87,9 +87,9 @@ Get the number of tokens in the span.
> assert len(span) == 3 > assert len(span) == 3
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------- | | ----------- | ----------------------------------------- |
| **RETURNS** | int | The number of tokens in the span. | | **RETURNS** | The number of tokens in the span. ~~int~~ |
## Span.set_extension {#set_extension tag="classmethod" new="2"} ## Span.set_extension {#set_extension tag="classmethod" new="2"}
@ -107,14 +107,14 @@ For details, see the documentation on
> assert doc[1:4]._.has_city > assert doc[1:4]._.has_city
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. | | `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. ~~str~~ |
| `default` | - | Optional default value of the attribute if no getter or method is defined. | | `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ |
| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `method` | Set a custom method on the object, for example `span._.compare(other_span)`. ~~Optional[Callable[[Span, ...], Any]]~~ |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | | `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Span], Any]]~~ |
| `setter` | callable | Setter function that takes the `Span` and a value, and modifies the object. Is called when the user writes to the `Span._` attribute. | | `setter` | Setter function that takes the `Span` and a value, and modifies the object. Is called when the user writes to the `Span._` attribute. ~~Optional[Callable[[Span, Any], None]]~~ |
| `force` | bool | Force overwriting existing attribute. | | `force` | Force overwriting existing attribute. ~~bool~~ |
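Another small sketch of a getter-based extension (assuming spaCy v3; the word list is invented), using `force=True` so re-running the snippet doesn't raise an error for an already registered extension:

```python
import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
positive_words = {"love", "great"}
Span.set_extension(
    "has_positive_word",
    getter=lambda span: any(token.lower_ in positive_words for token in span),
    force=True,
)
doc = nlp("I love New York")
print(doc[0:2]._.has_positive_word)  # True: "I love" contains "love"
```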
## Span.get_extension {#get_extension tag="classmethod" new="2"} ## Span.get_extension {#get_extension tag="classmethod" new="2"}
@ -131,10 +131,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None) > assert extension == (False, None, None, None)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the extension. | | `name` | Name of the extension. ~~str~~ |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | | **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Span.has_extension {#has_extension tag="classmethod" new="2"} ## Span.has_extension {#has_extension tag="classmethod" new="2"}
@ -148,10 +148,10 @@ Check whether an extension has been registered on the `Span` class.
> assert Span.has_extension("is_city") > assert Span.has_extension("is_city")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------ | | ----------- | --------------------------------------------------- |
| `name` | str | Name of the extension to check. | | `name` | Name of the extension to check. ~~str~~ |
| **RETURNS** | bool | Whether the extension has been registered. | | **RETURNS** | Whether the extension has been registered. ~~bool~~ |
## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@ -166,10 +166,10 @@ Remove a previously registered extension.
> assert not Span.has_extension("is_city") > assert not Span.has_extension("is_city")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the extension. | | `name` | Name of the extension. ~~str~~ |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | | **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Span.char_span {#char_span tag="method" new="2.2.4"} ## Span.char_span {#char_span tag="method" new="2.2.4"}
@ -184,14 +184,14 @@ the character indices don't map to a valid span.
> assert span.text == "New York" > assert span.text == "New York"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------------------------- | | ------------------------------------ | ----------------------------------------------------------------------------------------- |
| `start` | int | The index of the first character of the span. | | `start` | The index of the first character of the span. ~~int~~ |
| `end` | int | The index of the last character after the span. | | `end` | The index of the last character after the span. ~~int~~ |
| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. | | `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. | | `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. | | `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| **RETURNS** | `Span` | The newly constructed object or `None`. | | **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
## Span.similarity {#similarity tag="method" model="vectors"} ## Span.similarity {#similarity tag="method" model="vectors"}
@ -209,10 +209,10 @@ using an average of word vectors.
> assert apples_oranges == oranges_apples > assert apples_oranges == oranges_apples
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | | `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
| **RETURNS** | float | A scalar similarity score. Higher is more similar. | | **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
## Span.get_lca_matrix {#get_lca_matrix tag="method"} ## Span.get_lca_matrix {#get_lca_matrix tag="method"}
@ -229,9 +229,9 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
> # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32) > # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------------------------------------- | ------------------------------------------------ | | ----------- | --------------------------------------------------------------------------------------- |
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Span`. | | **RETURNS** | The lowest common ancestor matrix of the `Span`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
## Span.to_array {#to_array tag="method" new="2"} ## Span.to_array {#to_array tag="method" new="2"}
@ -249,10 +249,10 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
> np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA]) > np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------------- | -------------------------------------------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
| `attr_ids` | list | A list of attribute ID ints. | | `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ |
| **RETURNS** | `numpy.ndarray[long, ndim=2]` | A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. | | **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ |
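A minimal sketch, assuming spaCy v3; note that with a blank pipeline, attributes filled in by statistical components (like `POS`) come out as zeros:

```python
import spacy
from spacy.attrs import LOWER, IS_ALPHA, IS_PUNCT

nlp = spacy.blank("en")
doc = nlp("Give it back! He pleaded.")
span = doc[0:4]  # "Give it back!"
arr = span.to_array([LOWER, IS_ALPHA, IS_PUNCT])
print(arr.shape)  # (4, 3): one row per token, one column per attribute
```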
## Span.ents {#ents tag="property" new="2.0.13" model="ner"} ## Span.ents {#ents tag="property" new="2.0.13" model="ner"}
@ -270,9 +270,9 @@ if the entity recognizer has been applied.
> assert ents[0].text == "Mr. Best" > assert ents[0].text == "Mr. Best"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------- | | ----------- | ----------------------------------------------------------------- |
| **RETURNS** | tuple | Entities in the span, one `Span` per entity. | | **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
## Span.as_doc {#as_doc tag="method"} ## Span.as_doc {#as_doc tag="method"}
@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
> assert doc2.text == "New York" > assert doc2.text == "New York"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------------- | ----- | ---------------------------------------------------- | | ---------------- | ------------------------------------------------------------- |
| `copy_user_data` | bool | Whether or not to copy the original doc's user data. | | `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ |
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. | | **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ |
## Span.root {#root tag="property" model="parser"} ## Span.root {#root tag="property" model="parser"}
@ -309,9 +309,9 @@ taken.
> assert new_york.root.text == "York" > assert new_york.root.text == "York"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | --------------- | | ----------- | ------------------------- |
| **RETURNS** | `Token` | The root token. | | **RETURNS** | The root token. ~~Token~~ |
## Span.conjuncts {#conjuncts tag="property" model="parser"} ## Span.conjuncts {#conjuncts tag="property" model="parser"}
@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`.
> assert [t.text for t in apples_conjuncts] == ["oranges"] > assert [t.text for t in apples_conjuncts] == ["oranges"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | ----------------------- | | ----------- | --------------------------------------------- |
| **RETURNS** | `tuple` | The coordinated tokens. | | **RETURNS** | The coordinated tokens. ~~Tuple[Token, ...]~~ |
## Span.lefts {#lefts tag="property" model="parser"} ## Span.lefts {#lefts tag="property" model="parser"}
@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span.
> assert lefts == ["New"] > assert lefts == ["New"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | ------------------------------------ | | ---------- | ---------------------------------------------- |
| **YIELDS** | `Token` | A left-child of a token of the span. | | **YIELDS** | A left-child of a token of the span. ~~Token~~ |
## Span.rights {#rights tag="property" model="parser"} ## Span.rights {#rights tag="property" model="parser"}
@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span.
> assert rights == ["in"] > assert rights == ["in"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | ------------------------------------- | | ---------- | ----------------------------------------------- |
| **YIELDS** | `Token` | A right-child of a token of the span. | | **YIELDS** | A right-child of a token of the span. ~~Token~~ |
## Span.n_lefts {#n_lefts tag="property" model="parser"} ## Span.n_lefts {#n_lefts tag="property" model="parser"}
@ -373,9 +373,9 @@ the span.
> assert doc[3:7].n_lefts == 1 > assert doc[3:7].n_lefts == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------- | | ----------- | ---------------------------------------- |
| **RETURNS** | int | The number of left-child tokens. | | **RETURNS** | The number of left-child tokens. ~~int~~ |
## Span.n_rights {#n_rights tag="property" model="parser"} ## Span.n_rights {#n_rights tag="property" model="parser"}
@ -389,9 +389,9 @@ the span.
> assert doc[2:4].n_rights == 1 > assert doc[2:4].n_rights == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------- | | ----------- | ----------------------------------------- |
| **RETURNS** | int | The number of right-child tokens. | | **RETURNS** | The number of right-child tokens. ~~int~~ |
## Span.subtree {#subtree tag="property" model="parser"} ## Span.subtree {#subtree tag="property" model="parser"}
@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them.
> assert subtree == ["Give", "it", "back", "!"] > assert subtree == ["Give", "it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | ------------------------------------------------- | | ---------- | ----------------------------------------------------------- |
| **YIELDS** | `Token` | A token within the span, or a descendant from it. | | **YIELDS** | A token within the span, or a descendant from it. ~~Token~~ |
## Span.has_vector {#has_vector tag="property" model="vectors"} ## Span.has_vector {#has_vector tag="property" model="vectors"}
@ -420,9 +420,9 @@ A boolean value indicating whether a word vector is associated with the object.
> assert doc[1:].has_vector > assert doc[1:].has_vector
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------- | | ----------- | ----------------------------------------------------- |
| **RETURNS** | bool | Whether the span has a vector data attached. | | **RETURNS** | Whether the span has a vector data attached. ~~bool~~ |
## Span.vector {#vector tag="property" model="vectors"} ## Span.vector {#vector tag="property" model="vectors"}
@ -437,9 +437,9 @@ vectors.
> assert doc[1:].vector.shape == (300,) > assert doc[1:].vector.shape == (300,)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------------------------- | --------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the span's semantics. | | **RETURNS** | A 1-dimensional array representing the span's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Span.vector_norm {#vector_norm tag="property" model="vectors"} ## Span.vector_norm {#vector_norm tag="property" model="vectors"}
@ -454,31 +454,31 @@ The L2 norm of the span's vector representation.
> assert doc[1:].vector_norm != doc[2:].vector_norm > assert doc[1:].vector_norm != doc[2:].vector_norm
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ----------------------------------------- | | ----------- | --------------------------------------------------- |
| **RETURNS** | float | The L2 norm of the vector representation. | | **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| --------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------- | | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The span's slice of the parent `Doc`'s tensor. | | `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
| `sent` | `Span` | The sentence span that this span is a part of. | | `sent` | The sentence span that this span is a part of. ~~Span~~ |
| `start` | int | The token offset for the start of the span. | | `start` | The token offset for the start of the span. ~~int~~ |
| `end` | int | The token offset for the end of the span. | | `end` | The token offset for the end of the span. ~~int~~ |
| `start_char` | int | The character offset for the start of the span. | | `start_char` | The character offset for the start of the span. ~~int~~ |
| `end_char` | int | The character offset for the end of the span. | | `end_char` | The character offset for the end of the span. ~~int~~ |
| `text` | str | A string representation of the span text. | | `text` | A string representation of the span text. ~~str~~ |
| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. | | `text_with_ws` | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~ |
| `orth` | int | ID of the verbatim text content. | | `orth` | ID of the verbatim text content. ~~int~~ |
| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | | `orth_` | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `label` | int | The hash value of the span's label. | | `label` | The hash value of the span's label. ~~int~~ |
| `label_` | str | The span's label. | | `label_` | The span's label. ~~str~~ |
| `lemma_` | str | The span's lemma. | | `lemma_` | The span's lemma. Equivalent to `"".join(token.lemma_ + token.whitespace_ for token in span).strip()`. ~~str~~ |
| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | | `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
| `kb_id_` | str | The knowledge base ID referred to by the span. | | `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
| `ent_id` | int | The hash value of the named entity the token is an instance of. | | `ent_id` | The hash value of the named entity the token is an instance of. ~~int~~ |
| `ent_id_` | str | The string ID of the named entity the token is an instance of. | | `ent_id_` | The string ID of the named entity the token is an instance of. ~~str~~ |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. | | `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
View File
@ -19,9 +19,9 @@ Create the `StringStore`.
> stringstore = StringStore(["apple", "orange"]) > stringstore = StringStore(["apple", "orange"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | -------- | ------------------------------------------ | | --------- | ---------------------------------------------------------------------- |
| `strings` | iterable | A sequence of strings to add to the store. | | `strings` | A sequence of strings to add to the store. ~~Optional[Iterable[str]]~~ |
## StringStore.\_\_len\_\_ {#len tag="method"} ## StringStore.\_\_len\_\_ {#len tag="method"}
@ -34,9 +34,9 @@ Get the number of strings in the store.
> assert len(stringstore) == 2 > assert len(stringstore) == 2
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------- | | ----------- | ------------------------------------------- |
| **RETURNS** | int | The number of strings in the store. | | **RETURNS** | The number of strings in the store. ~~int~~ |
## StringStore.\_\_getitem\_\_ {#getitem tag="method"} ## StringStore.\_\_getitem\_\_ {#getitem tag="method"}
@ -51,10 +51,10 @@ Retrieve a string from a given hash, or vice versa.
> assert stringstore[apple_hash] == "apple" > assert stringstore[apple_hash] == "apple"
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | -------------------- | -------------------------- | | -------------- | ----------------------------------------------- |
| `string_or_id` | bytes, str or uint64 | The value to encode. | | `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ |
| **RETURNS** | str or int | The value to be retrieved. | | **RETURNS** | The value to be retrieved. ~~Union[str, int]~~ |
## StringStore.\_\_contains\_\_ {#contains tag="method"} ## StringStore.\_\_contains\_\_ {#contains tag="method"}
@ -68,15 +68,15 @@ Check whether a string is in the store.
> assert not "cherry" in stringstore > assert not "cherry" in stringstore
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------- | | ----------- | ----------------------------------------------- |
| `string` | str | The string to check. | | `string` | The string to check. ~~str~~ |
| **RETURNS** | bool | Whether the store contains the string. | | **RETURNS** | Whether the store contains the string. ~~bool~~ |
## StringStore.\_\_iter\_\_ {#iter tag="method"} ## StringStore.\_\_iter\_\_ {#iter tag="method"}
Iterate over the strings in the store, in order. Note that a newly initialized Iterate over the strings in the store, in order. Note that a newly initialized
store will always include an empty string `''` at position `0`. store will always include an empty string `""` at position `0`.
> #### Example > #### Example
> >
@ -86,9 +86,9 @@ store will always include an empty string `''` at position `0`.
> assert all_strings == ["apple", "orange"] > assert all_strings == ["apple", "orange"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---- | ---------------------- | | ---------- | ------------------------------ |
| **YIELDS** | str | A string in the store. | | **YIELDS** | A string in the store. ~~str~~ |
## StringStore.add {#add tag="method" new="2"} ## StringStore.add {#add tag="method" new="2"}
@ -105,10 +105,10 @@ Add a string to the `StringStore`.
> assert stringstore["banana"] == banana_hash > assert stringstore["banana"] == banana_hash
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------ | ------------------------ | | ----------- | -------------------------------- |
| `string` | str | The string to add. | | `string` | The string to add. ~~str~~ |
| **RETURNS** | uint64 | The string's hash value. | | **RETURNS** | The string's hash value. ~~int~~ |
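Taken together, the methods above give a simple two-way mapping between strings and their 64-bit hashes. A minimal sketch:

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
banana_hash = stringstore.add("banana")        # returns the 64-bit hash
assert len(stringstore) == 3
assert stringstore[banana_hash] == "banana"    # hash -> string
assert stringstore["banana"] == banana_hash    # string -> hash
assert "banana" in stringstore
```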
## StringStore.to_disk {#to_disk tag="method" new="2"} ## StringStore.to_disk {#to_disk tag="method" new="2"}
@ -120,9 +120,9 @@ Save the current state to a directory.
> stringstore.to_disk("/path/to/strings") > stringstore.to_disk("/path/to/strings")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## StringStore.from_disk {#from_disk tag="method" new="2"} ## StringStore.from_disk {#from_disk tag="method" new="2"}
@ -135,10 +135,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> stringstore = StringStore().from_disk("/path/to/strings") > stringstore = StringStore().from_disk("/path/to/strings")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------- | -------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | `StringStore` | The modified `StringStore` object. | | **RETURNS** | The modified `StringStore` object. ~~StringStore~~ |
## StringStore.to_bytes {#to_bytes tag="method"} ## StringStore.to_bytes {#to_bytes tag="method"}
@ -150,9 +150,9 @@ Serialize the current state to a binary string.
> store_bytes = stringstore.to_bytes() > store_bytes = stringstore.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------ | | ----------- | ---------------------------------------------------------- |
| **RETURNS** | bytes | The serialized form of the `StringStore` object. | | **RETURNS** | The serialized form of the `StringStore` object. ~~bytes~~ |
## StringStore.from_bytes {#from_bytes tag="method"} ## StringStore.from_bytes {#from_bytes tag="method"}
@ -166,10 +166,10 @@ Load state from a binary string.
> new_store = StringStore().from_bytes(store_bytes) > new_store = StringStore().from_bytes(store_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ------------- | ------------------------- | | ------------ | ----------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| **RETURNS** | `StringStore` | The `StringStore` object. | | **RETURNS** | The `StringStore` object. ~~StringStore~~ |
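A quick round-trip sketch combining `to_bytes` and `from_bytes`:

```python
from spacy.strings import StringStore

store_bytes = StringStore(["apple", "orange"]).to_bytes()
new_store = StringStore().from_bytes(store_bytes)
assert "apple" in new_store
```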
## Utilities {#util} ## Utilities {#util}
@ -184,7 +184,7 @@ Get a 64-bit hash for a given string.
> assert hash_string("apple") == 8566208034543834098 > assert hash_string("apple") == 8566208034543834098
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------ | ------------------- | | ----------- | --------------------------- |
| `string` | str | The string to hash. | | `string` | The string to hash. ~~str~~ |
| **RETURNS** | uint64 | The hash. | | **RETURNS** | The hash. ~~int~~ |
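The same hash function is used internally by the `StringStore` as its key, so the two should agree. A small sketch, reusing the value from the example above:

```python
from spacy.strings import StringStore, hash_string

stringstore = StringStore()
assert stringstore.add("apple") == hash_string("apple") == 8566208034543834098
```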

View File

@ -28,10 +28,10 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tagger", config=config) > nlp.add_pipe("tagger", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- | | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `set_morphology` | bool | Whether to set morphological features. | `False` | | `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) | | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
@ -58,13 +58,13 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#add_pipe). [`nlp.add_pipe`](/api/language#add_pipe).
| Name | Type | Description | | Name | Description |
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | | `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `set_morphology` | bool | Whether to set morphological features. | | `set_morphology` | Whether to set morphological features. ~~bool~~ |
## Tagger.\_\_call\_\_ {#call tag="method"} ## Tagger.\_\_call\_\_ {#call tag="method"}
@ -84,10 +84,10 @@ and all pipeline components are applied to the `Doc` in order. Both
> processed = tagger(doc) > processed = tagger(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## Tagger.pipe {#pipe tag="method"} ## Tagger.pipe {#pipe tag="method"}
@ -105,12 +105,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------ | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | Processed documents in the order of the original text. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tagger.begin_training {#begin_training tag="method"} ## Tagger.begin_training {#begin_training tag="method"}
@ -130,13 +130,13 @@ setting up the label scheme based on the data.
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Tagger.predict {#predict tag="method"} ## Tagger.predict {#predict tag="method"}
@ -150,10 +150,10 @@ modifying them.
> scores = tagger.predict([doc1, doc2]) > scores = tagger.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. |
## Tagger.set_annotations {#set_annotations tag="method"} ## Tagger.set_annotations {#set_annotations tag="method"}
@ -167,10 +167,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> tagger.set_annotations([doc1, doc2], scores) > tagger.set_annotations([doc1, doc2], scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ------------------------------------------------ | | -------- | ------------------------------------------------ |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `Tagger.predict`. | | `scores` | The scores to set, produced by `Tagger.predict`. |
## Tagger.update {#update tag="method"} ## Tagger.update {#update tag="method"}
@ -187,15 +187,15 @@ Delegates to [`predict`](/api/tagger#predict) and
> losses = tagger.update(examples, sgd=optimizer) > losses = tagger.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
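A minimal end-to-end training sketch for the tagger on toy data. The import path for `Example` differs between nightly versions (`spacy.gold` in earlier ones, `spacy.training` later), and the tags and example sentence are illustrative assumptions:

```python
import spacy
from spacy.gold import Example  # in later nightlies: from spacy.training import Example

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
for tag in ("PRON", "VERB", "NOUN"):
    tagger.add_label(tag)
train_examples = [
    Example.from_dict(nlp.make_doc("I like cats"), {"tags": ["PRON", "VERB", "NOUN"]})
]
optimizer = tagger.begin_training(lambda: train_examples, pipeline=nlp.pipeline)
losses = {}
for i in range(20):
    # losses["tagger"] should decrease over the iterations
    losses = tagger.update(train_examples, drop=0.0, sgd=optimizer, losses=losses)
```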
## Tagger.rehearse {#rehearse tag="method,experimental" new="3"} ## Tagger.rehearse {#rehearse tag="method,experimental" new="3"}
@ -211,14 +211,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
> losses = tagger.rehearse(examples, sgd=optimizer) > losses = tagger.rehearse(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Tagger.get_loss {#get_loss tag="method"} ## Tagger.get_loss {#get_loss tag="method"}
@ -233,11 +233,11 @@ predicted scores.
> loss, d_loss = tagger.get_loss(examples, scores) > loss, d_loss = tagger.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | - | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## Tagger.score {#score tag="method" new="3"} ## Tagger.score {#score tag="method" new="3"}
@ -249,10 +249,10 @@ Score a batch of examples.
> scores = tagger.score(examples) > scores = tagger.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. | | **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ |
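Continuing the training sketch above: apply the component to the predicted side of each example first, then pass the examples to `score`. The `tag_acc` key is produced by `Scorer.score_token_attr`, as noted in the table.

```python
for example in train_examples:
    tagger(example.predicted)
scores = tagger.score(train_examples)
print(scores["tag_acc"])
```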
## Tagger.create_optimizer {#create_optimizer tag="method"} ## Tagger.create_optimizer {#create_optimizer tag="method"}
@ -265,9 +265,9 @@ Create an optimizer for the pipeline component.
> optimizer = tagger.create_optimizer() > optimizer = tagger.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Tagger.use_params {#use_params tag="method, contextmanager"} ## Tagger.use_params {#use_params tag="method, contextmanager"}
@ -282,9 +282,9 @@ context, the original parameters are restored.
> tagger.to_disk("/best_model") > tagger.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## Tagger.add_label {#add_label tag="method"} ## Tagger.add_label {#add_label tag="method"}
@ -297,10 +297,10 @@ Add a new label to the pipe.
> tagger.add_label("MY_LABEL") > tagger.add_label("MY_LABEL")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------------- | | ----------- | ----------------------------------------------------------- |
| `label` | str | The label to add. | | `label` | The label to add. ~~str~~ |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. | | **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
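The return value makes it easy to check whether a label was actually new. A small sketch:

```python
import spacy

nlp = spacy.blank("en")
tagger = nlp.add_pipe("tagger")
assert tagger.add_label("NOUN") == 1   # newly added
assert tagger.add_label("NOUN") == 0   # already present
assert "NOUN" in tagger.labels
```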
## Tagger.to_disk {#to_disk tag="method"} ## Tagger.to_disk {#to_disk tag="method"}
@ -313,11 +313,11 @@ Serialize the pipe to disk.
> tagger.to_disk("/path/to/tagger") > tagger.to_disk("/path/to/tagger")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Tagger.from_disk {#from_disk tag="method"} ## Tagger.from_disk {#from_disk tag="method"}
@ -330,12 +330,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> tagger.from_disk("/path/to/tagger") > tagger.from_disk("/path/to/tagger")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tagger` | The modified `Tagger` object. | | **RETURNS** | The modified `Tagger` object. ~~Tagger~~ |
## Tagger.to_bytes {#to_bytes tag="method"} ## Tagger.to_bytes {#to_bytes tag="method"}
@ -348,11 +348,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Tagger` object. | | **RETURNS** | The serialized form of the `Tagger` object. ~~bytes~~ |
## Tagger.from_bytes {#from_bytes tag="method"} ## Tagger.from_bytes {#from_bytes tag="method"}
@ -366,12 +366,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> tagger.from_bytes(tagger_bytes) > tagger.from_bytes(tagger_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tagger` | The `Tagger` object. | | **RETURNS** | The `Tagger` object. ~~Tagger~~ |
## Tagger.labels {#labels tag="property"} ## Tagger.labels {#labels tag="property"}
@ -384,9 +384,9 @@ The labels currently added to the component.
> assert "MY_LABEL" in tagger.labels > assert "MY_LABEL" in tagger.labels
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | ---------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | `Tuple[str]` | The labels added to the component. | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -35,10 +35,10 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("textcat", config=config) > nlp.add_pipe("textcat", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- | | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` | | `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) | | `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
@ -65,13 +65,13 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe). [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Type | Description | | Name | Description |
| -------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | -------------- | -------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `labels` | `Iterable[str]` | The labels to use. | | `labels` | The labels to use. ~~Iterable[str]~~ |
## TextCategorizer.\_\_call\_\_ {#call tag="method"} ## TextCategorizer.\_\_call\_\_ {#call tag="method"}
@ -91,10 +91,10 @@ delegate to the [`predict`](/api/textcategorizer#predict) and
> processed = textcat(doc) > processed = textcat(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## TextCategorizer.pipe {#pipe tag="method"} ## TextCategorizer.pipe {#pipe tag="method"}
@ -113,12 +113,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | The processed documents in order. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## TextCategorizer.begin_training {#begin_training tag="method"} ## TextCategorizer.begin_training {#begin_training tag="method"}
@ -138,13 +138,13 @@ setting up the label scheme based on the data.
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## TextCategorizer.predict {#predict tag="method"} ## TextCategorizer.predict {#predict tag="method"}
@ -158,10 +158,10 @@ modifying them.
> scores = textcat.predict([doc1, doc2]) > scores = textcat.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. |
## TextCategorizer.set_annotations {#set_annotations tag="method"} ## TextCategorizer.set_annotations {#set_annotations tag="method"}
@ -175,10 +175,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> textcat.set_annotations(docs, scores) > textcat.set_annotations(docs, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | --------------------------------------------------------- | | -------- | --------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. | | `scores` | The scores to set, produced by `TextCategorizer.predict`. |
## TextCategorizer.update {#update tag="method"} ## TextCategorizer.update {#update tag="method"}
@ -195,15 +195,15 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
> losses = textcat.update(examples, sgd=optimizer) > losses = textcat.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
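A condensed update sketch for the text classifier, analogous to the tagger example earlier. The categories, toy texts and `Example` import path are assumptions:

```python
import spacy
from spacy.gold import Example  # in later nightlies: from spacy.training import Example

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
train_examples = [
    Example.from_dict(nlp.make_doc("I love this"), {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    Example.from_dict(nlp.make_doc("I hate this"), {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]
optimizer = textcat.begin_training(lambda: train_examples, pipeline=nlp.pipeline)
losses = textcat.update(train_examples, sgd=optimizer, losses={})
```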
## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"} ## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
@ -219,14 +219,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
> losses = textcat.rehearse(examples, sgd=optimizer) > losses = textcat.rehearse(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------ |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## TextCategorizer.get_loss {#get_loss tag="method"} ## TextCategorizer.get_loss {#get_loss tag="method"}
@ -241,11 +241,11 @@ predicted scores.
> loss, d_loss = textcat.get_loss(examples, scores) > loss, d_loss = textcat.get_loss(examples, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------- | --------------------------------------------------- | | ----------- | --------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The batch of examples. | | `examples` | The batch of examples. ~~Iterable[Example]~~ |
| `scores` | - | Scores representing the model's predictions. | | `scores` | Scores representing the model's predictions. |
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. | | **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
## TextCategorizer.score {#score tag="method" new="3"} ## TextCategorizer.score {#score tag="method" new="3"}
@ -257,12 +257,12 @@ Score a batch of examples.
> scores = textcat.score(examples) > scores = textcat.score(examples)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------------- | ------------------- | ---------------------------------------------------------------------- | | ---------------- | -------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | The examples to score. | | `examples` | The examples to score. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `positive_label` | str | Optional positive label. | | `positive_label` | Optional positive label. ~~Optional[str]~~ |
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). | | **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
## TextCategorizer.create_optimizer {#create_optimizer tag="method"} ## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
@ -275,25 +275,9 @@ Create an optimizer for the pipeline component.
> optimizer = textcat.create_optimizer() > optimizer = textcat.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## TextCategorizer.add_label {#add_label tag="method"}
Add a new label to the pipe.
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
> textcat.add_label("MY_LABEL")
> ```
| Name | Type | Description |
| ----------- | ---- | --------------------------------------------------- |
| `label` | str | The label to add. |
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
## TextCategorizer.use_params {#use_params tag="method, contextmanager"} ## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
@ -307,9 +291,25 @@ Modify the pipe's model, to use the given parameter values.
> textcat.to_disk("/best_model") > textcat.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## TextCategorizer.add_label {#add_label tag="method"}
Add a new label to the pipe.
> #### Example
>
> ```python
> textcat = nlp.add_pipe("textcat")
> textcat.add_label("MY_LABEL")
> ```
| Name | Description |
| ----------- | ----------------------------------------------------------- |
| `label` | The label to add. ~~str~~ |
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
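Labels can also be provided up front via the `labels` setting shown in the config table at the top of this page. A sketch, assuming the config override is supported by your nightly version:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat", config={"labels": ["POSITIVE"]})
textcat.add_label("NEGATIVE")
assert textcat.labels == ("POSITIVE", "NEGATIVE")
```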
## TextCategorizer.to_disk {#to_disk tag="method"} ## TextCategorizer.to_disk {#to_disk tag="method"}
@ -322,11 +322,11 @@ Serialize the pipe to disk.
> textcat.to_disk("/path/to/textcat") > textcat.to_disk("/path/to/textcat")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## TextCategorizer.from_disk {#from_disk tag="method"} ## TextCategorizer.from_disk {#from_disk tag="method"}
@ -339,12 +339,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> textcat.from_disk("/path/to/textcat") > textcat.from_disk("/path/to/textcat")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ----------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | | **RETURNS** | The modified `TextCategorizer` object. ~~TextCategorizer~~ |
## TextCategorizer.to_bytes {#to_bytes tag="method"} ## TextCategorizer.to_bytes {#to_bytes tag="method"}
@ -357,11 +357,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. | | **RETURNS** | The serialized form of the `TextCategorizer` object. ~~bytes~~ |
## TextCategorizer.from_bytes {#from_bytes tag="method"} ## TextCategorizer.from_bytes {#from_bytes tag="method"}
@ -375,12 +375,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> textcat.from_bytes(textcat_bytes) > textcat.from_bytes(textcat_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ----------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. | | **RETURNS** | The `TextCategorizer` object. ~~TextCategorizer~~ |
## TextCategorizer.labels {#labels tag="property"} ## TextCategorizer.labels {#labels tag="property"}
@ -393,9 +393,9 @@ The labels currently added to the component.
> assert "MY_LABEL" in textcat.labels > assert "MY_LABEL" in textcat.labels
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ---------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | tuple | The labels added to the component. | | **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -40,9 +40,9 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("tok2vec", config=config) > nlp.add_pipe("tok2vec", config=config)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | | ------- | ------------------------------------------------------------------------------------------------------------------ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) | | `model` | The model to use. Defaults to [HashEmbedCNN](/api/architectures#HashEmbedCNN). ~~Model[List[Doc], List[Floats2d]]~~ |
```python ```python
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
@ -69,11 +69,11 @@ Create a new pipeline instance. In your application, you would normally use a
shortcut for this and instantiate the component using its string name and shortcut for this and instantiate the component using its string name and
[`nlp.add_pipe`](/api/language#create_pipe). [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Type | Description | | Name | Description |
| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- | | ------- | ------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
## Tok2Vec.\_\_call\_\_ {#call tag="method"} ## Tok2Vec.\_\_call\_\_ {#call tag="method"}
@ -95,10 +95,10 @@ pipeline components are applied to the `Doc` in order. Both
> processed = tok2vec(doc) > processed = tok2vec(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
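A sketch of the observable effect: after a trained `Tok2Vec` component has run over a `Doc`, the per-token vectors are available on `doc.tensor` (this assumes `nlp` is a trained pipeline that includes a `tok2vec` component):

```python
doc = nlp("This is a sentence.")
print(doc.tensor.shape)  # one row per token
```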
## Tok2Vec.pipe {#pipe tag="method"} ## Tok2Vec.pipe {#pipe tag="method"}
@ -116,12 +116,12 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | The processed documents in order. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Tok2Vec.begin_training {#begin_training tag="method"} ## Tok2Vec.begin_training {#begin_training tag="method"}
@ -141,13 +141,13 @@ setting up the label scheme based on the data.
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tok2vec#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Tok2Vec.predict {#predict tag="method"} ## Tok2Vec.predict {#predict tag="method"}
@ -161,10 +161,10 @@ modifying them.
> scores = tok2vec.predict([doc1, doc2]) > scores = tok2vec.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. |
## Tok2Vec.set_annotations {#set_annotations tag="method"} ## Tok2Vec.set_annotations {#set_annotations tag="method"}
@ -178,10 +178,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
> tok2vec.set_annotations(docs, scores) > tok2vec.set_annotations(docs, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ------------------------------------------------- | | -------- | ------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `Tok2Vec.predict`. | | `scores` | The scores to set, produced by `Tok2Vec.predict`. |
## Tok2Vec.update {#update tag="method"} ## Tok2Vec.update {#update tag="method"}
@ -197,15 +197,15 @@ Delegates to [`predict`](/api/tok2vec#predict).
> losses = tok2vec.update(examples, sgd=optimizer) > losses = tok2vec.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. | | `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tok2vec#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Tok2Vec.create_optimizer {#create_optimizer tag="method"} ## Tok2Vec.create_optimizer {#create_optimizer tag="method"}
@ -218,9 +218,9 @@ Create an optimizer for the pipeline component.
> optimizer = tok2vec.create_optimizer() > optimizer = tok2vec.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Tok2Vec.use_params {#use_params tag="method, contextmanager"} ## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
@ -235,9 +235,9 @@ context, the original parameters are restored.
> tok2vec.to_disk("/best_model") > tok2vec.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## Tok2Vec.to_disk {#to_disk tag="method"} ## Tok2Vec.to_disk {#to_disk tag="method"}
@ -250,11 +250,11 @@ Serialize the pipe to disk.
> tok2vec.to_disk("/path/to/tok2vec") > tok2vec.to_disk("/path/to/tok2vec")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Tok2Vec.from_disk {#from_disk tag="method"} ## Tok2Vec.from_disk {#from_disk tag="method"}
@ -267,12 +267,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> tok2vec.from_disk("/path/to/tok2vec") > tok2vec.from_disk("/path/to/tok2vec")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | | **RETURNS** | The modified `Tok2Vec` object. ~~Tok2Vec~~ |
## Tok2Vec.to_bytes {#to_bytes tag="method"} ## Tok2Vec.to_bytes {#to_bytes tag="method"}
@ -285,11 +285,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | | **RETURNS** | The serialized form of the `Tok2Vec` object. ~~bytes~~ |
## Tok2Vec.from_bytes {#from_bytes tag="method"} ## Tok2Vec.from_bytes {#from_bytes tag="method"}
@ -303,12 +303,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> tok2vec.from_bytes(tok2vec_bytes) > tok2vec.from_bytes(tok2vec_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | | **RETURNS** | The `Tok2Vec` object. ~~Tok2Vec~~ |
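As a quick sketch of how the byte-based methods pair up, assuming `nlp` already has a `"tok2vec"` component:

```python
# Round-trip the component through a bytestring (illustrative only)
tok2vec = nlp.get_pipe("tok2vec")
tok2vec_bytes = tok2vec.to_bytes()   # serialize the pipe, including its model weights
tok2vec.from_bytes(tok2vec_bytes)    # restore the same state in place
```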
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -17,11 +17,11 @@ Construct a `Token` object.
> assert token.text == "Give" > assert token.text == "Give"
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ------- | ------------------------------------------- | | -------- | --------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
| `offset` | int | The index of the token within the document. | | `offset` | The index of the token within the document. ~~int~~ |
## Token.\_\_len\_\_ {#len tag="method"} ## Token.\_\_len\_\_ {#len tag="method"}
@ -35,9 +35,9 @@ The number of unicode characters in the token, i.e. `token.text`.
> assert len(token) == 4 > assert len(token) == 4
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | int | The number of unicode characters in the token. | | **RETURNS** | The number of unicode characters in the token. ~~int~~ |
## Token.set_extension {#set_extension tag="classmethod" new="2"} ## Token.set_extension {#set_extension tag="classmethod" new="2"}
@ -55,14 +55,14 @@ For details, see the documentation on
> assert doc[3]._.is_fruit > assert doc[3]._.is_fruit
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- | | --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. | | `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. ~~str~~ |
| `default` | - | Optional default value of the attribute if no getter or method is defined. | | `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ |
| `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. | | `method` | Set a custom method on the object, for example `token._.compare(other_token)`. ~~Optional[Callable[[Token, ...], Any]]~~ |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | | `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Token], Any]]~~ |
| `setter` | callable | Setter function that takes the `Token` and a value, and modifies the object. Is called when the user writes to the `Token._` attribute. | | `setter` | Setter function that takes the `Token` and a value, and modifies the object. Is called when the user writes to the `Token._` attribute. ~~Optional[Callable[[Token, Any], None]]~~ |
| `force` | bool | Force overwriting existing attribute. | | `force` | Force overwriting existing attribute. ~~bool~~ |
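A minimal sketch of registering a getter-based extension; the attribute name `is_fruit` and the word list are purely illustrative:

```python
import spacy
from spacy.tokens import Token

# Hypothetical extension: flag tokens whose text is in a small word list
FRUITS = ("apple", "apples", "orange", "oranges")
Token.set_extension("is_fruit", getter=lambda token: token.text.lower() in FRUITS)

nlp = spacy.blank("en")
doc = nlp("I like apples and oranges")
print([(token.text, token._.is_fruit) for token in doc])
```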
## Token.get_extension {#get_extension tag="classmethod" new="2"} ## Token.get_extension {#get_extension tag="classmethod" new="2"}
@ -79,10 +79,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None) > assert extension == (False, None, None, None)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the extension. | | `name` | Name of the extension. ~~str~~ |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | | **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Token.has_extension {#has_extension tag="classmethod" new="2"} ## Token.has_extension {#has_extension tag="classmethod" new="2"}
@ -96,10 +96,10 @@ Check whether an extension has been registered on the `Token` class.
> assert Token.has_extension("is_fruit") > assert Token.has_extension("is_fruit")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------ | | ----------- | --------------------------------------------------- |
| `name` | str | Name of the extension to check. | | `name` | Name of the extension to check. ~~str~~ |
| **RETURNS** | bool | Whether the extension has been registered. | | **RETURNS** | Whether the extension has been registered. ~~bool~~ |
## Token.remove_extension {#remove_extension tag="classmethod" new="2.0.11"} ## Token.remove_extension {#remove_extension tag="classmethod" new="2.0.11"}
@ -114,10 +114,10 @@ Remove a previously registered extension.
> assert not Token.has_extension("is_fruit") > assert not Token.has_extension("is_fruit")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Name of the extension. | | `name` | Name of the extension. ~~str~~ |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | | **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
## Token.check_flag {#check_flag tag="method"} ## Token.check_flag {#check_flag tag="method"}
@ -132,10 +132,10 @@ Check the value of a boolean flag.
> assert token.check_flag(IS_TITLE) == True > assert token.check_flag(IS_TITLE) == True
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------- | | ----------- | ---------------------------------------------- |
| `flag_id` | int | The attribute ID of the flag to check. | | `flag_id` | The attribute ID of the flag to check. ~~int~~ |
| **RETURNS** | bool | Whether the flag is set. | | **RETURNS** | Whether the flag is set. ~~bool~~ |
## Token.similarity {#similarity tag="method" model="vectors"} ## Token.similarity {#similarity tag="method" model="vectors"}
@ -150,10 +150,10 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
> assert apples_oranges == oranges_apples > assert apples_oranges == oranges_apples
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. | | `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
| **RETURNS** | float | A scalar similarity score. Higher is more similar. | | **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
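A short sketch, assuming a pipeline with word vectors (for example `en_core_web_md`) is installed:

```python
import spacy

nlp = spacy.load("en_core_web_md")  # assumption: a model with vectors is available
doc = nlp("apples and oranges")
apples, oranges = doc[0], doc[2]
print(apples.similarity(oranges))   # a float; higher means more similar
print(apples.similarity(doc))       # Doc, Span and Lexeme objects are also accepted
```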
## Token.nbor {#nbor tag="method"} ## Token.nbor {#nbor tag="method"}
@ -167,10 +167,10 @@ Get a neighboring token.
> assert give_nbor.text == "it" > assert give_nbor.text == "it"
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | ----------------------------------------------------------- | | ----------- | ------------------------------------------------------------------- |
| `i` | int | The relative position of the token to get. Defaults to `1`. | | `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ |
| **RETURNS** | `Token` | The token at position `self.doc[self.i+i]`. | | **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ |
## Token.is_ancestor {#is_ancestor tag="method" model="parser"} ## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
@ -186,10 +186,10 @@ dependency tree.
> assert give.is_ancestor(it) > assert give.is_ancestor(it)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | ----------------------------------------------------- | | ----------- | -------------------------------------------------------------- |
| `descendant` | `Token` | Another token. | | `descendant` | Another token. ~~Token~~ |
| **RETURNS** | bool | Whether this token is the ancestor of the descendant. | | **RETURNS** | Whether this token is the ancestor of the descendant. ~~bool~~ |
## Token.ancestors {#ancestors tag="property" model="parser"} ## Token.ancestors {#ancestors tag="property" model="parser"}
@ -205,9 +205,9 @@ The rightmost token of this token's syntactic descendants.
> assert [t.text for t in he_ancestors] == ["pleaded"] > assert [t.text for t in he_ancestors] == ["pleaded"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | --------------------------------------------------------------------- | | ---------- | ------------------------------------------------------------------------------- |
| **YIELDS** | `Token` | A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. | | **YIELDS** | A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. ~~Token~~ |
## Token.conjuncts {#conjuncts tag="property" model="parser"} ## Token.conjuncts {#conjuncts tag="property" model="parser"}
@ -221,9 +221,9 @@ A tuple of coordinated tokens, not including the token itself.
> assert [t.text for t in apples_conjuncts] == ["oranges"] > assert [t.text for t in apples_conjuncts] == ["oranges"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------- | ----------------------- | | ----------- | --------------------------------------------- |
| **RETURNS** | `tuple` | The coordinated tokens. | | **RETURNS** | The coordinated tokens. ~~Tuple[Token, ...]~~ |
## Token.children {#children tag="property" model="parser"} ## Token.children {#children tag="property" model="parser"}
@ -237,9 +237,9 @@ A sequence of the token's immediate syntactic children.
> assert [t.text for t in give_children] == ["it", "back", "!"] > assert [t.text for t in give_children] == ["it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | ------------------------------------------- | | ---------- | ------------------------------------------------------- |
| **YIELDS** | `Token` | A child token such that `child.head==self`. | | **YIELDS** | A child token such that `child.head == self`. ~~Token~~ |
## Token.lefts {#lefts tag="property" model="parser"} ## Token.lefts {#lefts tag="property" model="parser"}
@ -253,9 +253,9 @@ The leftward immediate children of the word, in the syntactic dependency parse.
> assert lefts == ["New"] > assert lefts == ["New"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | -------------------------- | | ---------- | ------------------------------------ |
| **YIELDS** | `Token` | A left-child of the token. | | **YIELDS** | A left-child of the token. ~~Token~~ |
## Token.rights {#rights tag="property" model="parser"} ## Token.rights {#rights tag="property" model="parser"}
@ -269,9 +269,9 @@ The rightward immediate children of the word, in the syntactic dependency parse.
> assert rights == ["in"] > assert rights == ["in"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | --------------------------- | | ---------- | ------------------------------------- |
| **YIELDS** | `Token` | A right-child of the token. | | **YIELDS** | A right-child of the token. ~~Token~~ |
## Token.n_lefts {#n_lefts tag="property" model="parser"} ## Token.n_lefts {#n_lefts tag="property" model="parser"}
@ -285,9 +285,9 @@ dependency parse.
> assert doc[3].n_lefts == 1 > assert doc[3].n_lefts == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------- | | ----------- | ---------------------------------------- |
| **RETURNS** | int | The number of left-child tokens. | | **RETURNS** | The number of left-child tokens. ~~int~~ |
## Token.n_rights {#n_rights tag="property" model="parser"} ## Token.n_rights {#n_rights tag="property" model="parser"}
@ -301,9 +301,9 @@ dependency parse.
> assert doc[3].n_rights == 1 > assert doc[3].n_rights == 1
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------- | | ----------- | ----------------------------------------- |
| **RETURNS** | int | The number of right-child tokens. | | **RETURNS** | The number of right-child tokens. ~~int~~ |
## Token.subtree {#subtree tag="property" model="parser"} ## Token.subtree {#subtree tag="property" model="parser"}
@ -317,9 +317,9 @@ A sequence containing the token and all the token's syntactic descendants.
> assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"] > assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ------- | -------------------------------------------------------------------------- | | ---------- | ------------------------------------------------------------------------------------ |
| **YIELDS** | `Token` | A descendant token such that `self.is_ancestor(token)` or `token == self`. | | **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
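The navigation properties above are easiest to see side by side. A minimal sketch, assuming a pipeline with a dependency parser such as `en_core_web_sm` is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: a parser-equipped pipeline
doc = nlp("Give it back! He pleaded.")
give = doc[0]
print([t.text for t in give.children])     # immediate syntactic children
print([t.text for t in give.subtree])      # the token plus all of its descendants
print(give.n_lefts, give.n_rights)         # counts of left and right children
print([t.text for t in doc[1].ancestors])  # heads above "it", up to the root
```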
## Token.is_sent_start {#is_sent_start tag="property" new="2"} ## Token.is_sent_start {#is_sent_start tag="property" new="2"}
@ -334,9 +334,9 @@ unknown. Defaults to `True` for the first token in the `Doc`.
> assert not doc[5].is_sent_start > assert not doc[5].is_sent_start
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------ | | ----------- | --------------------------------------------- |
| **RETURNS** | bool | Whether the token starts a sentence. | | **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
## Token.has_vector {#has_vector tag="property" model="vectors"} ## Token.has_vector {#has_vector tag="property" model="vectors"}
@ -350,9 +350,9 @@ A boolean value indicating whether a word vector is associated with the token.
> assert apples.has_vector > assert apples.has_vector
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | bool | Whether the token has a vector data attached. | | **RETURNS** | Whether the token has a vector data attached. ~~bool~~ |
## Token.vector {#vector tag="property" model="vectors"} ## Token.vector {#vector tag="property" model="vectors"}
@ -367,9 +367,9 @@ A real-valued meaning representation.
> assert apples.vector.shape == (300,) > assert apples.vector.shape == (300,)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------------------------- | ---------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the token's semantics. | | **RETURNS** | A 1-dimensional array representing the token's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Token.vector_norm {#vector_norm tag="property" model="vectors"} ## Token.vector_norm {#vector_norm tag="property" model="vectors"}
@ -386,80 +386,80 @@ The L2 norm of the token's vector representation.
> assert apples.vector_norm != pasta.vector_norm > assert apples.vector_norm != pasta.vector_norm
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ----------------------------------------- | | ----------- | --------------------------------------------------- |
| **RETURNS** | float | The L2 norm of the vector representation. | | **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The parent document. | | `doc` | The parent document. ~~Doc~~ |
| `lex` <Tag variant="new">3</Tag> | [`Lexeme`](/api/lexeme) | The underlying lexeme. | | `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
| `sent` <Tag variant="new">2.0.12</Tag> | [`Span`](/api/span) | The sentence span that this token is a part of. | | `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |
| `text` | str | Verbatim text content. | | `text` | Verbatim text content. ~~str~~ |
| `text_with_ws` | str | Text content, with trailing space character if present. | | `text_with_ws` | Text content, with trailing space character if present. ~~str~~ |
| `whitespace_` | str | Trailing space character if present. | | `whitespace_` | Trailing space character if present. ~~str~~ |
| `orth` | int | ID of the verbatim text content. | | `orth` | ID of the verbatim text content. ~~int~~ |
| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. | | `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | | `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ |
| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The token's slice of the parent `Doc`'s tensor. | | `tensor` <Tag variant="new">2.1.7</Tag> | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
| `head` | `Token` | The syntactic parent, or "governor", of this token. | | `head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. | | `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ |
| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. | | `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ |
| `i` | int | The index of the token within the parent document. | | `i` | The index of the token within the parent document. ~~int~~ |
| `ent_type` | int | Named entity type. | | `ent_type` | Named entity type. ~~int~~ |
| `ent_type_` | str | Named entity type. | | `ent_type_` | Named entity type. ~~str~~ |
| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. | | `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ |
| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. | | `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | str | Knowledge base ID that refers to the named entity this token is a part of, if any. | | `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | | `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. | | `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
| `lemma` | int | Base form of the token, with no inflectional suffixes. | | `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
| `lemma_` | str | Base form of the token, with no inflectional suffixes. | | `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | | `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~ |
| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). | | `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
| `lower` | int | Lowercase form of the token. | | `lower` | Lowercase form of the token. ~~int~~ |
| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. | | `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
| `shape` | int | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `shape` | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
| `shape_` | str | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `shape_` | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. | | `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. | | `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. | | `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. | | `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. | | `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ |
| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. | | `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ |
| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. | | `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ |
| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. | | `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ |
| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. | | `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ |
| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. | | `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ |
| `is_punct` | bool | Is the token punctuation? | | `is_punct` | Is the token punctuation? ~~bool~~ |
| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? | | `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? | | `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. | | `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
| `is_bracket` | bool | Is the token a bracket? | | `is_bracket` | Is the token a bracket? ~~bool~~ |
| `is_quote` | bool | Is the token a quotation mark? | | `is_quote` | Is the token a quotation mark? ~~bool~~ |
| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the token a currency symbol? | | `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~ |
| `like_url` | bool | Does the token resemble a URL? | | `like_url` | Does the token resemble a URL? ~~bool~~ |
| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. | | `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
| `like_email` | bool | Does the token resemble an email address? | | `like_email` | Does the token resemble an email address? ~~bool~~ |
| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector)? | | `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
| `is_stop` | bool | Is the token part of a "stop list"? | | `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). | | `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
| `tag` | int | Fine-grained part-of-speech. | | `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | str | Fine-grained part-of-speech. | | `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` | `MorphAnalysis` | Morphological analysis. | | `morph` | Morphological analysis. ~~MorphAnalysis~~ |
| `morph_` | str | Morphological analysis in UD FEATS format. | | `morph_` | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
| `dep` | int | Syntactic dependency relation. | | `dep` | Syntactic dependency relation. ~~int~~ |
| `dep_` | str | Syntactic dependency relation. | | `dep_` | Syntactic dependency relation. ~~str~~ |
| `lang` | int | Language of the parent document's vocabulary. | | `lang` | Language of the parent document's vocabulary. ~~int~~ |
| `lang_` | str | Language of the parent document's vocabulary. | | `lang_` | Language of the parent document's vocabulary. ~~str~~ |
| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). | | `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
| `idx` | int | The character offset of the token within the parent document. | | `idx` | The character offset of the token within the parent document. ~~int~~ |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. | | `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | | `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. | | `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
| `cluster` | int | Brown cluster ID. | | `cluster` | Brown cluster ID. ~~int~~ |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | | `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
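Most of these attributes are plain properties, so inspecting them is a matter of iterating over a processed `Doc`. A small sketch, assuming `en_core_web_sm` is installed for the tagger and parser attributes:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumption: a small English pipeline
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
          token.shape_, token.is_alpha, token.is_stop)
```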

View File

@ -45,15 +45,15 @@ the
> tokenizer = nlp.tokenizer > tokenizer = nlp.tokenizer
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ | | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | A storage container for lexical types. ~~Vocab~~ |
| `rules` | dict | Exceptions and special-cases for the tokenizer. | | `rules` | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `prefix_search` | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. | | `suffix_search` | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. | | `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. | | `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. | | `url_match` | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
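Putting the constructor arguments together, here is a minimal sketch of a custom tokenizer; the regular expressions and the special-case rule are purely illustrative and much simpler than what a real language class defines:

```python
import re
import spacy
from spacy.attrs import ORTH
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
prefix_re = re.compile(r"""^[\[\("']""")   # strip opening brackets/quotes
suffix_re = re.compile(r"""[\]\)"']$""")   # strip closing brackets/quotes
infix_re = re.compile(r"""[-~]""")         # split on hyphens and tildes

tokenizer = Tokenizer(
    nlp.vocab,
    rules={"don't": [{ORTH: "do"}, {ORTH: "n't"}]},
    prefix_search=prefix_re.search,
    suffix_search=suffix_re.search,
    infix_finditer=infix_re.finditer,
)
print([t.text for t in tokenizer("(hello-world) don't")])
```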
## Tokenizer.\_\_call\_\_ {#call tag="method"} ## Tokenizer.\_\_call\_\_ {#call tag="method"}
@ -66,10 +66,10 @@ Tokenize a string.
> assert len(tokens) == 4 > assert len(tokens) == 4
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | --------------------------------------- | | ----------- | ----------------------------------------------- |
| `string` | str | The string to tokenize. | | `string` | The string to tokenize. ~~str~~ |
| **RETURNS** | `Doc` | A container for linguistic annotations. | | **RETURNS** | A container for linguistic annotations. ~~Doc~~ |
## Tokenizer.pipe {#pipe tag="method"} ## Tokenizer.pipe {#pipe tag="method"}
@ -83,40 +83,40 @@ Tokenize a stream of texts.
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ----- | ---------------------------------------------------------------------------- | | ------------ | ------------------------------------------------------------------------------------ |
| `texts` | - | A sequence of unicode texts. | | `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
| `batch_size` | int | The number of texts to accumulate in an internal buffer. Defaults to `1000`. | | `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
| **YIELDS** | `Doc` | A sequence of Doc objects, in order. | | **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ |
## Tokenizer.find_infix {#find_infix tag="method"} ## Tokenizer.find_infix {#find_infix tag="method"}
Find internal split points of the string. Find internal split points of the string.
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `string` | str | The string to split. | | `string` | The string to split. ~~str~~ |
| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | | **RETURNS** | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. ~~List[Match]~~ |
## Tokenizer.find_prefix {#find_prefix tag="method"} ## Tokenizer.find_prefix {#find_prefix tag="method"}
Find the length of a prefix that should be segmented from the string, or `None` Find the length of a prefix that should be segmented from the string, or `None`
if no prefix rules match. if no prefix rules match.
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------ |
| `string` | str | The string to segment. | | `string` | The string to segment. ~~str~~ |
| **RETURNS** | int | The length of the prefix if present, otherwise `None`. | | **RETURNS** | The length of the prefix if present, otherwise `None`. ~~Optional[int]~~ |
## Tokenizer.find_suffix {#find_suffix tag="method"} ## Tokenizer.find_suffix {#find_suffix tag="method"}
Find the length of a suffix that should be segmented from the string, or `None` Find the length of a suffix that should be segmented from the string, or `None`
if no suffix rules match. if no suffix rules match.
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | ------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------ |
| `string` | str | The string to segment. | | `string` | The string to segment. ~~str~~ |
| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. | | **RETURNS** | The length of the suffix if present, otherwise `None`. ~~Optional[int]~~ |
## Tokenizer.add_special_case {#add_special_case tag="method"} ## Tokenizer.add_special_case {#add_special_case tag="method"}
@ -134,10 +134,10 @@ and examples.
> tokenizer.add_special_case("don't", case) > tokenizer.add_special_case("don't", case)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `string` | str | The string to specially tokenize. | | `string` | The string to specially tokenize. ~~str~~ |
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | | `token_attrs` | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. ~~Iterable[Dict[int, str]]~~ |
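For example, a hypothetical special case that always splits one string into two tokens; note how the `ORTH` values concatenate back to the original string exactly:

```python
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
# Hypothetical rule for illustration only
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])  # the special case applies during tokenization
```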
## Tokenizer.explain {#explain tag="method"} ## Tokenizer.explain {#explain tag="method"}
@ -153,10 +153,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"] > assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | --------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------- |
| `string` | str | The string to tokenize with the debugging tokenizer | | `string` | The string to tokenize with the debugging tokenizer. ~~str~~ |
| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | | **RETURNS** | A list of `(pattern_string, token_string)` tuples. ~~List[Tuple[str, str]]~~ |
## Tokenizer.to_disk {#to_disk tag="method"} ## Tokenizer.to_disk {#to_disk tag="method"}
@ -169,11 +169,11 @@ Serialize the tokenizer to disk.
> tokenizer.to_disk("/path/to/tokenizer") > tokenizer.to_disk("/path/to/tokenizer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Tokenizer.from_disk {#from_disk tag="method"} ## Tokenizer.from_disk {#from_disk tag="method"}
@ -186,12 +186,12 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
> tokenizer.from_disk("/path/to/tokenizer") > tokenizer.from_disk("/path/to/tokenizer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | | **RETURNS** | The modified `Tokenizer` object. ~~Tokenizer~~ |
## Tokenizer.to_bytes {#to_bytes tag="method"} ## Tokenizer.to_bytes {#to_bytes tag="method"}
@ -204,11 +204,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
Serialize the tokenizer to a bytestring. Serialize the tokenizer to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. | | **RETURNS** | The serialized form of the `Tokenizer` object. ~~bytes~~ |
## Tokenizer.from_bytes {#from_bytes tag="method"} ## Tokenizer.from_bytes {#from_bytes tag="method"}
@ -223,23 +223,23 @@ it.
> tokenizer.from_bytes(tokenizer_bytes) > tokenizer.from_bytes(tokenizer_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. | | **RETURNS** | The `Tokenizer` object. ~~Tokenizer~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | | `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ |
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | | `prefix_search` | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | | `suffix_search` | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | | `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an`re.MatchObject`or`None. | | `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | | `rules` | A dictionary of tokenizer exceptions and special cases. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -32,13 +32,13 @@ loaded in via [`Language.from_disk`](/api/language#from_disk).
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"]) > nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- | | ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `name` | str / `Path` | Model to load, i.e. package name or path. | | `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. | | `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` | A `Language` object with the loaded model. | | **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
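A brief sketch of the keyword arguments in practice; the config section and key used in the override are purely illustrative, and the example assumes `en_core_web_sm` is installed:

```python
import spacy

# Skip components that aren't needed for this use case
nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])

# Config overrides can be nested dicts or use dot notation;
# the section and key here are illustrative, not a documented default
nlp = spacy.load("en_core_web_sm", config={"nlp": {"batch_size": 128}})
```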
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
and pipeline components from a model's `meta.json`, initializes the `Language` and pipeline components from a model's `meta.json`, initializes the `Language`
@ -65,10 +65,10 @@ Create a blank model of a given language class. This function is the twin of
> nlp_de = spacy.blank("de") # equivalent to German() > nlp_de = spacy.blank("de") # equivalent to German()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ | | ----------- | -------------------------------------------------------------------------------------------------------- |
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | | `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. | | **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
#### spacy.info {#spacy.info tag="function"} #### spacy.info {#spacy.info tag="function"}
@ -85,12 +85,12 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
> markdown = spacy.info(markdown=True, silent=True) > markdown = spacy.info(markdown=True, silent=True)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---- | ------------------------------------------------ | | -------------- | ------------------------------------------------------------------ |
| `model` | str | A model, i.e. a package name or path (optional). | | `model` | A model, i.e. a package name or path (optional). ~~Optional[str]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `markdown` | bool | Print information as Markdown. | | `markdown` | Print information as Markdown. ~~bool~~ |
| `silent` | bool | Don't print anything, just return. | | `silent` | Don't print anything, just return. ~~bool~~ |
### spacy.explain {#spacy.explain tag="function"} ### spacy.explain {#spacy.explain tag="function"}
@ -111,10 +111,10 @@ list of available terms, see
> # world NN noun, singular or mass > # world NN noun, singular or mass
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------------------- | | ----------- | -------------------------------------------------------------------------- |
| `term` | str | Term to explain. | | `term` | Term to explain. ~~str~~ |
| **RETURNS** | str | The explanation, or `None` if not found in the glossary. | | **RETURNS** | The explanation, or `None` if not found in the glossary. ~~Optional[str]~~ |
### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"} ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
@ -131,9 +131,9 @@ models.
> nlp = spacy.load("en_core_web_sm") > nlp = spacy.load("en_core_web_sm")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------ | | ----------- | --------------------------------------- |
| **RETURNS** | bool | Whether the GPU was activated. | | **RETURNS** | Whether the GPU was activated. ~~bool~~ |
### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"} ### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"}
@ -150,9 +150,9 @@ and _before_ loading any models.
> nlp = spacy.load("en_core_web_sm") > nlp = spacy.load("en_core_web_sm")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------- | | ----------- | --------------- |
| **RETURNS** | bool | `True` | | **RETURNS** | `True` ~~bool~~ |
## displaCy {#displacy source="spacy/displacy"} ## displaCy {#displacy source="spacy/displacy"}
@ -175,16 +175,16 @@ browser. Will run a simple web server.
> displacy.serve([doc1, doc2], style="dep") > displacy.serve([doc1, doc2], style="dep")
> ``` > ```
| Name | Type | Description | Default | | Name | Description |
| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | bool | Render markup as full HTML page. | `True` | | `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | int | Port to serve visualization. | `5000` | | `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | str | Host to serve visualization. | `'0.0.0.0'` | | `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
### displacy.render {#displacy.render tag="method" new="2"} ### displacy.render {#displacy.render tag="method" new="2"}
@ -200,16 +200,16 @@ Render a dependency parse tree or named entity visualization.
> html = displacy.render(doc, style="dep") > html = displacy.render(doc, style="dep")
> ``` > ```
| Name | Type | Description | Default | | Name | Description |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. | | `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
| `page` | bool | Render markup as full HTML page. | `False` | | `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | str | Rendered HTML markup. | | **RETURNS** | The rendered HTML markup. ~~str~~ |
### Visualizer options {#displacy_options} ### Visualizer options {#displacy_options}
@ -225,22 +225,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options) > displacy.serve(doc, style="dep", options=options)
> ``` > ```
| Name | Type | Description | Default | | Name | Description |
| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | | `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemma's in a separate row below the token texts. | `False` | | `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | | `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | | `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` | | `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` | | `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | str | Font name or font family for all text. | `'Arial'` | | `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | | `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
| `arrow_stroke` | int | Width of arrow path in px. | `2` | | `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | | `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) | | `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` | | `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
| `distance` | int | Distance between words in px. | `175` / `150` (compact) | | `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
#### Named Entity Visualizer options {#displacy_options-ent} #### Named Entity Visualizer options {#displacy_options-ent}
@ -252,11 +252,11 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="ent", options=options) > displacy.serve(doc, style="ent", options=options)
> ``` > ```
| Name | Type | Description | Default | | Name | Description |
| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | | --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `ents` | list | Entity types to highlight (`None` for all types). | `None` | | `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ |
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `colors` | Color overrides. Entity types in uppercase should be mapped to color names or values. ~~Dict[str, str]~~ |
| `template` <Tag variant="new">2.2</Tag> | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | | `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
By default, displaCy comes with colors for all entity types used by By default, displaCy comes with colors for all entity types used by
[spaCy models](/models). If you're using custom entity types, you can use the [spaCy models](/models). If you're using custom entity types, you can use the
@ -359,13 +359,13 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
> get_length = null > get_length = null
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `seqs` | `Iterable[Any]` | The sequences to minibatch. | | `seqs` | The sequences to minibatch. ~~Iterable[Any]~~ |
| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | | `size` | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
| `tolerance` | float | What percentage of the size to allow batches to exceed. | | `tolerance` | What percentage of the size to allow batches to exceed. ~~float~~ |
| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. | | `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"} #### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
@ -380,10 +380,10 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
Create a batcher that creates batches of the specified size. Create a batcher that creates batches of the specified size.
| Name | Type | Description | | Name | Description |
| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | | `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
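As a rough illustration of how a registered batcher can be used outside of the config system, the sketch below resolves the function from spaCy's batchers registry and batches a small list of items. The registry table name and the exact call signature are assumptions based on the heading above, not taken from this document.

```python
# Illustrative sketch only: resolve the registered batcher and apply it to a
# small list of items. Assumes the function is registered as
# "batch_by_sequence.v1" in the "batchers" registry, as the heading suggests.
from spacy.util import registry

create_batcher = registry.batchers.get("batch_by_sequence.v1")
batcher = create_batcher(size=3, get_length=None)

items = ["one", "two", "three", "four", "five"]
for batch in batcher(items):
    print(batch)  # lists of at most 3 items each
```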
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"} #### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
@ -403,12 +403,12 @@ sequences binned by length within a window. The padded size is defined as the
maximum length of sequences within the batch multiplied by the number of maximum length of sequences within the batch multiplied by the number of
sequences in the batch. sequences in the batch.
| Name | Type | Description | | Name | Description |
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). | | `size` | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. | | `buffer` | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. | | `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. | | `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
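To make the "padded size" definition above concrete, here is a small worked example; the helper function and the toy data are purely illustrative and not part of spaCy's API.

```python
# The padded size of a batch is the length of its longest sequence multiplied
# by the number of sequences it contains (toy example, not spaCy code).
def padded_size(batch):
    longest = max(len(seq) for seq in batch)
    return longest * len(batch)

batch = [[1, 2, 3, 4], [5, 6], [7]]
assert padded_size(batch) == 12  # 4 tokens * 3 sequences
```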
## Training data and alignment {#gold source="spacy/gold"} ## Training data and alignment {#gold source="spacy/gold"}
@ -436,11 +436,11 @@ single-token entity.
> assert tags == ["O", "O", "U-LOC", "O"] > assert tags == ["O", "O", "U-LOC", "O"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | | `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ |
| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | | `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
| **RETURNS** | list | str strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. | | **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
@ -458,11 +458,11 @@ Encode per-token tags following the
> assert entities == [(7, 13, "LOC")] > assert entities == [(7, 13, "LOC")]
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doc` | `Doc` | The document that the BILUO tags refer to. | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ |
| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. | | **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |
### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"} ### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
@ -481,11 +481,11 @@ token-based tags, e.g. to overwrite the `doc.ents`.
> doc.ents = spans_from_biluo_tags(doc, tags) > doc.ents = spans_from_biluo_tags(doc, tags)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doc` | `Doc` | The document that the BILUO tags refer to. | | `doc` | The document that the BILUO tags refer to. ~~Doc~~ |
| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. | | `entities` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | list | A sequence of `Span` objects with added entity labels. | | **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |
## Utility functions {#util source="spacy/util.py"} ## Utility functions {#util source="spacy/util.py"}
@ -504,7 +504,8 @@ depends on any of spaCy's utilities.
Import and load a `Language` class. Allows lazy-loading Import and load a `Language` class. Allows lazy-loading
[language data](/usage/adding-languages) and importing languages using the [language data](/usage/adding-languages) and importing languages using the
two-letter language code. To add a language code for a custom language class, two-letter language code. To add a language code for a custom language class,
you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper. you can register it using the [`@registry.languages`](/api/top-level#registry)
decorator.
> #### Example > #### Example
> >
@ -514,36 +515,14 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
> lang = lang_class() > lang = lang_class()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---------- | -------------------------------------- | | ----------- | ---------------------------------------------- |
| `lang` | str | Two-letter language code, e.g. `'en'`. | | `lang` | Two-letter language code, e.g. `"en"`. ~~str~~ |
| **RETURNS** | `Language` | Language class. | | **RETURNS** | The respective subclass. ~~Language~~ |
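As a sketch of the registration mentioned above, a custom language class could be made available to `get_lang_class` roughly as follows. The code `"xy"` and the `CustomLanguage` class are made up for illustration, and the exact registry usage may differ.

```python
# Hypothetical sketch: register a custom Language subclass under the made-up
# code "xy" so that util.get_lang_class("xy") can find it.
from spacy.lang.en import English
from spacy.util import get_lang_class, registry

@registry.languages("xy")
class CustomLanguage(English):
    lang = "xy"

lang_class = get_lang_class("xy")
nlp = lang_class()
```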
### util.set_lang_class {#util.set_lang_class tag="function"}
Set a custom `Language` class name that can be loaded via
[`get_lang_class`](/api/top-level#util.get_lang_class). If your model uses a
custom language, this is required so that spaCy can load the correct class from
the two-letter language code.
> #### Example
>
> ```python
> from spacy.lang.xy import CustomLanguage
>
> util.set_lang_class('xy', CustomLanguage)
> lang_class = util.get_lang_class('xy')
> nlp = lang_class()
> ```
| Name | Type | Description |
| ------ | ---------- | -------------------------------------- |
| `name` | str | Two-letter language code, e.g. `'en'`. |
| `cls` | `Language` | The language class, e.g. `English`. |
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
Check whether a `Language` class is already loaded. `Language` classes are Check whether a `Language` subclass is already loaded. `Language` subclasses are
loaded lazily, to avoid expensive setup code associated with the language data. loaded lazily, to avoid expensive setup code associated with the language data.
> #### Example > #### Example
@ -554,10 +533,10 @@ loaded lazily, to avoid expensive setup code associated with the language data.
> assert util.lang_class_is_loaded("de") is False > assert util.lang_class_is_loaded("de") is False
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------- | | ----------- | ---------------------------------------------- |
| `name` | str | Two-letter language code, e.g. `'en'`. | | `name` | Two-letter language code, e.g. `"en"`. ~~str~~ |
| **RETURNS** | bool | Whether the class has been loaded. | | **RETURNS** | Whether the class has been loaded. ~~bool~~ |
### util.load_model {#util.load_model tag="function" new="2"} ### util.load_model {#util.load_model tag="function" new="2"}
@ -566,7 +545,7 @@ will assume the model is a Python package and import and call its `load()`
method. If called with a path, spaCy will assume it's a data directory, read the method. If called with a path, spaCy will assume it's a data directory, read the
language and pipeline settings from the meta.json and initialize a `Language` language and pipeline settings from the meta.json and initialize a `Language`
class. The model data will then be loaded in via class. The model data will then be loaded in via
[`Language.from_disk()`](/api/language#from_disk). [`Language.from_disk`](/api/language#from_disk).
> #### Example > #### Example
> >
@ -576,31 +555,13 @@ class. The model data will then be loaded in via
> nlp = util.load_model("/path/to/data") > nlp = util.load_model("/path/to/data")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | ---------- | -------------------------------------------------------- | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | str | Package name or model path. | | `name` | Package name or model path. ~~str~~ |
| `**overrides` | - | Specific overrides, like pipeline components to disable. | | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| **RETURNS** | `Language` | `Language` class with the loaded model. | | `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
### util.load_model_from_path {#util.load_model_from_path tag="function" new="2"} | **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
Load a model from a data directory path. Creates the [`Language`](/api/language)
class and pipeline based on the directory's meta.json and then calls
[`from_disk()`](/api/language#from_disk) with the path. This function also makes
it easy to test a new model that you haven't packaged yet.
> #### Example
>
> ```python
> nlp = load_model_from_path("/path/to/data")
> ```
| Name | Type | Description |
| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
| `model_path` | str | Path to model data directory. |
| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"} ### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
@ -616,11 +577,13 @@ A helper function to use in the `load()` method of a model package's
> return load_model_from_init_py(__file__, **overrides) > return load_model_from_init_py(__file__, **overrides)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | ---------- | -------------------------------------------------------- | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. | | `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
| `**overrides` | - | Specific overrides, like pipeline components to disable. | | `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
| **RETURNS** | `Language` | `Language` class with the loaded model. | | `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
### util.get_model_meta {#util.get_model_meta tag="function" new="2"} ### util.get_model_meta {#util.get_model_meta tag="function" new="2"}
@ -632,10 +595,10 @@ Get a model's meta.json from a directory path and validate its contents.
> meta = util.get_model_meta("/path/to/model") > meta = util.get_model_meta("/path/to/model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | ------------------------ | | ----------- | --------------------------------------------- |
| `path` | str / `Path` | Path to model directory. | | `path` | Path to model directory. ~~Union[str, Path]~~ |
| **RETURNS** | dict | The model's meta data. | | **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
### util.is_package {#util.is_package tag="function"} ### util.is_package {#util.is_package tag="function"}
@ -649,10 +612,10 @@ Check if string maps to a package installed via pip. Mainly used to validate
> util.is_package("xyz") # False > util.is_package("xyz") # False
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------ | -------------------------------------------- | | ----------- | ----------------------------------------------------- |
| `name` | str | Name of package. | | `name` | Name of package. ~~str~~ |
| **RETURNS** | `bool` | `True` if installed package, `False` if not. | | **RETURNS** | `True` if installed package, `False` if not. ~~bool~~ |
### util.get_package_path {#util.get_package_path tag="function" new="2"} ### util.get_package_path {#util.get_package_path tag="function" new="2"}
@ -666,10 +629,10 @@ Get path to an installed package. Mainly used to resolve the location of
> # /usr/lib/python3.6/site-packages/en_core_web_sm > # /usr/lib/python3.6/site-packages/en_core_web_sm
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------ | -------------------------------- | | -------------- | ----------------------------------------- |
| `package_name` | str | Name of installed package. | | `package_name` | Name of installed package. ~~str~~ |
| **RETURNS** | `Path` | Path to model package directory. | | **RETURNS** | Path to model package directory. ~~Path~~ |
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"} ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
@ -686,9 +649,9 @@ detecting the IPython kernel. Mainly used for the
> display(HTML(html)) > display(HTML(html))
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------- | | ----------- | ---------------------------------------------- |
| **RETURNS** | bool | `True` if in Jupyter, `False` if not. | | **RETURNS** | `True` if in Jupyter, `False` if not. ~~bool~~ |
### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"} ### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"}
@ -702,10 +665,10 @@ Compile a sequence of prefix rules into a regex object.
> nlp.tokenizer.prefix_search = prefix_regex.search > nlp.tokenizer.prefix_search = prefix_regex.search
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | | `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). | | **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"} ### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -719,10 +682,10 @@ Compile a sequence of suffix rules into a regex object.
> nlp.tokenizer.suffix_search = suffix_regex.search > nlp.tokenizer.suffix_search = suffix_regex.search
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | | `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). | | **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"} ### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -736,10 +699,10 @@ Compile a sequence of infix rules into a regex object.
> nlp.tokenizer.infix_finditer = infix_regex.finditer > nlp.tokenizer.infix_finditer = infix_regex.finditer
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). | | `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). | | **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.minibatch {#util.minibatch tag="function" new="2"} ### util.minibatch {#util.minibatch tag="function" new="2"}
@ -754,11 +717,11 @@ vary on each step.
> nlp.update(batch) > nlp.update(batch)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | -------------- | ---------------------- | | ---------- | ---------------------------------------- |
| `items` | iterable | The items to batch up. | | `items` | The items to batch up. ~~Iterable[Any]~~ |
| `size` | int / iterable | The batch size(s). | | `size` | The batch size(s). ~~Union[int, Sequence[int]]~~ |
| **YIELDS** | list | The batches. | | **YIELDS** | The batches. |
### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"} ### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}
@ -776,17 +739,30 @@ of one entity) or when merging spans with
> filtered = filter_spans(spans) > filtered = filter_spans(spans)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | -------------------- | | ----------- | --------------------------------------- |
| `spans` | iterable | The spans to filter. | | `spans` | The spans to filter. ~~Iterable[Span]~~ |
| **RETURNS** | list | The filtered spans. | | **RETURNS** | The filtered spans. ~~List[Span]~~ |
### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"} ### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"}
<!-- TODO: document --> Given a list of words and a text, reconstruct the original tokens and return a
list of words and spaces that can be used to create a [`Doc`](/api/doc#init).
This can help recover destructive tokenization that didn't preserve any
whitespace information.
| Name | Type | Description | > #### Example
| ----------- | ----- | ----------- | >
| `words` | list | | > ```python
| `text` | str | | > orig_words = ["Hey", ",", "what", "'s", "up", "?"]
| **RETURNS** | tuple | | > orig_text = "Hey, what's up?"
> words, spaces = get_words_and_spaces(orig_words, orig_text)
> # ['Hey', ',', 'what', "'s", 'up', '?']
> # [False, True, False, True, False, False]
> ```
| Name | Description |
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `words` | The list of words. ~~Iterable[str]~~ |
| `text` | The original text. ~~str~~ |
| **RETURNS** | A list of words and a list of boolean values indicating whether the word at this position is followed by a space. ~~Tuple[List[str], List[bool]]~~ |
View File
@ -60,11 +60,11 @@ architectures and their arguments and hyperparameters.
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG) > nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
> ``` > ```
| Setting | Type | Description | Default | | Setting | Description |
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- | | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` | | `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` | | `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
```python ```python
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
@ -101,14 +101,14 @@ attribute. You can also provide a callback to set additional annotations. In
your application, you would normally use a shortcut for this and instantiate the your application, you would normally use a shortcut for this and instantiate the
component using its string name and [`nlp.add_pipe`](/api/language#create_pipe). component using its string name and [`nlp.add_pipe`](/api/language#create_pipe).
| Name | Type | Description | | Name | Description |
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | The shared vocabulary. ~~Vocab~~ |
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. | | `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | | `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. | | `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
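A minimal sketch of the shortcut described above, assuming `spacy-transformers` is installed so the `"transformer"` factory is available (constructing the default model may download pretrained weights):

```python
# Sketch only: add the component by its string name instead of constructing
# the Transformer class directly. Uses the default config shown at the top of
# this page; requires spacy-transformers.
import spacy

nlp = spacy.blank("en")
trf = nlp.add_pipe("transformer")
```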
## Transformer.\_\_call\_\_ {#call tag="method"} ## Transformer.\_\_call\_\_ {#call tag="method"}
@ -128,10 +128,10 @@ to the [`predict`](/api/transformer#predict) and
> processed = transformer(doc) > processed = transformer(doc)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ------------------------ | | ----------- | -------------------------------- |
| `doc` | `Doc` | The document to process. | | `doc` | The document to process. ~~Doc~~ |
| **RETURNS** | `Doc` | The processed document. | | **RETURNS** | The processed document. ~~Doc~~ |
## Transformer.pipe {#pipe tag="method"} ## Transformer.pipe {#pipe tag="method"}
@ -150,12 +150,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
> pass > pass
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------- |
| `stream` | `Iterable[Doc]` | A stream of documents. | | `stream` | A stream of documents. ~~Iterable[Doc]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. | | `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
| **YIELDS** | `Doc` | The processed documents in order. | | **YIELDS** | The processed documents in order. ~~Doc~~ |
## Transformer.begin_training {#begin_training tag="method"} ## Transformer.begin_training {#begin_training tag="method"}
@ -175,13 +175,13 @@ setting up the label scheme based on the data.
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline) > optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. | | `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. | | `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/transformer#create_optimizer) if not set. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Transformer.predict {#predict tag="method"} ## Transformer.predict {#predict tag="method"}
@ -195,10 +195,10 @@ modifying them.
> scores = trf.predict([doc1, doc2]) > scores = trf.predict([doc1, doc2])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------- | ----------------------------------------- | | ----------- | ------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to predict. | | `docs` | The documents to predict. ~~Iterable[Doc]~~ |
| **RETURNS** | - | The model's prediction for each document. | | **RETURNS** | The model's prediction for each document. ~~FullTransformerBatch~~ |
## Transformer.set_annotations {#set_annotations tag="method"} ## Transformer.set_annotations {#set_annotations tag="method"}
@ -215,10 +215,10 @@ callback is then called, if provided.
> trf.set_annotations(docs, scores) > trf.set_annotations(docs, scores)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | --------------- | ----------------------------------------------------- | | -------- | ----------------------------------------------------- |
| `docs` | `Iterable[Doc]` | The documents to modify. | | `docs` | The documents to modify. ~~Iterable[Doc]~~ |
| `scores` | - | The scores to set, produced by `Transformer.predict`. | | `scores` | The scores to set, produced by `Transformer.predict`. ~~FullTransformerBatch~~ |
## Transformer.update {#update tag="method"} ## Transformer.update {#update tag="method"}
@ -244,15 +244,15 @@ and call the optimizer, while the others simply increment the gradients.
> losses = trf.update(examples, sgd=optimizer) > losses = trf.update(examples, sgd=optimizer)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. | | `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `drop` | float | The dropout rate. | | `drop` | The dropout rate. ~~float~~ |
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). | | `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. | | `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | | **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
## Transformer.create_optimizer {#create_optimizer tag="method"} ## Transformer.create_optimizer {#create_optimizer tag="method"}
@ -265,9 +265,9 @@ Create an optimizer for the pipeline component.
> optimizer = trf.create_optimizer() > optimizer = trf.create_optimizer()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------------------------------------------------- | -------------- | | ----------- | ---------------------------- |
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | | **RETURNS** | The optimizer. ~~Optimizer~~ |
## Transformer.use_params {#use_params tag="method, contextmanager"} ## Transformer.use_params {#use_params tag="method, contextmanager"}
@ -282,9 +282,9 @@ context, the original parameters are restored.
> trf.to_disk("/best_model") > trf.to_disk("/best_model")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---- | ----------------------------------------- | | -------- | -------------------------------------------------- |
| `params` | dict | The parameter values to use in the model. | | `params` | The parameter values to use in the model. ~~dict~~ |
## Transformer.to_disk {#to_disk tag="method"} ## Transformer.to_disk {#to_disk tag="method"}
@ -297,11 +297,11 @@ Serialize the pipe to disk.
> trf.to_disk("/path/to/transformer") > trf.to_disk("/path/to/transformer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Transformer.from_disk {#from_disk tag="method"} ## Transformer.from_disk {#from_disk tag="method"}
@ -314,12 +314,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
> trf.from_disk("/path/to/transformer") > trf.from_disk("/path/to/transformer")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. | | **RETURNS** | The modified `Transformer` object. ~~Transformer~~ |
## Transformer.to_bytes {#to_bytes tag="method"} ## Transformer.to_bytes {#to_bytes tag="method"}
@ -332,11 +332,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
Serialize the pipe to a bytestring. Serialize the pipe to a bytestring.
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. | | **RETURNS** | The serialized form of the `Transformer` object. ~~bytes~~ |
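For symmetry with the `from_bytes` example below, a minimal sketch, assuming `trf` is an existing `Transformer` pipeline component:

```python
# Sketch only: serialize the component and restore it from the bytestring.
trf_bytes = trf.to_bytes()
trf.from_bytes(trf_bytes)
```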
## Transformer.from_bytes {#from_bytes tag="method"} ## Transformer.from_bytes {#from_bytes tag="method"}
@ -350,12 +350,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
> trf.from_bytes(trf_bytes) > trf.from_bytes(trf_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. | | **RETURNS** | The `Transformer` object. ~~Transformer~~ |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}
@ -386,20 +386,20 @@ by this class. Instances of this class
are typically assigned to the [`Doc._.trf_data`](/api/transformer#custom-attributes) are typically assigned to the [`Doc._.trf_data`](/api/transformer#custom-attributes)
extension attribute. extension attribute.
| Name | Type | Description | | Name | Description |
| --------- | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tokens` | `Dict` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. | | `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
| `tensors` | `List[FloatsXd]` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. | | `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. | | `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | int | The width of the last hidden layer. | | `width` | The width of the last hidden layer. ~~int~~ |
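For instance, a rough sketch of inspecting these fields on a processed `Doc`, assuming a transformer-based pipeline is installed (the package name below is illustrative):

```python
import spacy

# Illustrative package name - any transformer-based pipeline will do.
nlp = spacy.load("en_core_web_trf")
doc = nlp("Apple is opening its first big office in San Francisco.")

trf_data = doc._.trf_data                # TransformerData
print(list(trf_data.tokens.keys()))      # tokenizer output fields (token IDs, attention mask, ...)
print(trf_data.tensors[-1].shape)        # last hidden state, usually 3-dimensional
print(trf_data.align.lengths)            # number of wordpieces aligned to each spaCy token
```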
### TransformerData.empty {#transformerdata-emoty tag="classmethod"} ### TransformerData.empty {#transformerdata-emoty tag="classmethod"}
Create an empty `TransformerData` container. Create an empty `TransformerData` container.
| Name | Type | Description | | Name | Description |
| ----------- | ----------------- | -------------- | | ----------- | ---------------------------------- |
| **RETURNS** | `TransformerData` | The container. | | **RETURNS** | The container. ~~TransformerData~~ |
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"} ## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
@ -407,13 +407,13 @@ Holds a batch of input and output objects for a transformer model. The data can
then be split to a list of [`TransformerData`](/api/transformer#transformerdata) then be split to a list of [`TransformerData`](/api/transformer#transformerdata)
objects to associate the outputs to each [`Doc`](/api/doc) in the batch. objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
| Name | Type | Description | | Name | Description |
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `spans` | `List[List[Span]]` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. | | `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | The output of the tokenizer. | | `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
| `tensors` | `List[torch.Tensor]` | The output of the transformer model. | | `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. | | `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `doc_data` | `List[TransformerData]` | The outputs, split per `Doc` object. | | `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~ |
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"} ### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
@ -422,19 +422,19 @@ current object's spans, tokens and alignment. This is used during the backward
pass, in order to construct the gradients to pass back into the transformer pass, in order to construct the gradients to pass back into the transformer
model. model.
| Name | Type | Description | | Name | Description |
| ----------- | ---------------------- | ------------------------------- | | ----------- | -------------------------------------------------------- |
| `arrays` | `List[List[Floats3d]]` | The split batch of activations. | | `arrays` | The split batch of activations. ~~List[List[Floats3d]]~~ |
| **RETURNS** | `FullTransformerBatch` | The transformer batch. | | **RETURNS** | The transformer batch. ~~FullTransformerBatch~~ |
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"} ### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
Split a `TransformerData` object that represents a batch into a list with one Split a `TransformerData` object that represents a batch into a list with one
`TransformerData` per `Doc`. `TransformerData` per `Doc`.
| Name | Type | Description | | Name | Description |
| ----------- | ----------------------- | ---------------- | | ----------- | ------------------------------------------ |
| **RETURNS** | `List[TransformerData]` | The split batch. | | **RETURNS** | The split batch. ~~List[TransformerData]~~ |
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"} ## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
@ -460,10 +460,10 @@ decorator.
> return get_sent_spans > return get_sent_spans
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------------ | ---------------------------------------- | | ----------- | ------------------------------------------------------------- |
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. | | `docs` | A batch of `Doc` objects. ~~Iterable[Doc]~~ |
| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. | | **RETURNS** | The spans to process by the transformer. ~~List[List[Span]]~~ |
### doc_spans.v1 {#doc_spans tag="registered function"} ### doc_spans.v1 {#doc_spans tag="registered function"}
@ -510,10 +510,10 @@ than `window` will allow for an overlap, so that some tokens are counted twice.
This can be desirable, because it allows all tokens to have both a left and This can be desirable, because it allows all tokens to have both a left and
right context. right context.
| Name | Type | Description | | Name | Description |
| --------- | ---- | ---------------- | | -------- | ------------------------ |
|  `window` | int | The window size. | | `window` | The window size. ~~int~~ |
| `stride` | int | The stride size. | | `stride` | The stride size. ~~int~~ |
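As an illustration of how `window` and `stride` interact, the following standalone sketch (not the actual implementation) shows which token ranges a strided span getter would cover:

```python
# Standalone illustration: token ranges covered for a 256-token Doc
# with window=128 and stride=96.
window, stride, n_tokens = 128, 96, 256
spans = []
start = 0
while start < n_tokens:
    spans.append((start, min(start + window, n_tokens)))
    start += stride
print(spans)  # [(0, 128), (96, 224), (192, 256)] - consecutive spans overlap by 32 tokens
```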
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"} ## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
@ -534,10 +534,10 @@ You can register custom annotation setters using the
> return setter > return setter
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---------------------- | ------------------------------------ | | ---------- | ------------------------------------------------------------- |
| `docs` | `List[Doc]` | A batch of `Doc` objects. | | `docs` | A batch of `Doc` objects. ~~List[Doc]~~ |
| `trf_data` | `FullTransformerBatch` | The transformers data for the batch. | | `trf_data` | The transformers data for the batch. ~~FullTransformerBatch~~ |
The following built-in functions are available: The following built-in functions are available:
@ -550,6 +550,6 @@ The following built-in functions are available:
The component sets the following The component sets the following
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes): [custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
| Name | Type | Description | | Name | Description |
| -------------- | ----------------------------------------------------- | ---------------------------------------------------- | | -------------- | ------------------------------------------------------------------------ |
| `Doc.trf_data` | [`TransformerData`](/api/transformer#transformerdata) | Transformer tokens and outputs for the `Doc` object. | | `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |

View File

@ -30,13 +30,13 @@ you can add vectors to later.
> vectors = Vectors(data=data, keys=keys) > vectors = Vectors(data=data, keys=keys)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. | | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `keys` | iterable | A sequence of keys aligned with the data. | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | str | A name to identify the vectors table. | | `name` | A name to identify the vectors table. ~~str~~ |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"} ## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@ -51,10 +51,10 @@ raised.
> assert cat_vector == nlp.vocab["cat"].vector > assert cat_vector == nlp.vocab["cat"].vector
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------- | ---------------------------------- | ------------------------------ | | ----------- | ---------------------------------------------------------------- |
| `key` | int | The key to get the vector for. | | `key` | The key to get the vector for. ~~int~~ |
| returns | `ndarray[ndim=1, dtype='float32']` | The vector for the key. | | **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vectors.\_\_setitem\_\_ {#setitem tag="method"} ## Vectors.\_\_setitem\_\_ {#setitem tag="method"}
@ -68,10 +68,10 @@ Set a vector for the given key.
> nlp.vocab.vectors[cat_id] = vector > nlp.vocab.vectors[cat_id] = vector
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---------------------------------- | ------------------------------ | | -------- | ----------------------------------------------------------- |
| `key` | int | The key to set the vector for. | | `key` | The key to set the vector for. ~~int~~ |
| `vector` | `ndarray[ndim=1, dtype='float32']` | The vector to set. | | `vector` | The vector to set. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vectors.\_\_iter\_\_ {#iter tag="method"} ## Vectors.\_\_iter\_\_ {#iter tag="method"}
@ -84,9 +84,9 @@ Iterate over the keys in the table.
> print(key, nlp.vocab.strings[key]) > print(key, nlp.vocab.strings[key])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---- | ------------------- | | ---------- | --------------------------- |
| **YIELDS** | int | A key in the table. | | **YIELDS** | A key in the table. ~~int~~ |
## Vectors.\_\_len\_\_ {#len tag="method"} ## Vectors.\_\_len\_\_ {#len tag="method"}
@ -99,9 +99,9 @@ Return the number of vectors in the table.
> assert len(vectors) == 3 > assert len(vectors) == 3
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------- | | ----------- | ------------------------------------------- |
| **RETURNS** | int | The number of vectors in the table. | | **RETURNS** | The number of vectors in the table. ~~int~~ |
## Vectors.\_\_contains\_\_ {#contains tag="method"} ## Vectors.\_\_contains\_\_ {#contains tag="method"}
@ -115,10 +115,10 @@ Check whether a key has been mapped to a vector entry in the table.
> assert cat_id in vectors > assert cat_id in vectors
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ----------------------------------- | | ----------- | -------------------------------------------- |
| `key` | int | The key to check. | | `key` | The key to check. ~~int~~ |
| **RETURNS** | bool | Whether the key has a vector entry. | | **RETURNS** | Whether the key has a vector entry. ~~bool~~ |
## Vectors.add {#add tag="method"} ## Vectors.add {#add tag="method"}
@ -138,13 +138,13 @@ mapping separately. If you need to manage the strings, you should use the
> nlp.vocab.vectors.add("dog", row=0) > nlp.vocab.vectors.add("dog", row=0)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---------------------------------- | ----------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------- |
| `key` | str / int | The key to add. | | `key` | The key to add. ~~Union[str, int]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. | | `vector` | An optional vector to add for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
| `row` | int | An optional row number of a vector to map the key to. | | `row` | An optional row number of a vector to map the key to. ~~int~~ |
| **RETURNS** | int | The row the vector was added to. | | **RETURNS** | The row the vector was added to. ~~int~~ |
## Vectors.resize {#resize tag="method"} ## Vectors.resize {#resize tag="method"}
@ -160,11 +160,11 @@ These removed items are returned as a list of `(key, row)` tuples.
> removed = nlp.vocab.vectors.resize((10000, 300)) > removed = nlp.vocab.vectors.resize((10000, 300))
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------------------------------- | | ----------- | ---------------------------------------------------------------------------------------- |
| `shape` | tuple | A `(rows, dims)` tuple describing the number of rows and dimensions. | | `shape` | A `(rows, dims)` tuple describing the number of rows and dimensions. ~~Tuple[int, int]~~ |
| `inplace` | bool | Reallocate the memory. | | `inplace` | Reallocate the memory. ~~bool~~ |
| **RETURNS** | list | The removed items as a list of `(key, row)` tuples. | | **RETURNS** | The removed items as a list of `(key, row)` tuples. ~~List[Tuple[int, int]]~~ |
## Vectors.keys {#keys tag="method"} ## Vectors.keys {#keys tag="method"}
@ -177,9 +177,9 @@ A sequence of the keys in the table.
> print(key, nlp.vocab.strings[key]) > print(key, nlp.vocab.strings[key])
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | -------- | ----------- | | ----------- | --------------------------- |
| **RETURNS** | iterable | The keys. | | **RETURNS** | The keys. ~~Iterable[int]~~ |
## Vectors.values {#values tag="method"} ## Vectors.values {#values tag="method"}
@ -194,9 +194,9 @@ the length of the vectors table.
> print(vector) > print(vector)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ---------------------------------- | ---------------------- | | ---------- | --------------------------------------------------------------- |
| **YIELDS** | `ndarray[ndim=1, dtype='float32']` | A vector in the table. | | **YIELDS** | A vector in the table. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vectors.items {#items tag="method"} ## Vectors.items {#items tag="method"}
@ -209,9 +209,9 @@ Iterate over `(key, vector)` pairs, in order.
> print(key, nlp.vocab.strings[key], vector) > print(key, nlp.vocab.strings[key], vector)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | ----- | -------------------------------- | | ---------- | ------------------------------------------------------------------------------------- |
| **YIELDS** | tuple | `(key, vector)` pairs, in order. | | **YIELDS** | `(key, vector)` pairs, in order. ~~Tuple[int, numpy.ndarray[ndim=1, dtype=float32]]~~ |
## Vectors.find {#find tag="method"} ## Vectors.find {#find tag="method"}
@ -226,14 +226,14 @@ Look up one or more keys by row, or vice versa.
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985]) > keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ------------------------------------- | ------------------------------------------------------------------------ | | -------------- | -------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. | | `key` | Find the row that the given key points to. Returns int, `-1` if missing. ~~Union[str, int]~~ |
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. | | `keys` | Find rows that the keys point to. Returns `numpy.ndarray`. ~~Iterable[Union[str, int]]~~ |
| `row` | int | Find the first key that points to the row. Returns int. | | `row` | Find the first key that points to the row. Returns integer. ~~int~~ |
| `rows` | iterable | Find the keys that point to the rows. Returns ndarray. | | `rows` | Find the keys that point to the rows. Returns `numpy.ndarray`. ~~Iterable[int]~~ |
| **RETURNS** | The requested key, keys, row or rows. | | **RETURNS** | The requested key, keys, row or rows. ~~Union[int, numpy.ndarray[ndim=1, dtype=float32]]~~ |
## Vectors.shape {#shape tag="property"} ## Vectors.shape {#shape tag="property"}
@ -250,9 +250,9 @@ vector table.
> assert dims == 300 > assert dims == 300
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | ---------------------- | | ----------- | ------------------------------------------ |
| **RETURNS** | tuple | A `(rows, dims)` pair. | | **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ |
## Vectors.size {#size tag="property"} ## Vectors.size {#size tag="property"}
@ -265,9 +265,9 @@ The vector size, i.e. `rows * dims`.
> assert vectors.size == 150000 > assert vectors.size == 150000
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------- | | ----------- | ------------------------ |
| **RETURNS** | int | The vector size. | | **RETURNS** | The vector size. ~~int~~ |
## Vectors.is_full {#is_full tag="property"} ## Vectors.is_full {#is_full tag="property"}
@ -283,9 +283,9 @@ If a table is full, it can be resized using
> assert vectors.is_full > assert vectors.is_full
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------------------------- | | ----------- | ------------------------------------------- |
| **RETURNS** | bool | Whether the vectors table is full. | | **RETURNS** | Whether the vectors table is full. ~~bool~~ |
## Vectors.n_keys {#n_keys tag="property"} ## Vectors.n_keys {#n_keys tag="property"}
@ -301,9 +301,9 @@ vectors, they will be counted individually.
> assert vectors.n_keys == 0 > assert vectors.n_keys == 0
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ------------------------------------ | | ----------- | -------------------------------------------- |
| **RETURNS** | int | The number of all keys in the table. | | **RETURNS** | The number of all keys in the table. ~~int~~ |
## Vectors.most_similar {#most_similar tag="method"} ## Vectors.most_similar {#most_similar tag="method"}
@ -320,14 +320,14 @@ performed in chunks, to avoid consuming too much memory. You can set the
> most_similar = nlp.vocab.vectors.most_similar(queries, n=10) > most_similar = nlp.vocab.vectors.most_similar(queries, n=10)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------- | ------------------------------------------------------------------ | | -------------- | --------------------------------------------------------------------------- |
| `queries` | `ndarray` | An array with one or more vectors. | | `queries` | An array with one or more vectors. ~~numpy.ndarray~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `batch_size` | int | The batch size to use. Defaults to `1024`. | | `batch_size` | The batch size to use. Defaults to `1024`. ~~int~~ |
| `n` | int | The number of entries to return for each query. Defaults to `1`. | | `n` | The number of entries to return for each query. Defaults to `1`. ~~int~~ |
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. | | `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ |
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. | | **RETURNS** | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ |
## Vectors.to_disk {#to_disk tag="method"} ## Vectors.to_disk {#to_disk tag="method"}
@ -340,9 +340,9 @@ Save the current state to a directory.
> >
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
## Vectors.from_disk {#from_disk tag="method"} ## Vectors.from_disk {#from_disk tag="method"}
@ -355,10 +355,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> vectors.from_disk("/path/to/vectors") > vectors.from_disk("/path/to/vectors")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ------------ | -------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| **RETURNS** | `Vectors` | The modified `Vectors` object. | | **RETURNS** | The modified `Vectors` object. ~~Vectors~~ |
## Vectors.to_bytes {#to_bytes tag="method"} ## Vectors.to_bytes {#to_bytes tag="method"}
@ -370,9 +370,9 @@ Serialize the current state to a binary string.
> vectors_bytes = vectors.to_bytes() > vectors_bytes = vectors.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ----- | -------------------------------------------- | | ----------- | ------------------------------------------------------ |
| **RETURNS** | bytes | The serialized form of the `Vectors` object. | | **RETURNS** | The serialized form of the `Vectors` object. ~~bytes~~ |
## Vectors.from_bytes {#from_bytes tag="method"} ## Vectors.from_bytes {#from_bytes tag="method"}
@ -387,15 +387,15 @@ Load state from a binary string.
> new_vectors.from_bytes(vectors_bytes) > new_vectors.from_bytes(vectors_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------- | ---------------------- | | ----------- | --------------------------------- |
| `data` | bytes | The data to load from. | | `data` | The data to load from. ~~bytes~~ |
| **RETURNS** | `Vectors` | The `Vectors` object. | | **RETURNS** | The `Vectors` object. ~~Vectors~~ |
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Description |
| --------- | ---------------------------------- | ------------------------------------------------------------------------------- | | --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `data` | `ndarray[ndim=1, dtype='float32']` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. | | `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
| `key2row` | dict | Dictionary mapping word hashes to rows in the `Vectors.data` table. | | `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
| `keys` | `ndarray[ndim=1, dtype='float32']` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. | | `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
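A short sketch of how `data` and `key2row` relate, assuming a pipeline with word vectors is installed (the package name is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_md")       # illustrative: any pipeline with word vectors
vectors = nlp.vocab.vectors

key = nlp.vocab.strings["cat"]           # word hash from the StringStore
row = vectors.key2row[key]               # row of that word in the data table
print(vectors.data[row].shape)           # the stored vector for "cat"
print(vectors.data.shape)                # (n_entries, n_dimensions)
```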

View File

@ -21,14 +21,15 @@ Create the vocabulary.
> vocab = Vocab(strings=["hello", "world"]) > vocab = Vocab(strings=["hello", "world"])
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. | | `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. | | `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ |
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
## Vocab.\_\_len\_\_ {#len tag="method"} ## Vocab.\_\_len\_\_ {#len tag="method"}
@ -41,9 +42,9 @@ Get the current number of lexemes in the vocabulary.
> assert len(nlp.vocab) > 0 > assert len(nlp.vocab) > 0
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | ---------------------------------------- | | ----------- | ------------------------------------------------ |
| **RETURNS** | int | The number of lexemes in the vocabulary. | | **RETURNS** | The number of lexemes in the vocabulary. ~~int~~ |
## Vocab.\_\_getitem\_\_ {#getitem tag="method"} ## Vocab.\_\_getitem\_\_ {#getitem tag="method"}
@ -57,10 +58,10 @@ given, a new lexeme is created and stored.
> assert nlp.vocab[apple] == nlp.vocab["apple"] > assert nlp.vocab[apple] == nlp.vocab["apple"]
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------- | ---------------------------------------- | | -------------- | ------------------------------------------------------------ |
| `id_or_string` | int / str | The hash value of a word, or its string. | | `id_or_string` | The hash value of a word, or its string. ~~Union[int, str]~~ |
| **RETURNS** | `Lexeme` | The lexeme indicated by the given ID. | | **RETURNS** | The lexeme indicated by the given ID. ~~Lexeme~~ |
## Vocab.\_\_iter\_\_ {#iter tag="method"} ## Vocab.\_\_iter\_\_ {#iter tag="method"}
@ -72,9 +73,9 @@ Iterate over the lexemes in the vocabulary.
> stop_words = (lex for lex in nlp.vocab if lex.is_stop) > stop_words = (lex for lex in nlp.vocab if lex.is_stop)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ---------- | -------- | --------------------------- | | ---------- | -------------------------------------- |
| **YIELDS** | `Lexeme` | An entry in the vocabulary. | | **YIELDS** | An entry in the vocabulary. ~~Lexeme~~ |
## Vocab.\_\_contains\_\_ {#contains tag="method"} ## Vocab.\_\_contains\_\_ {#contains tag="method"}
@ -91,10 +92,10 @@ given string, you need to look it up in
> assert oov not in nlp.vocab > assert oov not in nlp.vocab
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | ---- | -------------------------------------------------- | | ----------- | ----------------------------------------------------------- |
| `string` | str | The ID string. | | `string` | The ID string. ~~str~~ |
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | | **RETURNS** | Whether the string has an entry in the vocabulary. ~~bool~~ |
## Vocab.add_flag {#add_flag tag="method"} ## Vocab.add_flag {#add_flag tag="method"}
@ -115,11 +116,11 @@ using `token.check_flag(flag_id)`.
> assert doc[2].check_flag(MY_PRODUCT) == True > assert doc[2].check_flag(MY_PRODUCT) == True
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value. | | `flag_getter` | A function that takes the lexeme text and returns the boolean flag value. ~~Callable[[str], bool]~~ |
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | | `flag_id` | An integer between `1` and `63` (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. ~~int~~ |
| **RETURNS** | int | The integer ID by which the flag value can be checked. | | **RETURNS** | The integer ID by which the flag value can be checked. ~~int~~ |
## Vocab.reset_vectors {#reset_vectors tag="method" new="2"} ## Vocab.reset_vectors {#reset_vectors tag="method" new="2"}
@ -133,11 +134,11 @@ have to call this to change the size of the vectors. Only one of the `width` and
> nlp.vocab.reset_vectors(width=300) > nlp.vocab.reset_vectors(width=300)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | ---- | -------------------------------------- | | -------------- | ---------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `width` | int | The new width (keyword argument only). | | `width` | The new width. ~~int~~ |
| `shape` | int | The new shape (keyword argument only). | | `shape` | The new shape. ~~int~~ |
## Vocab.prune_vectors {#prune_vectors tag="method" new="2"} ## Vocab.prune_vectors {#prune_vectors tag="method" new="2"}
@ -158,11 +159,11 @@ cosines are calculated in minibatches, to reduce memory usage.
> assert len(nlp.vocab.vectors) <= 1000 > assert len(nlp.vocab.vectors) <= 1000
> ``` > ```
| Name | Type | Description | | Name | Description |
| ------------ | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nr_row` | int | The number of rows to keep in the vector table. | | `nr_row` | The number of rows to keep in the vector table. ~~int~~ |
| `batch_size` | int | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. | | `batch_size` | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. ~~int~~ |
| **RETURNS** | dict | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. | | **RETURNS** | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. ~~Dict[str, Tuple[str, float]]~~ |
## Vocab.get_vector {#get_vector tag="method" new="2"} ## Vocab.get_vector {#get_vector tag="method" new="2"}
@ -178,12 +179,12 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
> nlp.vocab.get_vector("apple", minn=1, maxn=5) > nlp.vocab.get_vector("apple", minn=1, maxn=5)
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- | | ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `orth` | int / str | The hash value of a word, or its unicode string. | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
| `minn` <Tag variant="new">2.1</Tag> | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | | `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
| `maxn` <Tag variant="new">2.1</Tag> | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. | | `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. | | **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vocab.set_vector {#set_vector tag="method" new="2"} ## Vocab.set_vector {#set_vector tag="method" new="2"}
@ -196,10 +197,10 @@ or hash value.
> nlp.vocab.set_vector("apple", array([...])) > nlp.vocab.set_vector("apple", array([...]))
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------- | ---------------------------------------- | ------------------------------------------------ | | -------- | -------------------------------------------------------------------- |
| `orth` | int / str | The hash value of a word, or its unicode string. | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | The vector to set. | | `vector` | The vector to set. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vocab.has_vector {#has_vector tag="method" new="2"} ## Vocab.has_vector {#has_vector tag="method" new="2"}
@ -213,10 +214,10 @@ Words can be looked up by string or hash value.
> vector = nlp.vocab.get_vector("apple") > vector = nlp.vocab.get_vector("apple")
> ``` > ```
| Name | Type | Description | | Name | Description |
| ----------- | --------- | ------------------------------------------------ | | ----------- | -------------------------------------------------------------------- |
| `orth` | int / str | The hash value of a word, or its unicode string. | | `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
| **RETURNS** | bool | Whether the word has a vector. | | **RETURNS** | Whether the word has a vector. ~~bool~~ |
## Vocab.to_disk {#to_disk tag="method" new="2"} ## Vocab.to_disk {#to_disk tag="method" new="2"}
@ -228,11 +229,11 @@ Save the current state to a directory.
> nlp.vocab.to_disk("/path/to/vocab") > nlp.vocab.to_disk("/path/to/vocab")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
## Vocab.from_disk {#from_disk tag="method" new="2"} ## Vocab.from_disk {#from_disk tag="method" new="2"}
@ -245,12 +246,12 @@ Loads state from a directory. Modifies the object in place and returns it.
> vocab = Vocab().from_disk("/path/to/vocab") > vocab = Vocab().from_disk("/path/to/vocab")
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | -------------------------------------------------------------------------- | | -------------- | ----------------------------------------------------------------------------------------------- |
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Vocab` | The modified `Vocab` object. | | **RETURNS** | The modified `Vocab` object. ~~Vocab~~ |
## Vocab.to_bytes {#to_bytes tag="method"} ## Vocab.to_bytes {#to_bytes tag="method"}
@ -262,11 +263,11 @@ Serialize the current state to a binary string.
> vocab_bytes = nlp.vocab.to_bytes() > vocab_bytes = nlp.vocab.to_bytes()
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | bytes | The serialized form of the `Vocab` object. | | **RETURNS** | The serialized form of the `Vocab` object. ~~bytes~~ |
## Vocab.from_bytes {#from_bytes tag="method"} ## Vocab.from_bytes {#from_bytes tag="method"}
@ -281,12 +282,12 @@ Load state from a binary string.
> vocab.from_bytes(vocab_bytes) > vocab.from_bytes(vocab_bytes)
> ``` > ```
| Name | Type | Description | | Name | Description |
| -------------- | --------------- | ------------------------------------------------------------------------- | | -------------- | ------------------------------------------------------------------------------------------- |
| `bytes_data` | bytes | The data to load from. | | `bytes_data` | The data to load from. ~~bytes~~ |
| _keyword-only_ | | | | _keyword-only_ | |
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
| **RETURNS** | `Vocab` | The `Vocab` object. | | **RETURNS** | The `Vocab` object. ~~Vocab~~ |
## Attributes {#attributes} ## Attributes {#attributes}
@ -299,13 +300,13 @@ Load state from a binary string.
> assert type(PERSON) == int > assert type(PERSON) == int
> ``` > ```
| Name | Type | Description | | Name | Description |
| --------------------------------------------- | ------------- | ------------------------------------------------------------ | | --------------------------------------------- | ------------------------------------------------------------------------------- |
| `strings` | `StringStore` | A table managing the string-to-int mapping. | | `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. | | `vectors` <Tag variant="new">2</Tag> | A table associating word IDs to word vectors. ~~Vectors~~ |
| `vectors_length` | int | Number of dimensions for each word vector. | | `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
| `lookups` | `Lookups` | The available lookup tables in this vocab. | | `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
| `writing_system` <Tag variant="new">2.1</Tag> | dict | A dict with information about the language's writing system. | | `writing_system` <Tag variant="new">2.1</Tag> | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
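A minimal sketch of accessing these attributes on a blank pipeline (the exact `writing_system` values depend on the language defaults):

```python
import spacy

nlp = spacy.blank("en")
print(len(nlp.vocab.strings))        # entries in the string-to-hash table
print(nlp.vocab.vectors_length)      # 0 - no vectors loaded in a blank pipeline
print(nlp.vocab.writing_system)      # e.g. {"direction": "ltr", "has_case": True, "has_letters": True}
```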
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -73,14 +73,14 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
### Other classes {#architecture-other} ### Other classes {#architecture-other}
| Name | Description | | Name | Description |
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- | | ------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- |
| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. | | [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. | | [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. | | [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. | | [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. | | [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. | | [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. | | [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
| [`Scorer`](/api/scorer) | Compute evaluation scores. | | [`Scorer`](/api/scorer) | Compute evaluation scores. |
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. | | [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |

View File

@ -980,7 +980,7 @@ nlp.tokenizer = my_tokenizer
| Argument | Type | Description | | Argument | Type | Description |
| ----------- | ----------------- | ------------------------- | | ----------- | ----------------- | ------------------------- |
| `text` | str | The raw text to tokenize. | | `text` | `str` | The raw text to tokenize. |
| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. | | **RETURNS** | [`Doc`](/api/doc) | The tokenized document. |
#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example} #### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example}

View File

@ -139,25 +139,25 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
The `meta` argument of the `Chinese` language class supports the following The `meta` argument of the `Chinese` language class supports the following
tokenizer config settings: tokenizer config settings:
| Name | Type | Description | | Name | Description |
| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- | | ------------------ | --------------------------------------------------------------------------------------------------------------- |
| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. | | `segmenter` | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. ~~str~~ |
| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. | | `pkuseg_model` | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. | | `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. ~~str~~ |
```python ```python
### Examples ### Examples
# Load "default" model # Load "default" model
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"} cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}}) nlp = Chinese(config={"tokenizer": {"config": cfg}})
# Load local model # Load local model
cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"} cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}}) nlp = Chinese(config={"tokenizer": {"config": cfg}})
# Override the user directory # Override the user directory
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"} cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
nlp = Chinese(meta={"tokenizer": {"config": cfg}}) nlp = Chinese(config={"tokenizer": {"config": cfg}})
``` ```
You can also modify the user dictionary on-the-fly: You can also modify the user dictionary on-the-fly:
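For example, assuming the Chinese tokenizer exposes the `pkuseg_update_user_dict` method (as in recent spaCy versions) and the pipeline is configured with the `pkuseg` segmenter:

```python
# Add words to the pkuseg user dictionary at runtime.
nlp.tokenizer.pkuseg_update_user_dict(["中国", "ABC"])
# Remove all words from the user dictionary and start over.
nlp.tokenizer.pkuseg_update_user_dict([], reset=True)
```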

View File

@ -477,10 +477,10 @@ only being able to modify it afterwards.
> return doc > return doc
> ``` > ```
| Argument | Type | Description | | Argument | Type | Description |
| ----------- | ----- | ------------------------------------------------------ | | ----------- | ----------------- | ------------------------------------------------------ |
| `doc` | `Doc` | The `Doc` object processed by the previous component. | | `doc` | [`Doc`](/api/doc) | The `Doc` object processed by the previous component. |
| **RETURNS** | `Doc` | The `Doc` object processed by this pipeline component. | | **RETURNS** | [`Doc`](/api/doc) | The `Doc` object processed by this pipeline component. |
The [`@Language.component`](/api/language#component) decorator lets you turn a The [`@Language.component`](/api/language#component) decorator lets you turn a
simple function into a pipeline component. It takes at least one argument, the simple function into a pipeline component. It takes at least one argument, the
@ -502,12 +502,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
> nlp.add_pipe("my_component", before="parser") > nlp.add_pipe("my_component", before="parser")
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| -------- | --------- | ------------------------------------------------------------------------ | | -------- | --------------------------------------------------------------------------------- |
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | | `last` | If set to `True`, component is added **last** in the pipeline (default). ~~bool~~ |
| `first` | bool | If set to `True`, component is added **first** in the pipeline. | | `first` | If set to `True`, component is added **first** in the pipeline. ~~bool~~ |
| `before` | str / int | String name or index to add the new component **before**. | | `before` | String name or index to add the new component **before**. ~~Union[str, int]~~ |
| `after` | str / int | String name or index to add the new component **after**. | | `after` | String name or index to add the new component **after**. ~~Union[str, int]~~ |
<Infobox title="Changed in v3.0" variant="warning"> <Infobox title="Changed in v3.0" variant="warning">
@ -626,10 +626,10 @@ added to the pipeline:
> return MyComponent() > return MyComponent()
> ``` > ```
| Argument | Type | Description | | Argument | Description |
| -------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------- | | -------- | --------------------------------------------------------------------------------------------------------------------------------- |
| `nlp` | [`Language`](/api/language) | The current `nlp` object. Can be used to access the | | `nlp` | The current `nlp` object. Can be used to access the shared vocab. ~~Language~~ |
| `name` | str | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. | | `name` | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. ~~str~~ |
All other settings can be passed in by the user via the `config` argument on All other settings can be passed in by the user via the `config` argument on
[`nlp.add_pipe`](/api/language). The [`nlp.add_pipe`](/api/language). The
@ -1332,12 +1332,11 @@ function that takes a `Doc`, modifies it and returns it.
- If you're looking to publish a model that depends on a custom pipeline - If you're looking to publish a model that depends on a custom pipeline
component, you can either **require it** in the model package's dependencies, component, you can either **require it** in the model package's dependencies,
or if the component is specific and lightweight choose to **ship it with or if the component is specific and lightweight choose to **ship it with
your model package** and add it to the `Language` instance returned by the your model package**. Just make sure the
model's `load()` method. For examples of this, check out the implementations [`@Language.component`](/api/language#component) or
of spaCy's [`@Language.factory`](/api/language#factory) decorator that registers the
[`load_model_from_init_py`](/api/top-level#util.load_model_from_init_py) custom component runs in your model's `__init__.py` or is exposed via an
[`load_model_from_path`](/api/top-level#util.load_model_from_path) utility [entry point](/usage/saving-loading#entry-points).
functions.
- Once you're ready to share your extension with others, make sure to **add docs - Once you're ready to share your extension with others, make sure to **add docs
and installation instructions** (you can always link to this page for more and installation instructions** (you can always link to this page for more

View File

@ -157,19 +157,20 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for [`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are: rule-based matching are:
| Attribute | Type |  Description | | Attribute |  Description |
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ | | -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
| `ORTH` | str | The exact verbatim text of a token. | | `ORTH` | The exact verbatim text of a token. ~~str~~ |
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. | | `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
| `LOWER` | str | The lowercase form of the token text. | | `LOWER` | The lowercase form of the token text. ~~str~~ |
|  `LENGTH` | int | The length of the token text. | |  `LENGTH` | The length of the token text. ~~int~~ |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | |  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
| `ENT_TYPE` | str | The token's entity label. | | `ENT_TYPE` | The token's entity label. ~~str~~ |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | | `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
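To give a rough sense of how these attributes combine in practice (a minimal sketch using a blank English pipeline, since only surface attributes are used), each dictionary in the pattern below describes one token:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "hello", optionally followed by punctuation, followed by "world" (case-insensitive)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "world"}]
matcher.add("HELLO_WORLD", [pattern])

doc = nlp("Hello, world! Hello world!")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)
```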
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?"> <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -231,11 +232,11 @@ following rich comparison attributes are available:
> pattern2 = [{"LENGTH": {">=": 10}}] > pattern2 = [{"LENGTH": {">=": 10}}]
> ``` > ```
| Attribute | Value Type | Description | | Attribute | Description |
| -------------------------- | ---------- | --------------------------------------------------------------------------------- | | -------------------------- | ------------------------------------------------------------------------------------------------------- |
| `IN` | any | Attribute value is member of a list. | | `IN` | Attribute value is member of a list. ~~Any~~ |
| `NOT_IN` | any | Attribute value is _not_ member of a list. | | `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. | | `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
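For example, a small sketch combining set membership and a numeric comparison (again with a blank pipeline; the match keys are placeholders):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)
# "love" followed by either "cats" or "dogs"
matcher.add("LOVE_PETS", [[{"LOWER": "love"}, {"LOWER": {"IN": ["cats", "dogs"]}}]])
# any single token that is at least 10 characters long
matcher.add("LONG_TOKEN", [[{"LENGTH": {">=": 10}}]])

doc = nlp("I love dogs and extraordinarily long words")
print([doc[start:end].text for _, start, end in matcher(doc)])
```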
#### Regular expressions {#regex new="2.1"} #### Regular expressions {#regex new="2.1"}
@ -485,12 +486,12 @@ This allows you to write callbacks that consider the entire set of matched
phrases, so that you can resolve overlaps and other conflicts in whatever way phrases, so that you can resolve overlaps and other conflicts in whatever way
you prefer. you prefer.
| Argument | Type | Description | | Argument | Description |
| --------- | --------- | -------------------------------------------------------------------------------------------------------------------- | | --------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `matcher` | `Matcher` | The matcher instance. | | `matcher` | The matcher instance. ~~Matcher~~ |
| `doc` | `Doc` | The document the matcher was used on. | | `doc` | The document the matcher was used on. ~~Doc~~ |
| `i` | int | Index of the current match (`matches[i]`). | | `i` | Index of the current match (`matches[i]`). ~~int~~ |
| `matches` | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. | | `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |
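A minimal callback with this signature might look like the sketch below (the match key and pattern are only placeholders):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

def print_match(matcher, doc, i, matches):
    # matches[i] is the (match_id, start, end) triple for the current match
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

matcher.add("GREETING", [[{"LOWER": "hello"}, {"LOWER": "world"}]], on_match=print_match)
matcher(nlp("hello world"))
```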
### Using custom pipeline components {#matcher-pipeline} ### Using custom pipeline components {#matcher-pipeline}

View File

@ -10,45 +10,48 @@ next: /usage/training
## Installation {#install hidden="true"} ## Installation {#install hidden="true"}
Transformers are a family of neural network architectures that compute dense, Transformers are a family of neural network architectures that compute **dense,
context-sensitive representations for the tokens in your documents. Downstream context-sensitive representations** for the tokens in your documents. Downstream
models in your pipeline can then use these representations as input features to models in your pipeline can then use these representations as input features to
improve their predictions. You can connect multiple components to a single **improve their predictions**. You can connect multiple components to a single
transformer model, with any or all of those components giving feedback to the transformer model, with any or all of those components giving feedback to the
transformer to fine-tune it to your tasks. spaCy's transformer support transformer to fine-tune it to your tasks. spaCy's transformer support
interoperates with PyTorch and the [Huggingface transformers](https://huggingface.co/transformers/) interoperates with [PyTorch](https://pytorch.org) and the
library, giving you access to thousands of pretrained models for your pipelines. [HuggingFace `transformers`](https://huggingface.co/transformers/) library,
There are many [great guides](http://jalammar.github.io/illustrated-transformer/) giving you access to thousands of pretrained models for your pipelines. There
to transformer models, but for practical purposes, you can simply think of them are many [great guides](http://jalammar.github.io/illustrated-transformer/) to
as a drop-in replacement that let you achieve higher accuracy in exchange for transformer models, but for practical purposes, you can simply think of them as
higher training and runtime costs. a drop-in replacement that let you achieve **higher accuracy** in exchange for
**higher training and runtime costs**.
## System requirements ### System requirements
We recommend an NVIDIA GPU with at least 10GB of memory in order to work with We recommend an NVIDIA GPU with at least 10GB of memory in order to work with
transformer models. The exact requirements will depend on the transformer transformer models. The exact requirements will depend on the transformer
model you choose and whether you're training the pipeline or simply running it. model you choose and whether you're training the pipeline or simply running it.
Training a transformer-based model without a GPU will be too slow for most Training a transformer-based model without a GPU will be too slow for most
practical purposes. You'll also need to make sure your GPU drivers are up-to-date practical purposes. You'll also need to make sure your GPU drivers are
and v9+ of the CUDA runtime is installed. up-to-date and v9+ of the CUDA runtime is installed.
Once you have CUDA installed, you'll need to install two pip packages, `cupy` Once you have CUDA installed, you'll need to install two pip packages,
and `spacy-transformers`. [CuPy](https://docs.cupy.dev/en/stable/install.html) [`cupy`](https://docs.cupy.dev/en/stable/install.html) and
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy`
is just like `numpy`, but for GPU. The best way to install it is to choose a is just like `numpy`, but for GPU. The best way to install it is to choose a
wheel that matches the version of CUDA you're using. You may also need to set the wheel that matches the version of CUDA you're using. You may also need to set
`CUDA_PATH` environment variable if your CUDA runtime is installed in the `CUDA_PATH` environment variable if your CUDA runtime is installed in a
a non-standard location. Putting it all together, if you had installed CUDA 10.2 non-standard location. Putting it all together, if you had installed CUDA 10.2
in `/opt/nvidia/cuda`, you would run: in `/opt/nvidia/cuda`, you would run:
``` ```bash
### Installation with CUDA
export CUDA_PATH="/opt/nvidia/cuda" export CUDA_PATH="/opt/nvidia/cuda"
pip install cupy-cuda102 pip install cupy-cuda102
pip install spacy-transformers pip install spacy-transformers
``` ```
Provisioning a new machine will require about 5GB of data to be downloaded in total: Provisioning a new machine will require about 5GB of data to be downloaded in
3GB for the CUDA runtime, 800MB for PyTorch, 400MB for CuPy, 500MB for the transformer total: 3GB for the CUDA runtime, 800MB for PyTorch, 400MB for CuPy, 500MB for
weights, and about 200MB for spaCy and its various requirements. the transformer weights, and about 200MB for spaCy and its various requirements.
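Once everything is installed, a quick sanity check is to require the GPU before loading any pipelines (just a sketch; it works the same regardless of which transformer model you plan to use):

```python
import spacy

# Raises an informative error if CuPy or a compatible GPU isn't available
spacy.require_gpu()
print("GPU allocation is active")
```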
## Runtime usage {#runtime} ## Runtime usage {#runtime}
@ -237,23 +240,22 @@ The [`Transformer`](/api/transformer) component expects a Thinc
[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model` [`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model`
argument. You're not limited to the implementation provided by argument. You're not limited to the implementation provided by
`spacy-transformers`: the only requirement is that your registered function `spacy-transformers`: the only requirement is that your registered function
must return an object of type `Model[List[Doc], FullTransformerBatch]`: that is, must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that
a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
transformer data. transformer data.
> #### Model type annotations > #### Model type annotations
> >
> In the documentation and code base, you may come across type annotations and > In the documentation and code base, you may come across type annotations and
> descriptions of [Thinc](https://thinc.ai) model types, like > descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
> `Model[List[Doc], List[Floats2d]]`. This so-called generic type describes the > List[Floats2d]]~~. This so-called generic type describes the layer and its
> layer and its input and output type: in this case, it takes a list of `Doc` > input and output type: in this case, it takes a list of `Doc` objects as the
> objects as the input and a list of 2-dimensional arrays of floats as the output. > input and a list of 2-dimensional arrays of floats as the output. You can read
> You can read more about defining Thinc > more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
> models [here](https://thinc.ai/docs/usage-models). Also see the > Also see the [type checking](https://thinc.ai/docs/usage-type-checking) for
> [type checking](https://thinc.ai/docs/usage-type-checking) for how to enable > how to enable linting in your editor to see live feedback if your inputs and
> linting in your editor to see live feedback if your inputs and outputs don't > outputs don't match.
> match.
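As a small illustration of how such an annotation reads in code, here is a made-up function signature that only exists for the sake of type checking:

```python
from typing import List

from spacy.tokens import Doc
from thinc.api import Model
from thinc.types import Floats2d

def check_tok2vec(model: Model[List[Doc], List[Floats2d]]) -> None:
    # The generic parameters spell out the layer's input type (a list of Doc
    # objects) and its output type (a list of 2-dimensional float arrays).
    print(model.name)
```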
The same idea applies to task models that power the **downstream components**. The same idea applies to task models that power the **downstream components**.
Most of spaCy's built-in model creation functions support a `tok2vec` argument, Most of spaCy's built-in model creation functions support a `tok2vec` argument,
@ -288,7 +290,7 @@ The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
determines how the vector for each spaCy token will be computed from the zero or determines how the vector for each spaCy token will be computed from the zero or
more source rows the token is aligned against. Here we use the more source rows the token is aligned against. Here we use the
[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
averages the wordpiece rows. We could instead use `reduce_last`, averages the wordpiece rows. We could instead use
[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom [`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
function you write yourself. function you write yourself.
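For reference, these pooling operations are ordinary Thinc layers, so swapping one for another is a one-line change (a sketch in plain Python; in a training config you would reference the registered layer name instead):

```python
from thinc.api import reduce_mean, reduce_max

pooling = reduce_mean()  # average the aligned wordpiece rows per token
# pooling = reduce_max()  # or take the element-wise maximum instead
```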

View File

@ -231,6 +231,7 @@ on them.
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) | | `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` | | keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` | | `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
| `verbose` argument on [`Language.evaluate`](/api/language#evaluate) | logging |
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) | | `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) |
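A couple of the replacements above, shown as code (the pipeline name and path are placeholders):

```python
import spacy

nlp = spacy.load("en_core_web_sm")               # placeholder pipeline name
tagger = nlp.get_pipe("tagger")                  # instead of nlp.tagger
nlp.to_disk("/tmp/my_model", exclude=["vocab"])  # instead of vocab=False
```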
## Migrating from v2.x {#migrating} ## Migrating from v2.x {#migrating}

View File

@ -58,12 +58,12 @@ arcs.
</Infobox> </Infobox>
| Argument | Type | Description | Default | | Argument | Description |
| --------- | ---- | ----------------------------------------------------------- | ----------- | | --------- | ----------------------------------------------------------------------------------------- |
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | | `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` | | `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` | | `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
| `font` | str | Font name or font family for all text. | `"Arial"` | | `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
For a list of all available options, see the For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options). [`displacy` API documentation](/api/top-level#displacy_options).
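For example, assuming a pipeline with a parser is installed, passing these options might look like this:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {"compact": True, "color": "blue", "font": "Source Sans Pro"}
displacy.render(doc, style="dep", options=options)
```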
@ -121,10 +121,10 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
The entity visualizer lets you customize the following `options`: The entity visualizer lets you customize the following `options`:
| Argument | Type | Description | Default | | Argument | Description |
| -------- | ---- | ------------------------------------------------------------------------------------- | ------- | | -------- | -------------------------------------------------------------------------------------------------------------------------- |
| `ents` | list | Entity types to highlight (`None` for all types). | `None` | | `ents` | Entity types to highlight (`None` for all types). Defaults to `None`. ~~Optional[List[str]]~~ |
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `colors` | Color overrides. Entity types in uppercase should be mapped to color names or values. Defaults to `{}`. ~~Dict[str, str]~~ |
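For instance, to highlight only organizations and monetary values and override one color (a small self-contained sketch):

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup for $1 billion")
options = {"ents": ["ORG", "MONEY"], "colors": {"ORG": "#7aecec"}}
displacy.render(doc, style="ent", options=options)
```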
If you specify a list of `ents`, only those entity types will be rendered for If you specify a list of `ents`, only those entity types will be rendered for
example, you can choose to display `PERSON` entities. Internally, the visualizer example, you can choose to display `PERSON` entities. Internally, the visualizer

View File

@ -113,7 +113,6 @@
{ "text": "Vectors", "url": "/api/vectors" }, { "text": "Vectors", "url": "/api/vectors" },
{ "text": "Lookups", "url": "/api/lookups" }, { "text": "Lookups", "url": "/api/lookups" },
{ "text": "Morphology", "url": "/api/morphology" }, { "text": "Morphology", "url": "/api/morphology" },
{ "text": "MorphAnalysis", "url": "/api/morphanalysis" },
{ "text": "KnowledgeBase", "url": "/api/kb" }, { "text": "KnowledgeBase", "url": "/api/kb" },
{ "text": "Scorer", "url": "/api/scorer" }, { "text": "Scorer", "url": "/api/scorer" },
{ "text": "Corpus", "url": "/api/corpus" } { "text": "Corpus", "url": "/api/corpus" }

View File

@ -0,0 +1,43 @@
{
"Doc": "/api/doc",
"Token": "/api/token",
"Span": "/api/span",
"Lexeme": "/api/lexeme",
"Example": "/api/example",
"Alignment": "/api/example#alignment-object",
"Vocab": "/api/vocab",
"StringStore": "/api/stringstore",
"Lookups": "/api/lookups",
"Table": "/api/lookups#table",
"Vectors": "/api/vectors",
"Language": "/api/language",
"Defaults": "/api/language#defaults",
"Scorer": "/api/scorer",
"DocBin": "/api/docbin",
"FactoryMeta": "/api/language#factorymeta",
"Tokenizer": "/api/tokenizer",
"MorphAnalysis": "/api/morphology#morphanalysis",
"KnowledgeBase": "/api/kb",
"Candidate": "/api/kb#candidate",
"Matcher": "/api/matcher",
"PhraseMatcher": "/api/phrasematcher",
"TransformerData": "/api/transformer#transformerdata",
"FullTransformerBatch": "/api/transformer#fulltransformerbatch",
"LexemeC": "/api/cython-structs#lexemec",
"TokenC": "/api/cython-structs#tokenc",
"Config": "https://thinc.ai/docs/api-config#config",
"Optimizer": "https://thinc.ai/docs/api-optimizers",
"Model": "https://thinc.ai/docs/api-model",
"Ragged": "https://thinc.ai/docs/api-types#ragged",
"Floats2d": "https://thinc.ai/docs/api-types#types",
"Floats3d": "https://thinc.ai/docs/api-types#types",
"FloatsXd": "https://thinc.ai/docs/api-types#types",
"cymem.Pool": "https://github.com/explosion/cymem",
"preshed.BloomFilter": "https://github.com/explosion/preshed",
"transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding",
"torch.Tensor": "https://pytorch.org/docs/stable/tensors.html",
"numpy.ndarray": "https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html",
"Match": "https://docs.python.org/3/library/re.html#match-objects",
"Pattern": "https://docs.python.org/3/library/re.html#regular-expression-objects",
"Path": "https://docs.python.org/3/library/pathlib.html"
}

View File

@ -1,4 +1,4 @@
import React from 'react' import React, { Fragment } from 'react'
import PropTypes from 'prop-types' import PropTypes from 'prop-types'
import classNames from 'classnames' import classNames from 'classnames'
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js' import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
@ -6,12 +6,13 @@ import rangeParser from 'parse-numeric-range'
import { StaticQuery, graphql } from 'gatsby' import { StaticQuery, graphql } from 'gatsby'
import { window } from 'browser-monads' import { window } from 'browser-monads'
import CUSTOM_TYPES from '../../meta/type-annotations.json'
import { isString, htmlToReact } from './util' import { isString, htmlToReact } from './util'
import Link from './link' import Link from './link'
import GitHubCode from './github' import GitHubCode from './github'
import classes from '../styles/code.module.sass' import classes from '../styles/code.module.sass'
const WRAP_THRESHOLD = 16 const WRAP_THRESHOLD = 30
export default props => ( export default props => (
<Pre> <Pre>
@ -40,6 +41,52 @@ InlineCode.propTypes = {
children: PropTypes.node, children: PropTypes.node,
} }
// Wrap a known type name in a link to its API page, preserving leading whitespace
function linkType(el, showLink = true) {
    if (!isString(el) || !el.length) return el
    const elStr = el.trim()
    if (!elStr) return el
    const typeUrl = CUSTOM_TYPES[elStr]
    const url = typeUrl == true ? DEFAULT_TYPE_URL : typeUrl
    const ws = el[0] == ' '
    return url && showLink ? (
        <Fragment>
            {ws && ' '}
            <Link to={url} hideIcon>
                {elStr}
            </Link>
        </Fragment>
    ) : (
        el
    )
}
export const TypeAnnotation = ({ lang = 'python', link = true, children }) => {
    // Hacky, but we're temporarily replacing a dot to prevent it from being split during highlighting
    const TMP_DOT = '•'
    const code = Array.isArray(children) ? children.join('') : children || ''
    const rawStr = code.replace('.', TMP_DOT)
    const rawHtml = lang === 'none' || !code ? code : highlightCode(lang, rawStr)
    const html = rawHtml.replace(TMP_DOT, '.').replace(/\n/g, ' ')
    const result = htmlToReact(html)
    const elements = Array.isArray(result) ? result : [result]
    const annotClassNames = classNames(
        'type-annotation',
        `language-${lang}`,
        classes.inlineCode,
        classes.typeAnnotation,
        {
            [classes.wrap]: code.length >= WRAP_THRESHOLD,
        }
    )
    return (
        <code className={annotClassNames} aria-label="Type annotation">
            {elements.map((el, i) => (
                <Fragment key={i}>{linkType(el, !!link)}</Fragment>
            ))}
        </code>
    )
}
export class Code extends React.Component { export class Code extends React.Component {
state = { Juniper: null } state = { Juniper: null }

View File

@ -56,6 +56,38 @@
--color-inline-code-text: var(--color-back) --color-inline-code-text: var(--color-back)
--color-inline-code-bg: var(--color-dark-secondary) --color-inline-code-bg: var(--color-dark-secondary)
.type-annotation
    white-space: pre-wrap
    font-family: var(--font-code)

    &.wrap
        word-wrap: break-word

    a
        border: 0

    // Special style for types in API tables
    td > &:last-child
        display: block
        border-top: 1px dotted var(--color-subtle)
        border-radius: 0
        background: none
        width: calc(100% + 2rem)
        margin-left: -1rem
        padding-left: 1rem
        padding-top: 5px
        margin-top: 5px
        margin-bottom: -5px

        &:before
            content: "Type: "
            opacity: 0.75
            font-family: var(--font-primary)
            color: var(--color-dark-secondary)
            font-weight: bold
            text-transform: uppercase
            margin-right: 5px
.wrap .wrap
white-space: pre-wrap white-space: pre-wrap
word-wrap: anywhere word-wrap: anywhere

View File

@ -358,6 +358,15 @@ body [id]:target
&.italic &.italic
font-style: italic font-style: italic
[class*="language-"].type-annotation .token
    &.builtin, &.boolean, &.number
        color: var(--color-inline-code-text)

    &.operator
        color: var(--syntax-comment)
// Settings for ini syntax (config files) // Settings for ini syntax (config files)
[class*="language-ini"] [class*="language-ini"]
color: var(--syntax-comment) color: var(--syntax-comment)

View File

@ -29,7 +29,8 @@
border: 0 border: 0
.td .td
padding: 1rem padding: 0.9rem 1rem
font-size: 95%
&:not(:last-child) &:not(:last-child)
border-right: 1px solid var(--color-subtle) border-right: 1px solid var(--color-subtle)

View File

@ -20,7 +20,7 @@ import SEO from '../components/seo'
import Link from '../components/link' import Link from '../components/link'
import Section, { Hr } from '../components/section' import Section, { Hr } from '../components/section'
import { Table, Tr, Th, Td } from '../components/table' import { Table, Tr, Th, Td } from '../components/table'
import { Pre, Code, InlineCode } from '../components/code' import { Pre, Code, InlineCode, TypeAnnotation } from '../components/code'
import { Ol, Ul, Li } from '../components/list' import { Ol, Ul, Li } from '../components/list'
import { H2, H3, H4, H5, P, Abbr, Help } from '../components/typography' import { H2, H3, H4, H5, P, Abbr, Help } from '../components/typography'
import Accordion from '../components/accordion' import Accordion from '../components/accordion'
@ -41,6 +41,7 @@ const mdxComponents = {
pre: Pre, pre: Pre,
code: Code, code: Code,
inlineCode: InlineCode, inlineCode: InlineCode,
del: TypeAnnotation,
table: Table, table: Table,
img: Image, img: Image,
tr: Tr, tr: Tr,