mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Update docs, types and API consistency
This commit is contained in:
parent
61dfdd9fbd
commit
3ae5e02f4f
|
@ -70,7 +70,7 @@ def evaluate(
|
|||
corpus = Corpus(data_path, gold_preproc=gold_preproc)
|
||||
nlp = util.load_model(model)
|
||||
dev_dataset = list(corpus(nlp))
|
||||
scores = nlp.evaluate(dev_dataset, verbose=False)
|
||||
scores = nlp.evaluate(dev_dataset)
|
||||
metrics = {
|
||||
"TOK": "token_acc",
|
||||
"TAG": "tag_acc",
|
||||
|
|
|
@ -18,7 +18,7 @@ RENDER_WRAPPER = None
|
|||
|
||||
|
||||
def render(
|
||||
docs: Union[Iterable[Doc], Doc],
|
||||
docs: Union[Iterable[Union[Doc, Span]], Doc, Span],
|
||||
style: str = "dep",
|
||||
page: bool = False,
|
||||
minify: bool = False,
|
||||
|
|
|
@ -439,8 +439,6 @@ class Language:
|
|||
assigns: Iterable[str] = tuple(),
|
||||
requires: Iterable[str] = tuple(),
|
||||
retokenizes: bool = False,
|
||||
scores: Iterable[str] = tuple(),
|
||||
default_score_weights: Dict[str, float] = SimpleFrozenDict(),
|
||||
func: Optional[Callable[[Doc], Doc]] = None,
|
||||
) -> Callable:
|
||||
"""Register a new pipeline component. Can be used for stateless function
|
||||
|
@ -456,12 +454,6 @@ class Language:
|
|||
e.g. "token.ent_id". Used for pipeline analysis.
|
||||
retokenizes (bool): Whether the component changes the tokenization.
|
||||
Used for pipeline analysis.
|
||||
scores (Iterable[str]): All scores set by the component if it's trainable,
|
||||
e.g. ["ents_f", "ents_r", "ents_p"].
|
||||
default_score_weights (Dict[str, float]): The scores to report during
|
||||
training, and their default weight towards the final score used to
|
||||
select the best model. Weights should sum to 1.0 per component and
|
||||
will be combined and normalized for the whole pipeline.
|
||||
func (Optional[Callable]): Factory function if not used as a decorator.
|
||||
|
||||
DOCS: https://spacy.io/api/language#component
|
||||
|
@ -482,8 +474,6 @@ class Language:
|
|||
assigns=assigns,
|
||||
requires=requires,
|
||||
retokenizes=retokenizes,
|
||||
scores=scores,
|
||||
default_score_weights=default_score_weights,
|
||||
func=factory_func,
|
||||
)
|
||||
return component_func
|
||||
|
@ -1112,7 +1102,6 @@ class Language:
|
|||
self,
|
||||
examples: Iterable[Example],
|
||||
*,
|
||||
verbose: bool = False,
|
||||
batch_size: int = 256,
|
||||
scorer: Optional[Scorer] = None,
|
||||
component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
|
||||
|
@ -1121,7 +1110,6 @@ class Language:
|
|||
"""Evaluate a model's pipeline components.
|
||||
|
||||
examples (Iterable[Example]): `Example` objects.
|
||||
verbose (bool): Print debugging information.
|
||||
batch_size (int): Batch size to use.
|
||||
scorer (Optional[Scorer]): Scorer to use. If not passed in, a new one
|
||||
will be created.
|
||||
|
@ -1140,7 +1128,6 @@ class Language:
|
|||
scorer_cfg = {}
|
||||
if scorer is None:
|
||||
kwargs = dict(scorer_cfg)
|
||||
kwargs.setdefault("verbose", verbose)
|
||||
kwargs.setdefault("nlp", self)
|
||||
scorer = Scorer(**kwargs)
|
||||
texts = [eg.reference.text for eg in examples]
|
||||
|
@ -1163,8 +1150,7 @@ class Language:
|
|||
docs = list(docs)
|
||||
end_time = timer()
|
||||
for i, (doc, eg) in enumerate(zip(docs, examples)):
|
||||
if verbose:
|
||||
print(doc)
|
||||
util.logger.debug(doc)
|
||||
eg.predicted = doc
|
||||
results = scorer.score(examples)
|
||||
n_words = sum(len(eg.predicted) for eg in examples)
|
||||
|
|
|
@ -2,7 +2,7 @@ from typing import Optional, Iterable, Dict, Any, Callable, Tuple, TYPE_CHECKING
|
|||
import numpy as np
|
||||
|
||||
from .gold import Example
|
||||
from .tokens import Token, Doc
|
||||
from .tokens import Token, Doc, Span
|
||||
from .errors import Errors
|
||||
from .util import get_lang_class
|
||||
from .morphology import Morphology
|
||||
|
@ -250,15 +250,16 @@ class Scorer:
|
|||
examples: Iterable[Example],
|
||||
attr: str,
|
||||
*,
|
||||
getter: Callable[[Doc, str], Any] = getattr,
|
||||
getter: Callable[[Doc, str], Iterable[Span]] = getattr,
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
"""Returns PRF scores for labeled spans.
|
||||
|
||||
examples (Iterable[Example]): Examples to score
|
||||
attr (str): The attribute to score.
|
||||
getter (Callable[[Doc, str], Any]): Defaults to getattr. If provided,
|
||||
getter(doc, attr) should return the spans for the individual doc.
|
||||
getter (Callable[[Doc, str], Iterable[Span]]): Defaults to getattr. If
|
||||
provided, getter(doc, attr) should return the spans for the
|
||||
individual doc.
|
||||
RETURNS (Dict[str, Any]): A dictionary containing the PRF scores under
|
||||
the keys attr_p/r/f and the per-type PRF scores under attr_per_type.
|
||||
|
||||
|
@ -444,7 +445,7 @@ class Scorer:
|
|||
*,
|
||||
getter: Callable[[Token, str], Any] = getattr,
|
||||
head_attr: str = "head",
|
||||
head_getter: Callable[[Token, str], Any] = getattr,
|
||||
head_getter: Callable[[Token, str], Token] = getattr,
|
||||
ignore_labels: Tuple[str] = tuple(),
|
||||
**cfg,
|
||||
) -> Dict[str, Any]:
|
||||
|
@ -458,7 +459,7 @@ class Scorer:
|
|||
individual token.
|
||||
head_attr (str): The attribute containing the head token. Defaults to
|
||||
'head'.
|
||||
head_getter (Callable[[Token, str], Any]): Defaults to getattr. If provided,
|
||||
head_getter (Callable[[Token, str], Token]): Defaults to getattr. If provided,
|
||||
head_getter(token, attr) should return the value of the head for an
|
||||
individual token.
|
||||
ignore_labels (Tuple): Labels to ignore while scoring (e.g., punct).
|
||||
|
|
|
@ -356,13 +356,13 @@ def test_language_factories_combine_score_weights(weights, expected):
|
|||
|
||||
def test_language_factories_scores():
|
||||
name = "test_language_factories_scores"
|
||||
func = lambda doc: doc
|
||||
func = lambda nlp, name: lambda doc: doc
|
||||
weights1 = {"a1": 0.5, "a2": 0.5}
|
||||
weights2 = {"b1": 0.2, "b2": 0.7, "b3": 0.1}
|
||||
Language.component(
|
||||
Language.factory(
|
||||
f"{name}1", scores=list(weights1), default_score_weights=weights1, func=func,
|
||||
)
|
||||
Language.component(
|
||||
Language.factory(
|
||||
f"{name}2", scores=list(weights2), default_score_weights=weights2, func=func,
|
||||
)
|
||||
meta1 = Language.get_factory_meta(f"{name}1")
|
||||
|
|
|
@ -102,8 +102,7 @@ cdef class Doc:
|
|||
|
||||
Construction 2
|
||||
>>> from spacy.tokens import Doc
|
||||
>>> doc = Doc(nlp.vocab, words=[u'hello', u'world', u'!'],
|
||||
>>> spaces=[True, False, False])
|
||||
>>> doc = Doc(nlp.vocab, words=["hello", "world", "!"], spaces=[True, False, False])
|
||||
|
||||
DOCS: https://spacy.io/api/doc
|
||||
"""
|
||||
|
|
|
@ -886,6 +886,15 @@ def escape_html(text: str) -> str:
|
|||
def get_words_and_spaces(
|
||||
words: Iterable[str], text: str
|
||||
) -> Tuple[List[str], List[bool]]:
|
||||
"""Given a list of words and a text, reconstruct the original tokens and
|
||||
return a list of words and spaces that can be used to create a Doc. This
|
||||
can help recover destructive tokenization that didn't preserve any
|
||||
whitespace information.
|
||||
|
||||
words (Iterable[str]): The words.
|
||||
text (str): The original text.
|
||||
RETURNS (Tuple[List[str], List[bool]]): The words and spaces.
|
||||
"""
|
||||
if "".join("".join(words).split()) != "".join(text.split()):
|
||||
raise ValueError(Errors.E194.format(text=text, words=words))
|
||||
text_words = []
|
||||
|
|
|
@ -75,7 +75,8 @@ import { H1, H2, H3, H4, H5, Label, InlineList, Comment } from
|
|||
Headlines are set in
|
||||
[HK Grotesk](http://cargocollective.com/hanken/HK-Grotesk-Open-Source-Font) by
|
||||
Hanken Design. All other body text uses the best-matching default
|
||||
system font to provide a "native" reading experience.
|
||||
system font to provide a "native" reading experience. All code uses the
|
||||
[JetBrains Mono](https://www.jetbrains.com/lp/mono/) typeface by JetBrains.
|
||||
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
|
@ -106,7 +107,7 @@ Tags are also available as standalone `<Tag />` components.
|
|||
| Argument | Example | Result |
|
||||
| -------- | -------------------------- | ----------------------------------------- |
|
||||
| `tag` | `{tag="method"}` | <Tag>method</Tag> |
|
||||
| `new` | `{new="2"}` | <Tag variant="new">2</Tag> |
|
||||
| `new` | `{new="3"}` | <Tag variant="new">3</Tag> |
|
||||
| `model` | `{model="tagger, parser"}` | <Tag variant="model">tagger, parser</Tag> |
|
||||
| `hidden` | `{hidden="true"}` | |
|
||||
|
||||
|
@ -130,6 +131,8 @@ Special link styles are used depending on the link URL.
|
|||
|
||||
- [I am a regular external link](https://explosion.ai)
|
||||
- [I am a link to the documentation](/api/doc)
|
||||
- [I am a link to an architecture](/api/architectures#HashEmbedCNN)
|
||||
- [I am a link to a model](/models/en#en_core_web_sm)
|
||||
- [I am a link to GitHub](https://github.com/explosion/spaCy)
|
||||
|
||||
### Abbreviations {#abbr}
|
||||
|
@ -188,18 +191,20 @@ the buttons are implemented as styled links instead of native button elements.
|
|||
<InlineList><Button to="#" variant="primary">Primary small</Button>
|
||||
<Button to="#" variant="secondary">Secondary small</Button></InlineList>
|
||||
|
||||
<br />
|
||||
|
||||
<InlineList><Button to="#" variant="primary" large>Primary large</Button>
|
||||
<Button to="#" variant="secondary" large>Secondary large</Button></InlineList>
|
||||
|
||||
## Components
|
||||
|
||||
### Table
|
||||
### Table {#table}
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
> ```markdown_
|
||||
> | Header 1 | Header 2 |
|
||||
> | --- | --- |
|
||||
> | -------- | -------- |
|
||||
> | Column 1 | Column 2 |
|
||||
> ```
|
||||
>
|
||||
|
@ -213,7 +218,7 @@ the buttons are implemented as styled links instead of native button elements.
|
|||
> ```
|
||||
|
||||
Tables are used to present data and API documentation. Certain keywords can be
|
||||
used to mark a footer row with a distinct style, for example to visualise the
|
||||
used to mark a footer row with a distinct style, for example to visualize the
|
||||
return values of a documented function.
|
||||
|
||||
| Header 1 | Header 2 | Header 3 | Header 4 |
|
||||
|
@ -224,7 +229,73 @@ return values of a documented function.
|
|||
| Column 1 | Column 2 | Column 3 | Column 4 |
|
||||
| **RETURNS** | Column 2 | Column 3 | Column 4 |
|
||||
|
||||
### List
|
||||
Tables also support optional "divider" rows that are typically used to denote
|
||||
keyword-only arguments in API documentation. To turn a row into a dividing
|
||||
headline, it should only include content in its first cell, and its value should
|
||||
be italicized:
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
> ```markdown_
|
||||
> | Header 1 | Header 2 | Header 3 |
|
||||
> | -------- | -------- | -------- |
|
||||
> | Column 1 | Column 2 | Column 3 |
|
||||
> | _Hello_ | | |
|
||||
> | Column 1 | Column 2 | Column 3 |
|
||||
> ```
|
||||
|
||||
| Header 1 | Header 2 | Header 3 |
|
||||
| -------- | -------- | -------- |
|
||||
| Column 1 | Column 2 | Column 3 |
|
||||
| _Hello_ | | |
|
||||
| Column 1 | Column 2 | Column 3 |
|
||||
|
||||
### Type Annotations {#type-annotations}
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
> ```markdown_
|
||||
> ~~Model[List[Doc], Floats2d]~~
|
||||
> ```
|
||||
>
|
||||
> #### JSX
|
||||
>
|
||||
> ```markup
|
||||
> <TypeAnnotation>Model[List[Doc], Floats2d]</TypeAnnotation>
|
||||
> ```
|
||||
|
||||
Type annotations are special inline code blocks that are used to describe Python
|
||||
types in the [type hints](https://docs.python.org/3/library/typing.html) format.
|
||||
The special component will split the type, apply syntax highlighting and link
|
||||
all types that specify links in `meta/type-annotations.json`. Types can link to
|
||||
internal or external documentation pages. To make it easy to represent the type
|
||||
annotations in Markdown, the rendering "hijacks" the `~~` tags that would
|
||||
typically be converted to a `<del>` element – but in this case, text surrounded
|
||||
by `~~` becomes a type annotation.
|
||||
|
||||
- ~~Dict[str, List[Union[Doc, Span]]]~~
|
||||
- ~~Model[List[Doc], List[numpy.ndarray]]~~
|
||||
|
||||
Type annotations support a special visual style in tables and will render as a
|
||||
separate row, under the cell text. This allows the API docs to display complex
|
||||
types without taking up too much space in the cell. The type annotation should
|
||||
always be the **last element** in the row.
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
> ```markdown_
|
||||
> | Header 1 | Header 2 |
|
||||
> | -------- | ----------------------- |
|
||||
> | Column 1 | Column 2 ~~List[Doc]~~ |
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||
|
||||
### List {#list}
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
|
@ -255,7 +326,7 @@ automatically.
|
|||
3. Lorem ipsum dolor
|
||||
4. consectetur adipiscing elit
|
||||
|
||||
### Aside
|
||||
### Aside {#aside}
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
|
@ -280,7 +351,7 @@ To make them easier to use in Markdown, paragraphs formatted as blockquotes will
|
|||
turn into asides by default. Level 4 headlines (with a leading `####`) will
|
||||
become aside titles.
|
||||
|
||||
### Code Block
|
||||
### Code Block {#code-block}
|
||||
|
||||
> #### Markdown
|
||||
>
|
||||
|
@ -387,7 +458,7 @@ original file is shown at the top of the widget.
|
|||
https://github.com/explosion/spaCy/tree/master/spacy/language.py
|
||||
```
|
||||
|
||||
### Infobox
|
||||
### Infobox {#infobox}
|
||||
|
||||
import Infobox from 'components/infobox'
|
||||
|
||||
|
@ -425,7 +496,7 @@ blocks.
|
|||
|
||||
</Infobox>
|
||||
|
||||
### Accordion
|
||||
### Accordion {#accordion}
|
||||
|
||||
import Accordion from 'components/accordion'
|
||||
|
||||
|
|
|
@ -33,18 +33,18 @@ TODO: intro and how architectures work, link to
|
|||
> subword_features = true
|
||||
> ```
|
||||
|
||||
Build spaCy's 'standard' tok2vec layer, which uses hash embedding with subword
|
||||
Build spaCy's "standard" tok2vec layer, which uses hash embedding with subword
|
||||
features and a CNN with layer-normalized maxout.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | int | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. |
|
||||
| `depth` | int | The number of convolutional layers to use. Recommended values are between `2` and `8`. |
|
||||
| `embed_size` | int | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. |
|
||||
| `window_size` | int | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * (window_size * 2 + 1)`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. |
|
||||
| `maxout_pieces` | int | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. |
|
||||
| `subword_features` | bool | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. |
|
||||
| `pretrained_vectors` | bool | Whether to also use static vectors. |
|
||||
| Name | Description |
|
||||
| -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the input and output. These are required to be the same, so that residual connections can be used. Recommended values are `96`, `128` or `300`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers to use. Recommended values are between `2` and `8`. ~~int~~ |
|
||||
| `embed_size` | The number of rows in the hash embedding tables. This can be surprisingly small, due to the use of the hash embeddings. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||
| `window_size` | The number of tokens on either side to concatenate during the convolutions. The receptive field of the CNN will be `depth * window_size * 2 + 1`, so a 4-layer network with a window size of `2` will be sensitive to 17 words at a time. Recommended value is `1`. ~~int~~ |
|
||||
| `maxout_pieces` | The number of pieces to use in the maxout non-linearity. If `1`, the [`Mish`](https://thinc.ai/docs/api-layers#mish) non-linearity is used instead. Recommended values are `1`-`3`. ~~int~~ |
|
||||
| `subword_features` | Whether to also embed subword features, specifically the prefix, suffix and word shape. This is recommended for alphabetic languages like English, but not if single-character tokens are used for a language such as Chinese. ~~bool~~ |
|
||||
| `pretrained_vectors` | Whether to also use static vectors. ~~bool~~ |
|
||||
|
||||
### spacy.Tok2Vec.v1 {#Tok2Vec}
|
||||
|
||||
|
@ -67,10 +67,10 @@ Construct a tok2vec model out of embedding and encoding subnetworks. See the
|
|||
["Embed, Encode, Attend, Predict"](https://explosion.ai/blog/deep-learning-formula-nlp)
|
||||
blog post for background.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `embed` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed) |
|
||||
| `encode` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Floats2d]`. **Output:** `List[Floats2d]`. Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `embed` | Embed tokens into context-independent word vector representations. For example, [CharacterEmbed](/api/architectures#CharacterEmbed) or [MultiHashEmbed](/api/architectures#MultiHashEmbed). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `encode` | Encode context into the embeddings, using an architecture such as a CNN, BiLSTM or transformer. For example, [MaxoutWindowEncoder](/api/architectures#MaxoutWindowEncoder). ~~Model[List[Floats2d], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.Tok2VecListener.v1 {#Tok2VecListener}
|
||||
|
||||
|
@ -108,10 +108,10 @@ Instead of defining its own `Tok2Vec` instance, a model architecture like
|
|||
[Tagger](/api/architectures#tagger) can define a listener as its `tok2vec`
|
||||
argument that connects to the shared `tok2vec` component in the pipeline.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. |
|
||||
| `upstream` | str | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. |
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the vectors produced by the "upstream" [`Tok2Vec`](/api/tok2vec) component. ~~int~~ |
|
||||
| `upstream` | A string to identify the "upstream" `Tok2Vec` component to communicate with. The upstream name should either be the wildcard string `"*"`, or the name of the `Tok2Vec` component. You'll almost never have multiple upstream `Tok2Vec` components, so the wildcard string will almost always be fine. ~~str~~ |
|
||||
|
||||
### spacy.MultiHashEmbed.v1 {#MultiHashEmbed}
|
||||
|
||||
|
@ -134,12 +134,12 @@ definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
|
|||
pretrained static vectors can also be incorporated into the concatenated
|
||||
representation.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------- | ---- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | int | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. |
|
||||
| `rows` | int | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. |
|
||||
| `also_embed_subwords` | bool | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. |
|
||||
| `also_use_static_vectors` | bool | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. |
|
||||
| Name | Description |
|
||||
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The output width. Also used as the width of the embedding tables. Recommended values are between `64` and `300`. ~~int~~ |
|
||||
| `rows` | The number of rows for the embedding tables. Can be low, due to the hashing trick. Embeddings for prefix, suffix and word shape use half as many rows. Recommended values are between `2000` and `10000`. ~~int~~ |
|
||||
| `also_embed_subwords` | Whether to use the `PREFIX`, `SUFFIX` and `SHAPE` features in the embeddings. If not using these, you may need more rows in your hash embeddings, as there will be increased chance of collisions. ~~bool~~ |
|
||||
| `also_use_static_vectors` | Whether to also use static word vectors. Requires a vectors table to be loaded in the [Doc](/api/doc) objects' vocab. ~~bool~~ |
|
||||
|
||||
### spacy.CharacterEmbed.v1 {#CharacterEmbed}
|
||||
|
||||
|
@ -170,12 +170,12 @@ concatenated. A hash-embedded vector of the `NORM` of the word is also
|
|||
concatenated on, and the result is then passed through a feed-forward network to
|
||||
construct a single vector to represent the information.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | int | The width of the output vector and the `NORM` hash embedding. |
|
||||
| `rows` | int | The number of rows in the `NORM` hash embedding table. |
|
||||
| `nM` | int | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. |
|
||||
| `nC` | int | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. |
|
||||
| Name | Description |
|
||||
| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The width of the output vector and the `NORM` hash embedding. ~~int~~ |
|
||||
| `rows` | The number of rows in the `NORM` hash embedding table. ~~int~~ |
|
||||
| `nM` | The dimensionality of the character embeddings. Recommended values are between `16` and `64`. ~~int~~ |
|
||||
| `nC` | The number of UTF-8 bytes to embed per word. Recommended values are between `3` and `8`, although it may depend on the length of words in the language. ~~int~~ |
|
||||
|
||||
### spacy.MaxoutWindowEncoder.v1 {#MaxoutWindowEncoder}
|
||||
|
||||
|
@ -193,12 +193,12 @@ construct a single vector to represent the information.
|
|||
Encode context using convolutions with maxout activation, layer normalization
|
||||
and residual connections.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
|
||||
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
|
||||
| `maxout_pieces` | int | The number of maxout pieces to use. Recommended values are `2` or `3`. |
|
||||
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
|
||||
| Name | Description |
|
||||
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
|
||||
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
||||
| `maxout_pieces` | The number of maxout pieces to use. Recommended values are `2` or `3`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
||||
|
||||
### spacy.MishWindowEncoder.v1 {#MishWindowEncoder}
|
||||
|
||||
|
@ -216,11 +216,11 @@ Encode context using convolutions with
|
|||
[`Mish`](https://thinc.ai/docs/api-layers#mish) activation, layer normalization
|
||||
and residual connections.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
|
||||
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
|
||||
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
|
||||
| Name | Description |
|
||||
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
|
||||
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
||||
|
||||
### spacy.TorchBiLSTMEncoder.v1 {#TorchBiLSTMEncoder}
|
||||
|
||||
|
@ -237,11 +237,11 @@ and residual connections.
|
|||
Encode context using bidirectional LSTM layers. Requires
|
||||
[PyTorch](https://pytorch.org).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `width` | int | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. |
|
||||
| `window_size` | int | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. |
|
||||
| `depth` | int | The number of convolutional layers. Recommended value is `4`. |
|
||||
| Name | Description |
|
||||
| ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `width` | The input and output width. These are required to be the same, to allow residual connections. This value will be determined by the width of the inputs. Recommended values are between `64` and `300`. ~~int~~ |
|
||||
| `window_size` | The number of words to concatenate around each token to construct the convolution. Recommended value is `1`. ~~int~~ |
|
||||
| `depth` | The number of convolutional layers. Recommended value is `4`. ~~int~~ |
|
||||
|
||||
## Transformer architectures {#transformers source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/architectures.py"}
|
||||
|
||||
|
@ -268,11 +268,11 @@ architectures into your training config.
|
|||
|
||||
<!-- TODO: description -->
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). |
|
||||
| `get_spans` | `Callable` | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
|
||||
| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
|
||||
| Name | Description |
|
||||
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Any model name that can be loaded by [`transformers.AutoModel`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoModel). ~~str~~ |
|
||||
| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to be processed by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
|
||||
| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
|
||||
|
||||
### spacy-transformers.Tok2VecListener.v1 {#transformers-Tok2VecListener}
|
||||
|
||||
|
@ -297,10 +297,10 @@ operate over wordpieces, which usually don't align one-to-one against spaCy
|
|||
tokens. The layer therefore requires a reduction operation in order to calculate
|
||||
a single token vector given zero or more wordpiece vectors.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
|
||||
| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
|
||||
| Name | Description |
|
||||
| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
|
||||
| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
|
||||
|
||||
### spacy-transformers.Tok2VecTransformer.v1 {#Tok2VecTransformer}
|
||||
|
||||
|
@ -320,12 +320,12 @@ Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
|
|||
object, but it's a **simpler solution** if you only need the transformer within
|
||||
one component.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_spans` | callable | Function that takes a batch of [`Doc`](/api/doc) object and returns lists of [`Span`](/api) objects to process by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. |
|
||||
| `tokenizer_config` | `Dict[str, Any]` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). |
|
||||
| `pooling` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** [`Ragged`](https://thinc.ai/docs/api-types#ragged). **Output:** [`Floats2d`](https://thinc.ai/docs/api-types#types) | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. |
|
||||
| `grad_factor` | float | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. |
|
||||
| Name | Description |
|
||||
| ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_spans` | Function that takes a batch of [`Doc`](/api/doc) objects and returns lists of [`Span`](/api/span) objects to be processed by the transformer. [See here](/api/transformer#span_getters) for built-in options and examples. ~~Callable[[List[Doc]], List[Span]]~~ |
|
||||
| `tokenizer_config` | Tokenizer settings passed to [`transformers.AutoTokenizer`](https://huggingface.co/transformers/model_doc/auto.html#transformers.AutoTokenizer). ~~Dict[str, Any]~~ |
|
||||
| `pooling` | A reduction layer used to calculate the token vectors based on zero or more wordpiece vectors. If in doubt, mean pooling (see [`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean)) is usually a good choice. ~~Model[Ragged, Floats2d]~~ |
|
||||
| `grad_factor` | Reweight gradients from the component before passing them upstream. You can set this to `0` to "freeze" the transformer weights with respect to the component, or use it to make some components more significant than others. Leaving it at `1.0` is usually fine. ~~float~~ |
|
||||
|
||||
## Parser & NER architectures {#parser}
|
||||
|
||||
|
@ -368,14 +368,14 @@ consists of either two or three subnetworks:
|
|||
state representation. If not present, the output from the lower model is used
|
||||
as action scores directly.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
| `nr_feature_tokens` | int | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. |
|
||||
| `hidden_width` | int | The width of the hidden layer. |
|
||||
| `maxout_pieces` | int | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. |
|
||||
| `use_upper` | bool | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. |
|
||||
| `nO` | int | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. |
|
||||
| Name | Description |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `nr_feature_tokens` | The number of tokens in the context to use to construct the state vector. Valid choices are `1`, `2`, `3`, `6`, `8` and `13`. The `2`, `8` and `13` feature sets are designed for the parser, while the `3` and `6` feature sets are designed for the entity recognizer. The recommended feature sets are `3` for NER, and `8` for the dependency parser. ~~int~~ |
|
||||
| `hidden_width` | The width of the hidden layer. ~~int~~ |
|
||||
| `maxout_pieces` | How many pieces to use in the state prediction layer. Recommended values are `1`, `2` or `3`. If `1`, the maxout non-linearity is replaced with a [`Relu`](https://thinc.ai/docs/api-layers#relu) non-linearity if `use_upper` is `True`, and no non-linearity if `False`. ~~int~~ |
|
||||
| `use_upper` | Whether to use an additional hidden layer after the state vector in order to predict the action scores. It is recommended to set this to `False` for large pretrained models such as transformers, and `True` for smaller networks. The upper layer is computed on CPU, which becomes a bottleneck on larger GPU-based models, where it's also less necessary. ~~bool~~ |
|
||||
| `nO` | The number of actions the model will predict between. Usually inferred from data at the beginning of training, or loaded from disk. ~~int~~ |
|
||||
|
||||
### spacy.BILUOTagger.v1 {#BILUOTagger source="spacy/ml/models/simple_ner.py"}
|
||||
|
||||
|
@ -402,9 +402,9 @@ generally results in better linear separation between classes, especially for
|
|||
non-CRF models, because there are more distinct classes for the different
|
||||
situations ([Ratinov et al., 2009](https://www.aclweb.org/anthology/W09-1119/)).
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------ |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
### spacy.IOBTagger.v1 {#IOBTagger source="spacy/ml/models/simple_ner.py"}
|
||||
|
||||
|
@ -427,9 +427,9 @@ spans into tags assigned to each token. The first token of a span is given the
|
|||
tag B-LABEL, and subsequent tokens are given the tag I-LABEL. All other tokens
|
||||
are assigned the tag O.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------ |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
## Tagging architectures {#tagger source="spacy/ml/models/tagger.py"}
|
||||
|
||||
|
@ -450,10 +450,10 @@ Build a tagger model, using a provided token-to-vector component. The tagger
|
|||
model simply adds a linear layer with softmax activation to predict scores given
|
||||
the token vectors.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. Subnetwork to map tokens into vector representations. |
|
||||
| `nO` | int | The number of tags to output. Inferred from the data if `None`. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------ |
|
||||
| `tok2vec` | Subnetwork to map tokens into vector representations. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `nO` | The number of tags to output. Inferred from the data if `None`. ~~Optional[int]~~ |
|
||||
|
||||
## Text classification architectures {#textcat source="spacy/ml/models/textcat.py"}
|
||||
|
||||
|
@ -489,18 +489,17 @@ network has an internal CNN Tok2Vec layer and uses attention.
|
|||
> nO = null
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------- | ----- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
|
||||
| `pretrained_vectors` | bool | Whether or not pretrained vectors will be used in addition to the feature vectors. |
|
||||
| `width` | int | Output dimension of the feature encoding step. |
|
||||
| `embed_size` | int | Input dimension of the feature encoding step. |
|
||||
| `conv_depth` | int | Depth of the Tok2Vec layer. |
|
||||
| `window_size` | int | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. |
|
||||
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
|
||||
| `dropout` | float | The dropout rate. |
|
||||
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when |
|
||||
| `begin_training` is called. |
|
||||
| Name | Description |
|
||||
| -------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||
| `pretrained_vectors` | Whether or not pretrained vectors will be used in addition to the feature vectors. ~~bool~~ |
|
||||
| `width` | Output dimension of the feature encoding step. ~~int~~ |
|
||||
| `embed_size` | Input dimension of the feature encoding step. ~~int~~ |
|
||||
| `conv_depth` | Depth of the tok2vec layer. ~~int~~ |
|
||||
| `window_size` | The number of contextual vectors to [concatenate](https://thinc.ai/docs/api-layers#expand_window) from the left and from the right. ~~int~~ |
|
||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
||||
| `dropout` | The dropout rate. ~~float~~ |
|
||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
|
||||
|
||||
### spacy.TextCatCNN.v1 {#TextCatCNN}
|
||||
|
||||
|
@ -527,11 +526,11 @@ A neural network model where token vectors are calculated using a CNN. The
|
|||
vectors are mean pooled and used as features in a feed-forward network. This
|
||||
architecture is usually less accurate than the ensemble, but runs faster.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
|
||||
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
|
||||
| Name | Description |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
|
||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
|
||||
|
||||
### spacy.TextCatBOW.v1 {#TextCatBOW}
|
||||
|
||||
|
@ -549,18 +548,18 @@ architecture is usually less accurate than the ensemble, but runs faster.
|
|||
An ngram "bag-of-words" model. This architecture should run much faster than the
|
||||
others, but may not be as accurate, especially if texts are short.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `exclusive_classes` | bool | Whether or not categories are mutually exclusive. |
|
||||
| `ngram_size` | int | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. |
|
||||
| `no_output_layer` | float | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes=True`, else `Logistic`. |
|
||||
| `nO` | int | Output dimension, determined by the number of different labels. If not set, the the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. |
|
||||
| Name | Description |
|
||||
| ------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, bigram and trigram features. ~~int~~ |
|
||||
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `begin_training` is called. ~~Optional[int]~~ |
|
||||
|
||||
## Entity linking architectures {#entitylinker source="spacy/ml/models/entity_linker.py"}
|
||||
|
||||
An [`EntityLinker`](/api/entitylinker) component disambiguates textual mentions
|
||||
(tagged as named entities) to unique identifiers, grounding the named entities
|
||||
into the "real world". This requires 3 main components:
|
||||
into the "real world". This requires 3 main components:
|
||||
|
||||
- A [`KnowledgeBase`](/api/kb) (KB) holding the unique identifiers, potential
|
||||
synonyms and prior probabilities.
|
||||
|
@ -571,8 +570,8 @@ into the "real world". This requires 3 main components:
|
|||
|
||||
### spacy.EntityLinker.v1 {#EntityLinker}
|
||||
|
||||
The `EntityLinker` model architecture is a `Thinc` `Model` with a Linear output
|
||||
layer.
|
||||
The `EntityLinker` model architecture is a Thinc `Model` with a
|
||||
[`Linear`](https://thinc.ai/docs/api-layers#linear) output layer.
|
||||
|
||||
> #### Example Config
|
||||
>
|
||||
|
@ -599,27 +598,24 @@ layer.
|
|||
> @assets = "spacy.CandidateGenerator.v1"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------------------------------------ | ---------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | [`Model`](https://thinc.ai/docs/api-model) | The [`tok2vec`](#tok2vec) layer of the model. |
|
||||
| `nO` | int | Output dimension, determined by the length of the vectors encoding each entity in the KB |
|
||||
|
||||
If the `nO` dimension is not set, the Entity Linking component will set it when
|
||||
`begin_training` is called.
|
||||
| Name | Description |
|
||||
| --------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tok2vec` | The [`tok2vec`](#tok2vec) layer of the model. ~~Model~~ |
|
||||
| `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `begin_training` is called. ~~Optional[int]~~ |
|
||||
|
||||
### spacy.EmptyKB.v1 {#EmptyKB}
|
||||
|
||||
A function that creates a default, empty `KnowledgeBase` from a
|
||||
[`Vocab`](/api/vocab) instance.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------------- | ---- | ------------------------------------------------------------------------- |
|
||||
| `entity_vector_length` | int | The length of the vectors encoding each entity in the KB - 64 by default. |
|
||||
| Name | Description |
|
||||
| ---------------------- | ----------------------------------------------------------------------------------- |
|
||||
| `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ |
|
||||
|
||||
### spacy.CandidateGenerator.v1 {#CandidateGenerator}
|
||||
|
||||
A function that takes as input a [`KnowledgeBase`](/api/kb) and a
|
||||
[`Span`](/api/span) object denoting a named entity, and returns a list of
|
||||
plausible [`Candidate` objects](/api/kb/#candidate_init). The default
|
||||
plausible [`Candidate`](/api/kb/#candidate) objects. The default
|
||||
`CandidateGenerator` simply uses the text of a mention to find its potential
|
||||
aliases in the `KnowledgeBase`. Note that this function is case-dependent.
|
||||
|
|
|
@ -31,10 +31,10 @@ how the component should be configured. You can override its settings via the
|
|||
> nlp.add_pipe("attribute_ruler", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| --------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `pattern_dicts` | `Iterable[dict]` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](#add) (`patterns`/`attrs`/`index`) to add as patterns. | `None` |
|
||||
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). | `False` |
|
||||
| Setting | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `pattern_dicts` | A list of pattern dicts with the keys as the arguments to [`AttributeRuler.add`](/api/attributeruler#add) (`patterns`/`attrs`/`index`) to add as patterns. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
||||
| `validate` | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. ~~bool~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/attributeruler.py
|
||||
|
@ -47,10 +47,10 @@ be a list of dictionaries with `"patterns"`, `"attrs"`, and optional `"index"`
|
|||
keys, e.g.:
|
||||
|
||||
```python
|
||||
pattern_dicts = \[
|
||||
{"patterns": \[\[{"TAG": "VB"}\]\], "attrs": {"POS": "VERB"}},
|
||||
{"patterns": \[\[{"LOWER": "an"}\]\], "attrs": {"LEMMA": "a"}},
|
||||
\]
|
||||
pattern_dicts = [
|
||||
{"patterns": [[{"TAG": "VB"}]], "attrs": {"POS": "VERB"}},
|
||||
{"patterns": [[{"LOWER": "an"}]], "attrs": {"LEMMA": "a"}},
|
||||
]
|
||||
```
|
||||
|
||||
> #### Example
|
||||
|
@ -60,23 +60,23 @@ pattern_dicts = \[
|
|||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ----------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
|
||||
| `name` | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. |
|
||||
| _keyword-only_ | | |
|
||||
| `pattern_dicts` | `Iterable[Dict]]` | Optional patterns to load in on initialization. Defaults to `None`. |
|
||||
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher`). Defaults to `False`. |
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary to pass to the matcher. ~~Vocab~~ |
|
||||
| `name` | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pattern_dicts` | Optional patterns to load in on initialization. Defaults to `None`. ~~Optional[Iterable[Dict[str, Union[List[dict], dict, int]]]]~~ |
|
||||
| `validate` | Whether patterns should be validated (passed to the [`Matcher`](/api/matcher#init)). Defaults to `False`. ~~bool~~ |
|
||||
|
||||
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the attribute ruler to a Doc, setting token attributes for tokens matched
|
||||
by the provided patterns.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with added entities, if available. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## AttributeRuler.add {#add tag="method"}
|
||||
|
||||
|
@ -95,11 +95,11 @@ may be negative to index from the end of the span.
|
|||
> attribute_ruler.add(patterns=patterns, attrs=attrs)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---------------------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| patterns | `Iterable[List[Dict]]` | A list of Matcher patterns. |
|
||||
| attrs | dict | The attributes to assign to the target token in the matched span. |
|
||||
| index | int | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to 0. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `patterns` | The `Matcher` patterns to add. ~~Iterable[List[Dict[Union[int, str], Any]]]~~ |
|
||||
| `attrs` | The attributes to assign to the target token in the matched span. ~~Dict[str, Any]~~ |
|
||||
| `index` | The index of the token in the matched span to modify. May be negative to index from the end of the span. Defaults to `0`. ~~int~~ |
|
||||
|
||||
## AttributeRuler.add_patterns {#add_patterns tag="method"}
|
||||
|
||||
|
@ -107,52 +107,52 @@ may be negative to index from the end of the span.
|
|||
>
|
||||
> ```python
|
||||
> attribute_ruler = nlp.add_pipe("attribute_ruler")
|
||||
> pattern_dicts = \[
|
||||
> pattern_dicts = [
|
||||
> {
|
||||
> "patterns": \[\[{"TAG": "VB"}\]\],
|
||||
> "patterns": [[{"TAG": "VB"}]],
|
||||
> "attrs": {"POS": "VERB"}
|
||||
> },
|
||||
> {
|
||||
> "patterns": \[\[{"LOWER": "two"}, {"LOWER": "apples"}\]\],
|
||||
> "patterns": [[{"LOWER": "two"}, {"LOWER": "apples"}]],
|
||||
> "attrs": {"LEMMA": "apple"},
|
||||
> "index": -1
|
||||
> },
|
||||
> \]
|
||||
> ]
|
||||
> attribute_ruler.add_patterns(pattern_dicts)
|
||||
> ```
|
||||
|
||||
Add patterns from a list of pattern dicts with the keys as the arguments to
|
||||
[`AttributeRuler.add`](#add).
|
||||
[`AttributeRuler.add`](/api/attributeruler#add).
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ----------------- | -------------------- |
|
||||
| `pattern_dicts` | `Iterable[Dict]]` | The patterns to add. |
|
||||
| Name | Description |
|
||||
| --------------- | -------------------------------------------------------------------------- |
|
||||
| `pattern_dicts` | The patterns to add. ~~Iterable[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||
|
||||
## AttributeRuler.patterns {#patterns tag="property"}
|
||||
|
||||
Get all patterns that have been added to the attribute ruler in the
|
||||
`pattern_dicts` format accepted by
|
||||
[`AttributeRuler.add_patterns`](#add_patterns).
|
||||
[`AttributeRuler.add_patterns`](/api/attributeruler#add_patterns).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------ |
|
||||
| **RETURNS** | `List[dict]` | The patterns added to the attribute ruler. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | The patterns added to the attribute ruler. ~~List[Dict[str, Union[List[dict], dict, int]]]~~ |
|
||||
|
||||
## AttributeRuler.load_from_tag_map {#load_from_tag_map tag="method"}
|
||||
|
||||
Load attribute ruler patterns from a tag map.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | ------------------------------------------------------------------------------------------ |
|
||||
| `tag_map` | dict | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `tag_map` | The tag map that maps fine-grained tags to coarse-grained tags and morphological features. ~~Dict[str, Dict[Union[int, str], Union[int, str]]]~~ |
|
||||
|
||||
## AttributeRuler.load_from_morph_rules {#load_from_morph_rules tag="method"}
|
||||
|
||||
Load attribute ruler patterns from morph rules.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `morph_rules` | dict | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. |
|
||||
| Name | Description |
|
||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `morph_rules` | The morph rules that map token text and fine-grained tags to coarse-grained tags, lemmas and morphological features. ~~Dict[str, Dict[str, Dict[Union[int, str], Union[int, str]]]]~~ |
|
||||
|
||||
## AttributeRuler.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -165,11 +165,11 @@ Serialize the pipe to disk.
|
|||
> attribute_ruler.to_disk("/path/to/attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## AttributeRuler.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -182,12 +182,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> attribute_ruler.from_disk("/path/to/attribute_ruler")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `AttributeRuler` | The modified `AttributeRuler` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `AttributeRuler` object. ~~AttributeRuler~~ |
|
||||
|
||||
## AttributeRuler.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -200,11 +200,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `AttributeRuler` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `AttributeRuler` object. ~~bytes~~ |
|
||||
|
||||
## AttributeRuler.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -218,12 +218,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> attribute_ruler.from_bytes(attribute_ruler_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `AttributeRuler` | The `AttributeRuler` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `AttributeRuler` object. ~~AttributeRuler~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -598,9 +598,9 @@ $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P
|
|||
|
||||
| Argument | Type | Description |
|
||||
| ----------------------- | ---------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | |
|
||||
| `component` | positional | Name of the pipeline component of which the model should be analyzed. | |
|
||||
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. | |
|
||||
| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. |
|
||||
| `component` | positional | Name of the pipeline component of which the model should be analyzed. |
|
||||
| `--layers`, `-l` | option | Comma-separated names of layer IDs to print. |
|
||||
| `--dimensions`, `-DIM` | option | Show dimensions of each layer. |
|
||||
| `--parameters`, `-PAR` | option | Show parameters of each layer. |
|
||||
| `--gradients`, `-GRAD` | option | Show gradients of each layer. |
|
||||
|
|
|
@ -34,12 +34,12 @@ streaming.
|
|||
> limit = 0
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | `Path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). |
|
||||
| `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. |
|
||||
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. |
|
||||
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. |
|
||||
| Name | Description |
|
||||
| --------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | The directory or filename to read from. Expects data in spaCy's binary [`.spacy` format](/api/data-formats#binary-training). ~~Path~~ |
|
||||
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. See [`Corpus`](/api/corpus#init) for details. ~~bool~~ |
|
||||
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/gold/corpus.py
|
||||
|
@ -67,13 +67,13 @@ train/test skew.
|
|||
> corpus = Corpus("./data", limit=10)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | The directory or filename to read from. |
|
||||
| _keyword-only_ | | |
|
||||
| `gold_preproc` | bool | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. |
|
||||
| `max_length` | int | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. |
|
||||
| `limit` | int | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. |
|
||||
| Name | Description |
|
||||
| --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | The directory or filename to read from. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
|
||||
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||
|
||||
## Corpus.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -90,7 +90,7 @@ Yield examples from the data.
|
|||
> train_data = corpus(nlp)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---------- | ------------------------- |
|
||||
| `nlp` | `Language` | The current `nlp` object. |
|
||||
| **YIELDS** | `Example` | The examples. |
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------- |
|
||||
| `nlp` | The current `nlp` object. ~~Language~~ |
|
||||
| **YIELDS** | The examples. ~~Example~~ |
|
||||
|
|
|
@ -23,13 +23,13 @@ accessed from Python. For the Python documentation, see [`Doc`](/api/doc).
|
|||
|
||||
### Attributes {#doc_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------ | ----------------------------------------------------------------------------------------- |
|
||||
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Doc` object is garbage collected. |
|
||||
| `vocab` | `Vocab` | A reference to the shared `Vocab` object. |
|
||||
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. |
|
||||
| `length` | `int` | The number of tokens in the document. |
|
||||
| `max_length` | `int` | The underlying size of the `Doc.c` array. |
|
||||
| Name | Description |
|
||||
| ------------ | -------------------------------------------------------------------------------------------------------- |
|
||||
| `mem` | A memory pool. Allocated memory will be freed once the `Doc` object is garbage collected. ~~cymem.Pool~~ |
|
||||
| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
|
||||
| `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ |
|
||||
| `length` | The number of tokens in the document. ~~int~~ |
|
||||
| `max_length` | The underlying size of the `Doc.c` array. ~~int~~ |
|
||||
|
||||
### Doc.push_back {#doc_push_back tag="method"}
|
||||
|
||||
|
@ -50,10 +50,10 @@ Append a token to the `Doc`. The token can be provided as a
|
|||
> assert doc.text == "hello "
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | --------------- | ----------------------------------------- |
|
||||
| `lex_or_tok` | `LexemeOrToken` | The word to append to the `Doc`. |
|
||||
| `has_space` | `bint` | Whether the word has trailing whitespace. |
|
||||
| Name | Description |
|
||||
| ------------ | -------------------------------------------------- |
|
||||
| `lex_or_tok` | The word to append to the `Doc`. ~~LexemeOrToken~~ |
|
||||
| `has_space` | Whether the word has trailing whitespace. ~~bint~~ |
|
||||
|
||||
## Token {#token tag="cdef class" source="spacy/tokens/token.pxd"}
|
||||
|
||||
|
@ -70,12 +70,12 @@ accessed from Python. For the Python documentation, see [`Token`](/api/token).
|
|||
|
||||
### Attributes {#token_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | --------- | ------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | A reference to the shared `Vocab` object. |
|
||||
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. |
|
||||
| `i` | `int` | The offset of the token within the document. |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------------------- |
|
||||
| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
|
||||
| `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ |
|
||||
| `i` | The offset of the token within the document. ~~int~~ |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
|
||||
### Token.cinit {#token_cinit tag="method"}
|
||||
|
||||
|
@ -87,12 +87,12 @@ Create a `Token` object from a `TokenC*` pointer.
|
|||
> token = Token.cinit(&doc.c[3], doc, 3)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------- | ------------------------------------------------------------ |
|
||||
| `vocab` | `Vocab` | A reference to the shared `Vocab`. |
|
||||
| `c` | `TokenC*` | A pointer to a [`TokenC`](/api/cython-structs#tokenc)struct. |
|
||||
| `offset` | `int` | The offset of the token within the document. |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------------------------------- |
|
||||
| `vocab` | A reference to the shared `Vocab`. ~~Vocab~~ |
|
||||
| `c` | A pointer to a [`TokenC`](/api/cython-structs#tokenc) struct. ~~TokenC\*~~ |
|
||||
| `offset` | The offset of the token within the document. ~~int~~ |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
|
||||
## Span {#span tag="cdef class" source="spacy/tokens/span.pxd"}
|
||||
|
||||
|
@ -107,14 +107,14 @@ accessed from Python. For the Python documentation, see [`Span`](/api/span).
|
|||
|
||||
### Attributes {#span_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | -------------------------------------- | ------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| `start` | `int` | The index of the first token of the span. |
|
||||
| `end` | `int` | The index of the first token after the span. |
|
||||
| `start_char` | `int` | The index of the first character of the span. |
|
||||
| `end_char` | `int` | The index of the last character of the span. |
|
||||
| `label` | <Abbr title="uint64_t">`attr_t`</Abbr> | A label to attach to the span, e.g. for named entities. |
|
||||
| Name | Description |
|
||||
| ------------ | ----------------------------------------------------------------------------- |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
| `start` | The index of the first token of the span. ~~int~~ |
|
||||
| `end` | The index of the first token after the span. ~~int~~ |
|
||||
| `start_char` | The index of the first character of the span. ~~int~~ |
|
||||
| `end_char` | The index of the last character of the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
## Lexeme {#lexeme tag="cdef class" source="spacy/lexeme.pxd"}
|
||||
|
||||
|
@ -129,11 +129,11 @@ accessed from Python. For the Python documentation, see [`Lexeme`](/api/lexeme).
|
|||
|
||||
### Attributes {#lexeme_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | -------------------------------------- | --------------------------------------------------------------- |
|
||||
| `c` | `LexemeC*` | A pointer to a [`LexemeC`](/api/cython-structs#lexemec) struct. |
|
||||
| `vocab` | `Vocab` | A reference to the shared `Vocab` object. |
|
||||
| `orth` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the verbatim text content. |
|
||||
| Name | Description |
|
||||
| ------- | ----------------------------------------------------------------------------- |
|
||||
| `c` | A pointer to a [`LexemeC`](/api/cython-structs#lexemec) struct. ~~LexemeC\*~~ |
|
||||
| `vocab` | A reference to the shared `Vocab` object. ~~Vocab~~ |
|
||||
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
## Vocab {#vocab tag="cdef class" source="spacy/vocab.pxd"}
|
||||
|
||||
|
@ -149,11 +149,11 @@ accessed from Python. For the Python documentation, see [`Vocab`](/api/vocab).
|
|||
|
||||
### Attributes {#vocab_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
|
||||
| `strings` | `StringStore` | A `StringStore` that maps string to hash values and vice versa. |
|
||||
| `length` | `int` | The number of entries in the vocabulary. |
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
|
||||
| `strings` | A `StringStore` that maps strings to hash values and vice versa. ~~StringStore~~ |
|
||||
| `length` | The number of entries in the vocabulary. ~~int~~ |
|
||||
|
||||
### Vocab.get {#vocab_get tag="method"}
|
||||
|
||||
|
@ -166,11 +166,11 @@ vocabulary.
|
|||
> lexeme = vocab.get(vocab.mem, "hello")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
|
||||
| `string` | str | The string of the word to look up. |
|
||||
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
|
||||
| `string` | The string of the word to look up. ~~str~~ |
|
||||
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
|
||||
|
||||
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
|
||||
|
||||
|
@ -183,11 +183,11 @@ vocabulary.
|
|||
> lexeme = vocab.get_by_orth(doc[0].lex.norm)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
|
||||
| `orth` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the verbatim text content. |
|
||||
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `mem` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. ~~cymem.Pool~~ |
|
||||
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
|
||||
| **RETURNS** | The lexeme in the vocabulary. ~~const LexemeC\*~~ |
|
||||
|
||||
## StringStore {#stringstore tag="cdef class" source="spacy/strings.pxd"}
|
||||
|
||||
|
@ -203,7 +203,7 @@ accessed from Python. For the Python documentation, see
|
|||
|
||||
### Attributes {#stringstore_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
|
||||
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the`StringStore` object is garbage collected. |
|
||||
| `keys` | <Abbr title="vector[uint64_t]">`vector[hash_t]`</Abbr> | A list of hash values in the `StringStore`. |
|
||||
| Name | Description |
|
||||
| ------ | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| `mem` | A memory pool. Allocated memory will be freed once the `StringStore` object is garbage collected. ~~cymem.Pool~~ |
|
||||
| `keys` | A list of hash values in the `StringStore`. ~~vector[hash_t] \(vector[uint64_t])~~ |
|
||||
|
|
|
@ -18,26 +18,26 @@ Cython data container for the `Token` object.
|
|||
> token_ptr = &doc.c[3]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lex` | `const LexemeC*` | A pointer to the lexeme for the token. |
|
||||
| `morph` | `uint64_t` | An ID allowing lookup of morphological attributes. |
|
||||
| `pos` | `univ_pos_t` | Coarse-grained part-of-speech tag. |
|
||||
| `spacy` | `bint` | A binary value indicating whether the token has trailing whitespace. |
|
||||
| `tag` | <Abbr title="uint64_t">`attr_t`</Abbr> | Fine-grained part-of-speech tag. |
|
||||
| `idx` | `int` | The character offset of the token within the parent document. |
|
||||
| `lemma` | <Abbr title="uint64_t">`attr_t`</Abbr> | Base form of the token, with no inflectional suffixes. |
|
||||
| `sense` | <Abbr title="uint64_t">`attr_t`</Abbr> | Space for storing a word sense ID, currently unused. |
|
||||
| `head` | `int` | Offset of the syntactic parent relative to the token. |
|
||||
| `dep` | <Abbr title="uint64_t">`attr_t`</Abbr> | Syntactic dependency relation. |
|
||||
| `l_kids` | `uint32_t` | Number of left children. |
|
||||
| `r_kids` | `uint32_t` | Number of right children. |
|
||||
| `l_edge` | `uint32_t` | Offset of the leftmost token of this token's syntactic descendants. |
|
||||
| `r_edge` | `uint32_t` | Offset of the rightmost token of this token's syntactic descendants. |
|
||||
| `sent_start` | `int` | Ternary value indicating whether the token is the first word of a sentence. `0` indicates a missing value, `-1` indicates `False` and `1` indicates `True`. The default value, 0, is interpreted as no sentence break. Sentence boundary detectors will usually set 0 for all tokens except tokens that follow a sentence boundary. |
|
||||
| `ent_iob` | `int` | IOB code of named entity tag. `0` indicates a missing value, `1` indicates `I`, `2` indicates `0` and `3` indicates `B`. |
|
||||
| `ent_type` | <Abbr title="uint64_t">`attr_t`</Abbr> | Named entity type. |
|
||||
| `ent_id` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
|
||||
| Name | Description |
|
||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lex` | A pointer to the lexeme for the token. ~~const LexemeC\*~~ |
|
||||
| `morph` | An ID allowing lookup of morphological attributes. ~~uint64_t~~ |
|
||||
| `pos` | Coarse-grained part-of-speech tag. ~~univ_pos_t~~ |
|
||||
| `spacy` | A binary value indicating whether the token has trailing whitespace. ~~bint~~ |
|
||||
| `tag` | Fine-grained part-of-speech tag. ~~attr_t (uint64_t)~~ |
|
||||
| `idx` | The character offset of the token within the parent document. ~~int~~ |
|
||||
| `lemma` | Base form of the token, with no inflectional suffixes. ~~attr_t (uint64_t)~~ |
|
||||
| `sense` | Space for storing a word sense ID, currently unused. ~~attr_t (uint64_t)~~ |
|
||||
| `head` | Offset of the syntactic parent relative to the token. ~~int~~ |
|
||||
| `dep` | Syntactic dependency relation. ~~attr_t (uint64_t)~~ |
|
||||
| `l_kids` | Number of left children. ~~uint32_t~~ |
|
||||
| `r_kids` | Number of right children. ~~uint32_t~~ |
|
||||
| `l_edge` | Offset of the leftmost token of this token's syntactic descendants. ~~uint32_t~~ |
|
||||
| `r_edge` | Offset of the rightmost token of this token's syntactic descendants. ~~uint32_t~~ |
|
||||
| `sent_start` | Ternary value indicating whether the token is the first word of a sentence. `0` indicates a missing value, `-1` indicates `False` and `1` indicates `True`. The default value, 0, is interpreted as no sentence break. Sentence boundary detectors will usually set 0 for all tokens except tokens that follow a sentence boundary. ~~int~~ |
|
||||
| `ent_iob` | IOB code of named entity tag. `0` indicates a missing value, `1` indicates `I`, `2` indicates `O` and `3` indicates `B`. ~~int~~ |
|
||||
| `ent_type` | Named entity type. ~~attr_t (uint64_t)~~ |
|
||||
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
### Token.get_struct_attr {#token_get_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"}
|
||||
|
||||
|
@ -52,11 +52,11 @@ Get the value of an attribute from the `TokenC` struct by attribute ID.
|
|||
> is_alpha = Token.get_struct_attr(&doc.c[3], IS_ALPHA)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- |
|
||||
| `token` | `const TokenC*` | A pointer to a `TokenC` struct. |
|
||||
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. |
|
||||
| **RETURNS** | <Abbr title="uint64_t">`attr_t`</Abbr> | The value of the attribute. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `token` | A pointer to a `TokenC` struct. ~~const TokenC\*~~ |
|
||||
| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
|
||||
| **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
### Token.set_struct_attr {#token_set_struct_attr tag="staticmethod, nogil" source="spacy/tokens/token.pxd"}
|
||||
|
||||
|
@ -72,11 +72,11 @@ Set the value of an attribute of the `TokenC` struct by attribute ID.
|
|||
> Token.set_struct_attr(token, TAG, 0)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- |
|
||||
| `token` | `const TokenC*` | A pointer to a `TokenC` struct. |
|
||||
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. |
|
||||
| `value` | <Abbr title="uint64_t">`attr_t`</Abbr> | The value to set. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `token` | A pointer to a `TokenC` struct. ~~const TokenC\*~~ |
|
||||
| `feat_name` | The ID of the attribute to set. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
|
||||
| `value` | The value to set. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
### token_by_start {#token_by_start tag="function" source="spacy/tokens/doc.pxd"}
|
||||
|
||||
|
@ -93,12 +93,12 @@ Find a token in a `TokenC*` array by the offset of its first character.
|
|||
> assert token_by_start(doc.c, doc.length, 4) == -1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | --------------- | --------------------------------------------------------- |
|
||||
| `tokens` | `const TokenC*` | A `TokenC*` array. |
|
||||
| `length` | `int` | The number of tokens in the array. |
|
||||
| `start_char` | `int` | The start index to search for. |
|
||||
| **RETURNS** | `int` | The index of the token in the array or `-1` if not found. |
|
||||
| Name | Description |
|
||||
| ------------ | ----------------------------------------------------------------- |
|
||||
| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
|
||||
| `length` | The number of tokens in the array. ~~int~~ |
|
||||
| `start_char` | The start index to search for. ~~int~~ |
|
||||
| **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ |
|
||||
|
||||
### token_by_end {#token_by_end tag="function" source="spacy/tokens/doc.pxd"}
|
||||
|
||||
|
@ -115,12 +115,12 @@ Find a token in a `TokenC*` array by the offset of its final character.
|
|||
> assert token_by_end(doc.c, doc.length, 1) == -1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | --------------------------------------------------------- |
|
||||
| `tokens` | `const TokenC*` | A `TokenC*` array. |
|
||||
| `length` | `int` | The number of tokens in the array. |
|
||||
| `end_char` | `int` | The end index to search for. |
|
||||
| **RETURNS** | `int` | The index of the token in the array or `-1` if not found. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------- |
|
||||
| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
|
||||
| `length` | The number of tokens in the array. ~~int~~ |
|
||||
| `end_char` | The end index to search for. ~~int~~ |
|
||||
| **RETURNS** | The index of the token in the array or `-1` if not found. ~~int~~ |
|
||||
|
||||
### set_children_from_heads {#set_children_from_heads tag="function" source="spacy/tokens/doc.pxd"}
|
||||
|
||||
|
@ -143,10 +143,10 @@ attribute, in order to make the parse tree navigation consistent.
|
|||
> assert doc.c[3].l_kids == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ---------------------------------- |
|
||||
| `tokens` | `const TokenC*` | A `TokenC*` array. |
|
||||
| `length` | `int` | The number of tokens in the array. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------ |
|
||||
| `tokens` | A `TokenC*` array. ~~const TokenC\*~~ |
|
||||
| `length` | The number of tokens in the array. ~~int~~ |
|
||||
|
||||
## LexemeC {#lexemec tag="C struct" source="spacy/structs.pxd"}
|
||||
|
||||
|
@ -160,17 +160,17 @@ struct.
|
|||
> lex = doc.c[3].lex
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `flags` | <Abbr title="uint64_t">`flags_t`</Abbr> | Bit-field for binary lexical flag values. |
|
||||
| `id` | <Abbr title="uint64_t">`attr_t`</Abbr> | Usually used to map lexemes to rows in a matrix, e.g. for word vectors. Does not need to be unique, so currently misnamed. |
|
||||
| `length` | <Abbr title="uint64_t">`attr_t`</Abbr> | Number of unicode characters in the lexeme. |
|
||||
| `orth` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the verbatim text content. |
|
||||
| `lower` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the lowercase form of the lexeme. |
|
||||
| `norm` | <Abbr title="uint64_t">`attr_t`</Abbr> | ID of the lexeme's norm, i.e. a normalized form of the text. |
|
||||
| `shape` | <Abbr title="uint64_t">`attr_t`</Abbr> | Transform of the lexeme's string, to show orthographic features. |
|
||||
| `prefix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the start of the lexeme. Defaults to `N=1`. |
|
||||
| `suffix` | <Abbr title="uint64_t">`attr_t`</Abbr> | Length-N substring from the end of the lexeme. Defaults to `N=3`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `flags` | Bit-field for binary lexical flag values. ~~flags_t (uint64_t)~~ |
|
||||
| `id` | Usually used to map lexemes to rows in a matrix, e.g. for word vectors. Does not need to be unique, so currently misnamed. ~~attr_t (uint64_t)~~ |
|
||||
| `length` | Number of unicode characters in the lexeme. ~~attr_t (uint64_t)~~ |
|
||||
| `orth` | ID of the verbatim text content. ~~attr_t (uint64_t)~~ |
|
||||
| `lower` | ID of the lowercase form of the lexeme. ~~attr_t (uint64_t)~~ |
|
||||
| `norm` | ID of the lexeme's norm, i.e. a normalized form of the text. ~~attr_t (uint64_t)~~ |
|
||||
| `shape` | Transform of the lexeme's string, to show orthographic features. ~~attr_t (uint64_t)~~ |
|
||||
| `prefix` | Length-N substring from the start of the lexeme. Defaults to `N=1`. ~~attr_t (uint64_t)~~ |
|
||||
| `suffix` | Length-N substring from the end of the lexeme. Defaults to `N=3`. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
### Lexeme.get_struct_attr {#lexeme_get_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
|
||||
|
||||
|
@ -186,11 +186,11 @@ Get the value of an attribute from the `LexemeC` struct by attribute ID.
|
|||
> is_alpha = Lexeme.get_struct_attr(lexeme, IS_ALPHA)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- |
|
||||
| `lex` | `const LexemeC*` | A pointer to a `LexemeC` struct. |
|
||||
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. |
|
||||
| **RETURNS** | <Abbr title="uint64_t">`attr_t`</Abbr> | The value of the attribute. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `lex` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
|
||||
| `feat_name` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
|
||||
| **RETURNS** | The value of the attribute. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
### Lexeme.set_struct_attr {#lexeme_set_struct_attr tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
|
||||
|
||||
|
@ -206,11 +206,11 @@ Set the value of an attribute of the `LexemeC` struct by attribute ID.
|
|||
> Lexeme.set_struct_attr(lexeme, NORM, lexeme.lower)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | -------------------------------------------------------------------------------------- |
|
||||
| `lex` | `const LexemeC*` | A pointer to a `LexemeC` struct. |
|
||||
| `feat_name` | `attr_id_t` | The ID of the attribute to look up. The attributes are enumerated in `spacy.typedefs`. |
|
||||
| `value` | <Abbr title="uint64_t">`attr_t`</Abbr> | The value to set. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `lex` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
|
||||
| `feat_name` | The ID of the attribute to set. The attributes are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
|
||||
| `value` | The value to set. ~~attr_t (uint64_t)~~ |
|
||||
|
||||
### Lexeme.c_check_flag {#lexeme_c_check_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
|
||||
|
||||
|
@ -226,11 +226,11 @@ Check the value of a binary flag attribute.
|
|||
> is_stop = Lexeme.c_check_flag(lexeme, IS_STOP)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------------------------------------------------------------- |
|
||||
| `lexeme` | `const LexemeC*` | A pointer to a `LexemeC` struct. |
|
||||
| `flag_id` | `attr_id_t` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. |
|
||||
| **RETURNS** | `bint` | The boolean value of the flag. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------- |
|
||||
| `lexeme` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
|
||||
| `flag_id` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
|
||||
| **RETURNS** | The boolean value of the flag. ~~bint~~ |
|
||||
|
||||
### Lexeme.c_set_flag {#lexeme_c_set_flag tag="staticmethod, nogil" source="spacy/lexeme.pxd"}
|
||||
|
||||
|
@ -246,8 +246,8 @@ Set the value of a binary flag attribute.
|
|||
> Lexeme.c_set_flag(lexeme, IS_STOP, 0)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | ------------------------------------------------------------------------------- |
|
||||
| `lexeme` | `const LexemeC*` | A pointer to a `LexemeC` struct. |
|
||||
| `flag_id` | `attr_id_t` | The ID of the flag to look up. The flag IDs are enumerated in `spacy.typedefs`. |
|
||||
| `value` | `bint` | The value to set. |
|
||||
| Name | Description |
|
||||
| --------- | --------------------------------------------------------------------------------------------- |
|
||||
| `lexeme` | A pointer to a `LexemeC` struct. ~~const LexemeC\*~~ |
|
||||
| `flag_id` | The ID of the flag to set. The flag IDs are enumerated in `spacy.typedefs`. ~~attr_id_t~~ |
|
||||
| `value` | The value to set. ~~bint~~ |
|
||||
|
|
|
@ -73,15 +73,15 @@ your config and check that it's valid, you can run the
|
|||
Defines the `nlp` object, its tokenizer and
|
||||
[processing pipeline](/usage/processing-pipelines) component names.
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------------- | ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------- |
|
||||
| `lang` | str | The language code to use. | `null` |
|
||||
| `pipeline` | `List[str]` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). | `[]` |
|
||||
| `load_vocab_data` | bool | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. | `true` |
|
||||
| `before_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. | `null` |
|
||||
| `after_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. | `null` |
|
||||
| `after_pipeline_creation` | callable | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. | `null` |
|
||||
| `tokenizer` | callable | The tokenizer to use. | [`Tokenizer`](/api/tokenizer) |
|
||||
| Name | Description | Default |
|
||||
| ------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------- |
|
||||
| `lang` | The language code to use. ~~str~~ | `null` |
|
||||
| `pipeline` | Names of pipeline components in order. Should correspond to sections in the `[components]` block, e.g. `[components.ner]`. See docs on [defining components](/usage/training#config-components). ~~List[str]~~ | `[]` |
|
||||
| `load_vocab_data` | Whether to load additional lexeme and vocab data from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) if available. ~~bool~~ | `true` |
|
||||
| `before_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `Language` subclass before it's initialized. ~~Optional[Callable[[Type[Language]], Type[Language]]]~~ | `null` |
|
||||
| `after_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object right after it's initialized. ~~Optional[Callable[[Language], Language]]~~ | `null` |
|
||||
| `after_pipeline_creation` | Optional [callback](/usage/training#custom-code-nlp-callbacks) to modify `nlp` object after the pipeline components have been added. ~~Optional[Callable[[Language], Language]]~~ | `null` |
|
||||
| `tokenizer` | The tokenizer to use. ~~Callable[[str], Doc]~~ | [`Tokenizer`](/api/tokenizer) |
|
||||
|
||||
### components {#config-components tag="section"}
|
||||
|
||||
|
@ -128,24 +128,24 @@ process that are used when you run [`spacy train`](/api/cli#train).
|
|||
|
||||
<!-- TODO: complete -->
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| --------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||
| `seed` | int | The random seed. | `${system:seed}` |
|
||||
| `dropout` | float | The dropout rate. | `0.1` |
|
||||
| `accumulate_gradient` | int | Whether to divide the batch up into substeps. | `1` |
|
||||
| `init_tok2vec` | str | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). | `${paths:init_tok2vec}` |
|
||||
| `raw_text` | str | | `${paths:raw}` |
|
||||
| `vectors` | str | | `null` |
|
||||
| `patience` | int | How many steps to continue without improvement in evaluation score. | `1600` |
|
||||
| `max_epochs` | int | Maximum number of epochs to train for. | `0` |
|
||||
| `max_steps` | int | Maximum number of update steps to train for. | `20000` |
|
||||
| `eval_frequency` | int | How often to evaluate during training (steps). | `200` |
|
||||
| `score_weights` | `Dict[str, float]` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. | `{}` |
|
||||
| `frozen_components` | `List[str]` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. | `[]` |
|
||||
| `train_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) |
|
||||
| `dev_corpus` | callable | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. | [`Corpus`](/api/corpus) |
|
||||
| `batcher` | callable | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. | [`batch_by_words`](/api/top-level#batch_by_words) |
|
||||
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
|
||||
| Name | Description | Default |
|
||||
| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||
| `seed` | The random seed. ~~int~~ | `${system:seed}` |
|
||||
| `dropout` | The dropout rate. ~~float~~ | `0.1` |
|
||||
| `accumulate_gradient` | Whether to divide the batch up into substeps. ~~int~~ | `1` |
|
||||
| `init_tok2vec` | Optional path to pretrained tok2vec weights created with [`spacy pretrain`](/api/cli#pretrain). ~~Optional[str]~~ | `${paths:init_tok2vec}` |
|
||||
| `raw_text` | ~~Optional[str]~~ | `${paths:raw}` |
|
||||
| `vectors` | ~~Optional[str]~~ | `null` |
|
||||
| `patience` | How many steps to continue without improvement in evaluation score. ~~int~~ | `1600` |
|
||||
| `max_epochs` | Maximum number of epochs to train for. ~~int~~ | `0` |
|
||||
| `max_steps` | Maximum number of update steps to train for. ~~int~~ | `20000` |
|
||||
| `eval_frequency` | How often to evaluate during training (steps). ~~int~~ | `200` |
|
||||
| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. ~~Dict[str, float]~~ | `{}` |
|
||||
| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be updated during training. See [here](/usage/training#config-components) for details. ~~List[str]~~ | `[]` |
|
||||
| `train_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) |
|
||||
| `dev_corpus` | Callable that takes the current `nlp` object and yields [`Example`](/api/example) objects. ~~Callable[[Language], Iterator[Example]]~~ | [`Corpus`](/api/corpus) |
|
||||
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~ | [`batch_by_words`](/api/top-level#batch_by_words) |
|
||||
| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
|
||||
|
||||
### pretraining {#config-pretraining tag="section,optional"}
|
||||
|
||||
|
@ -153,19 +153,19 @@ This section is optional and defines settings and controls for
|
|||
[language model pretraining](/usage/training#pretraining). It's used when you
|
||||
run [`spacy pretrain`](/api/cli#pretrain).
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ---------------------------- | --------------------------------------------------- | ----------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||
| `max_epochs` | int | Maximum number of epochs. | `1000` |
|
||||
| `min_length` | int | Minimum length of examples. | `5` |
|
||||
| `max_length` | int | Maximum length of examples. | `500` |
|
||||
| `dropout` | float | The dropout rate. | `0.2` |
|
||||
| `n_save_every` | int | Saving frequency. | `null` |
|
||||
| `batch_size` | int / `Sequence[int]` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). | `3000` |
|
||||
| `seed` | int | The random seed. | `${system.seed}` |
|
||||
| `use_pytorch_for_gpu_memory` | bool | Allocate memory via PyTorch. | `${system:use_pytorch_for_gpu_memory}` |
|
||||
| `tok2vec_model` | str | tok2vec model section in the config. | `"components.tok2vec.model"` |
|
||||
| `objective` | dict | The pretraining objective. | `{"type": "characters", "n_characters": 4}` |
|
||||
| `optimizer` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
|
||||
| Name | Description | Default |
|
||||
| ---------------------------- | ----------------------------------------------------------------------------------------------------------- | --------------------------------------------------- |
|
||||
| `max_epochs` | Maximum number of epochs. ~~int~~ | `1000` |
|
||||
| `min_length` | Minimum length of examples. ~~int~~ | `5` |
|
||||
| `max_length` | Maximum length of examples. ~~int~~ | `500` |
|
||||
| `dropout` | The dropout rate. ~~float~~ | `0.2` |
|
||||
| `n_save_every` | Saving frequency. ~~Optional[int]~~ | `null` |
|
||||
| `batch_size` | The batch size or batch size [schedule](https://thinc.ai/docs/api-schedules). ~~Union[int, Sequence[int]]~~ | `3000` |
|
||||
| `seed` | The random seed. ~~int~~ | `${system:seed}` |
|
||||
| `use_pytorch_for_gpu_memory` | Allocate memory via PyTorch. ~~bool~~ | `${system:use_pytorch_for_gpu_memory}` |
|
||||
| `tok2vec_model` | tok2vec model section in the config. ~~str~~ | `"components.tok2vec.model"` |
|
||||
| `objective` | The pretraining objective. ~~Dict[str, Any]~~ | `{"type": "characters", "n_characters": 4}` |
|
||||
| `optimizer` | The optimizer. ~~Optimizer~~ | [`Adam`](https://thinc.ai/docs/api-optimizers#adam) |
|
||||
|
||||
## Training data {#training}
|
||||
|
||||
|
@ -313,22 +313,22 @@ to keep track of your settings and hyperparameters and your own
|
|||
> }
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | str | Raw text. |
|
||||
| `words` | `List[str]` | List of gold-standard tokens. |
|
||||
| `lemmas` | `List[str]` | List of lemmas. |
|
||||
| `spaces` | `List[bool]` | List of boolean values indicating whether the corresponding tokens is followed by a space or not. |
|
||||
| `tags` | `List[str]` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). |
|
||||
| `pos` | `List[str]` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). |
|
||||
| `morphs` | `List[str]` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). |
|
||||
| `sent_starts` | `List[bool]` | List of boolean values indicating whether each token is the first of a sentence or not. |
|
||||
| `deps` | `List[str]` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. |
|
||||
| `heads` | `List[int]` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. |
|
||||
| `entities` | `List[str]` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. |
|
||||
| `entities` | `List[Tuple[int, int, str]]` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. |
|
||||
| `cats` | `Dict[str, float]` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. |
|
||||
| `links` | `Dict[(int, int), Dict]` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. |
|
||||
| Name | Description |
|
||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `text` | Raw text. ~~str~~ |
|
||||
| `words` | List of gold-standard tokens. ~~List[str]~~ |
|
||||
| `lemmas` | List of lemmas. ~~List[str]~~ |
|
||||
| `spaces` | List of boolean values indicating whether the corresponding token is followed by a space or not. ~~List[bool]~~ |
|
||||
| `tags` | List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). ~~List[str]~~ |
|
||||
| `pos` | List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). ~~List[str]~~ |
|
||||
| `morphs` | List of [morphological features](/usage/linguistic-features#rule-based-morphology). ~~List[str]~~ |
|
||||
| `sent_starts` | List of boolean values indicating whether each token is the first of a sentence or not. ~~List[bool]~~ |
|
||||
| `deps` | List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. ~~List[str]~~ |
|
||||
| `heads` | List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. ~~List[int]~~ |
|
||||
| `entities` | **Option 1:** List of [BILUO tags](/usage/linguistic-features#accessing-ner) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. ~~List[str]~~ |
|
||||
| `entities` | **Option 2:** List of `"(start, end, label)"` tuples defining all entities in the text. ~~List[Tuple[int, int, str]]~~ |
|
||||
| `cats` | Dictionary of `label`/`value` pairs indicating how relevant a certain [text category](/api/textcategorizer) is for the text. ~~Dict[str, float]~~ |
|
||||
| `links` | Dictionary of `offset`/`dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The character offsets are linked to a dictionary of relevant knowledge base IDs. ~~Dict[Tuple[int, int], Dict]~~ |
|
||||
|
||||
<Infobox title="Notes and caveats">
|
||||
|
||||
|
@ -390,10 +390,10 @@ provided.
|
|||
> srsly.write_jsonl("/path/to/text.jsonl", data)
|
||||
> ```
|
||||
|
||||
| Key | Type | Description |
|
||||
| -------- | ---- | ---------------------------------------------------------- |
|
||||
| `text` | str | The raw input text. Is not required if `tokens` available. |
|
||||
| `tokens` | list | Optional tokenization, one string per token. |
|
||||
| Key | Description |
|
||||
| -------- | ------------------------------------------------------------------ |
|
||||
| `text` | The raw input text. Not required if `tokens` is available. ~~str~~ |
|
||||
| `tokens` | Optional tokenization, one string per token. ~~List[str]~~ |
|
||||
|
||||
```json
|
||||
### Example
|
||||
|
|
|
@ -44,18 +44,18 @@ A pattern added to the `DependencyMatcher` consists of a list of dictionaries,
|
|||
with each dictionary describing a node to match. Each pattern should have the
|
||||
following top-level keys:
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | --------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `PATTERN` | dict | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). |
|
||||
| `SPEC` | dict | The relationships of the nodes in the subtree that should be matched. |
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `PATTERN` | The token attributes to match in the same format as patterns provided to the regular token-based [`Matcher`](/api/matcher). ~~Dict[str, Any]~~ |
|
||||
| `SPEC` | The relationships of the nodes in the subtree that should be matched. ~~Dict[str, str]~~ |
|
||||
|
||||
The `SPEC` includes the following fields:
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `NODE_NAME` | str | A unique name for this node to refer to it in other specs. |
|
||||
| `NBOR_RELOP` | str | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. |
|
||||
| `NBOR_NAME` | str | The unique name of the node that this node is connected to. |
|
||||
| Name | Description |
|
||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `NODE_NAME` | A unique name for this node to refer to it in other specs. ~~str~~ |
|
||||
| `NBOR_RELOP` | A [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html) operator that describes how the two nodes are related. ~~str~~ |
|
||||
| `NBOR_NAME` | The unique name of the node that this node is connected to. ~~str~~ |
|
||||
|
||||
## DependencyMatcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
@ -68,9 +68,9 @@ Create a rule-based `DependencyMatcher`.
|
|||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
|
||||
| Name | Description |
|
||||
| ------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
||||
|
||||
## DependencyMatcher.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -79,9 +79,9 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.matcher import Matcher
|
||||
> from spacy.matcher import DependencyMatcher
|
||||
>
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> pattern = [
|
||||
> {"SPEC": {"NODE_NAME": "founded"}, "PATTERN": {"ORTH": "founded"}},
|
||||
> {"SPEC": {"NODE_NAME": "founder", "NBOR_RELOP": ">", "NBOR_NAME": "founded"}, "PATTERN": {"DEP": "nsubj"}},
|
||||
|
@ -91,10 +91,10 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
|||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
|
||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
||||
|
||||
## DependencyMatcher.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -115,9 +115,9 @@ number of individual patterns.
|
|||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------- |
|
||||
| **RETURNS** | int | The number of rules. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The number of rules. ~~int~~ |
|
||||
|
||||
## DependencyMatcher.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -132,10 +132,10 @@ Check whether the matcher contains rules for a match ID.
|
|||
> assert "Rule" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------- |
|
||||
| `key` | str | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------- |
|
||||
| `key` | The match ID. ~~str~~ |
|
||||
| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
|
||||
|
||||
## DependencyMatcher.add {#add tag="method"}
|
||||
|
||||
|
@ -151,16 +151,16 @@ will be overwritten.
|
|||
> def on_match(matcher, doc, id, matches):
|
||||
> print('Matched!', matches)
|
||||
>
|
||||
> matcher = Matcher(nlp.vocab)
|
||||
> matcher = DependencyMatcher(nlp.vocab)
|
||||
> matcher.add("TEST_PATTERNS", patterns)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | An ID for the thing you're matching. ~~str~~ |
|
||||
| `patterns` | Match pattern. A pattern consists of a list of dicts, where each dict describes a `"PATTERN"` and `"SPEC"`. ~~List[List[Dict[str, dict]]]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ |
|
||||
|
||||
## DependencyMatcher.remove {#remove tag="method"}
|
||||
|
||||
|
@ -176,9 +176,9 @@ exist.
|
|||
> assert "Rule" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ---- | ------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| Name | Description |
|
||||
| ----- | --------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
|
||||
## DependencyMatcher.get {#get tag="method"}
|
||||
|
||||
|
@ -192,7 +192,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
|
|||
> on_match, patterns = matcher.get("Rule")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ |
|
||||
|
|
|
@ -48,13 +48,13 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("parser", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------- |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. | `None` |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
|
||||
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. | `False` |
|
||||
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. | `30` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||
| Setting | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
|
||||
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. Defaults to `False`. ~~bool~~ |
|
||||
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. Defaults to `30`. ~~int~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/dep_parser.pyx
|
||||
|
@ -81,16 +81,16 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
|
||||
| _keyword-only_ | | |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
|
||||
| `learn_tokens` | bool | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. |
|
||||
| `min_action_freq` | int | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. |
|
||||
| Name | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
|
||||
| `learn_tokens` | Whether to learn to merge subtokens that are split relative to the gold standard. Experimental. ~~bool~~ |
|
||||
| `min_action_freq` | The minimum frequency of labelled actions to retain. Rarer labelled actions have their label backed-off to "dep". While this primarily affects the label accuracy, it can also affect the attachment structure, as the labels are used to represent the pseudo-projectivity transformation. ~~int~~ |
|
||||
|
||||
## DependencyParser.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -111,10 +111,10 @@ and all pipeline components are applied to the `Doc` in order. Both
|
|||
> processed = parser(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## DependencyParser.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -133,12 +133,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/dependencyparser#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## DependencyParser.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -158,13 +158,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = parser.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## DependencyParser.predict {#predict tag="method"}
|
||||
|
||||
|
@ -178,10 +178,10 @@ modifying them.
|
|||
> scores = parser.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | ---------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | `syntax.StateClass` | A helper class for the parse state (internal). |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | A helper class for the parse state (internal). ~~StateClass~~ |
|
||||
|
||||
## DependencyParser.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -195,10 +195,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> parser.set_annotations([doc1, doc2], scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ------------------- | ---------------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | `syntax.StateClass` | The scores to set, produced by `DependencyParser.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `DependencyParser.predict`. Returns an internal helper class for the parse state. ~~List[StateClass]~~ |
|
||||
|
||||
## DependencyParser.update {#update tag="method"}
|
||||
|
||||
|
@ -214,15 +214,15 @@ model. Delegates to [`predict`](/api/dependencyparser#predict) and
|
|||
> losses = parser.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/dependencyparser#set_annotations). |
|
||||
| `sgd` | `Optimizer` | The [`Optimizer`](https://thinc.ai/docs/api-optimizers) object. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## DependencyParser.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -237,11 +237,11 @@ predicted scores.
|
|||
> loss, d_loss = parser.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | `syntax.StateClass` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. ~~StateClass~~ |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## DependencyParser.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -253,10 +253,10 @@ Score a batch of examples.
|
|||
> scores = parser.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans) and [`Scorer.score_deps`](/api/scorer#score_deps). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## DependencyParser.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -270,9 +270,9 @@ component.
|
|||
> optimizer = parser.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## DependencyParser.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -287,9 +287,9 @@ context, the original parameters are restored.
|
|||
> parser.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## DependencyParser.add_label {#add_label tag="method"}
|
||||
|
||||
|
@ -302,10 +302,10 @@ Add a new label to the pipe.
|
|||
> parser.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## DependencyParser.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -318,11 +318,11 @@ Serialize the pipe to disk.
|
|||
> parser.to_disk("/path/to/parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## DependencyParser.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -335,12 +335,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> parser.from_disk("/path/to/parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `DependencyParser` object. ~~DependencyParser~~ |
|
||||
|
||||
## DependencyParser.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -353,11 +353,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `DependencyParser` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `DependencyParser` object. ~~bytes~~ |
|
||||
|
||||
## DependencyParser.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -371,12 +371,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> parser.from_bytes(parser_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `DependencyParser` | The `DependencyParser` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `DependencyParser` object. ~~DependencyParser~~ |
|
||||
|
||||
## DependencyParser.labels {#labels tag="property"}
|
||||
|
||||
|
@ -389,9 +389,9 @@ The labels currently added to the component.
|
|||
> assert "MY_LABEL" in parser.labels
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -30,11 +30,11 @@ Construct a `Doc` object. The most common way to get a `Doc` object is via the
|
|||
> doc = Doc(nlp.vocab, words=words, spaces=spaces)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | A storage container for lexical types. |
|
||||
| `words` | iterable | A list of strings to add to the container. |
|
||||
| `spaces` | iterable | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `words` | A list of strings to add to the container. ~~Optional[List[str]]~~ |
|
||||
| `spaces` | A list of boolean values indicating whether each word has a subsequent space. Must have the same length as `words`, if specified. Defaults to a sequence of `True`. ~~Optional[List[bool]]~~ |
|
||||
|
||||
## Doc.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -52,10 +52,10 @@ Negative indexing is supported, and follows the usual Python semantics, i.e.
|
|||
> assert span.text == "it back"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------- |
|
||||
| `i` | int | The index of the token. |
|
||||
| **RETURNS** | `Token` | The token at `doc[i]`. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `i` | The index of the token. ~~int~~ |
|
||||
| **RETURNS** | The token at `doc[i]`. ~~Token~~ |
|
||||
|
||||
Get a [`Span`](/api/span) object, starting at position `start` (token index) and
|
||||
ending at position `end` (token index). For instance, `doc[2:5]` produces a span
|
||||
|
@ -64,10 +64,10 @@ are not supported, as `Span` objects must be contiguous (cannot have gaps). You
|
|||
can use negative indices and open-ended ranges, which have their normal Python
|
||||
semantics.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | --------------------------------- |
|
||||
| `start_end` | tuple | The slice of the document to get. |
|
||||
| **RETURNS** | `Span` | The span at `doc[start:end]`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------- |
|
||||
| `start_end` | The slice of the document to get. ~~Tuple[int, int]~~ |
|
||||
| **RETURNS** | The span at `doc[start:end]`. ~~Span~~ |
|
||||
|
||||
## Doc.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
|
@ -85,9 +85,9 @@ main way annotations are accessed from Python. If faster-than-Python speeds are
|
|||
required, you can instead access the annotations as a numpy array, or access the
|
||||
underlying C data directly from Cython.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ----------------- |
|
||||
| **YIELDS** | `Token` | A `Token` object. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------- |
|
||||
| **YIELDS** | A `Token` object. ~~Token~~ |
|
||||
|
||||
## Doc.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -100,9 +100,9 @@ Get the number of tokens in the document.
|
|||
> assert len(doc) == 7
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------- |
|
||||
| **RETURNS** | int | The number of tokens in the document. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | The number of tokens in the document. ~~int~~ |
|
||||
|
||||
## Doc.set_extension {#set_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -120,14 +120,14 @@ details, see the documentation on
|
|||
> assert doc._.has_city
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. |
|
||||
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
|
||||
| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. |
|
||||
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
|
||||
| `setter` | callable | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. |
|
||||
| `force` | bool | Force overwriting existing attribute. |
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `doc._.my_attr`. ~~str~~ |
|
||||
| `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ |
|
||||
| `method` | Set a custom method on the object, for example `doc._.compare(other_doc)`. ~~Optional[Callable[[Doc, ...], Any]]~~ |
|
||||
| `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Doc], Any]]~~ |
|
||||
| `setter` | Setter function that takes the `Doc` and a value, and modifies the object. Is called when the user writes to the `Doc._` attribute. ~~Optional[Callable[[Doc, Any], None]]~~ |
|
||||
| `force` | Force overwriting existing attribute. ~~bool~~ |
|
||||
|
||||
## Doc.get_extension {#get_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -144,10 +144,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
|
|||
> assert extension == (False, None, None, None)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the extension. ~~str~~ |
|
||||
| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
|
||||
|
||||
## Doc.has_extension {#has_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -161,10 +161,10 @@ Check whether an extension has been registered on the `Doc` class.
|
|||
> assert Doc.has_extension("has_city")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| `name` | str | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| `name` | Name of the extension to check. ~~str~~ |
|
||||
| **RETURNS** | Whether the extension has been registered. ~~bool~~ |
|
||||
|
||||
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
|
||||
|
||||
|
@ -179,10 +179,10 @@ Remove a previously registered extension.
|
|||
> assert not Doc.has_extension("has_city")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the extension. ~~str~~ |
|
||||
| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
|
||||
|
||||
## Doc.char_span {#char_span tag="method" new="2"}
|
||||
|
||||
|
@ -197,14 +197,14 @@ the character indices don't map to a valid span.
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------ | ---------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `start` | int | The index of the first character of the span. |
|
||||
| `end` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ----------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Doc.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
@ -221,10 +221,10 @@ using an average of word vectors.
|
|||
> assert apples_oranges == oranges_apples
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------------------------------------- |
|
||||
| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. |
|
||||
| **RETURNS** | float | A scalar similarity score. Higher is more similar. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
|
||||
| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
|
||||
|
||||
## Doc.count_by {#count_by tag="method"}
|
||||
|
||||
|
@ -237,15 +237,15 @@ attribute ID.
|
|||
> ```python
|
||||
> from spacy.attrs import ORTH
|
||||
> doc = nlp("apple apple orange banana")
|
||||
> assert doc.count_by(ORTH) == {7024L: 1, 119552L: 1, 2087L: 2}
|
||||
> assert doc.count_by(ORTH) == {7024: 1, 119552: 1, 2087: 2}
|
||||
> doc.to_array([ORTH])
|
||||
> # array([[11880], [11880], [7561], [12800]])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------- |
|
||||
| `attr_id` | int | The attribute ID |
|
||||
| **RETURNS** | dict | A dictionary mapping attributes to integer counts. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------- |
|
||||
| `attr_id` | The attribute ID. ~~int~~ |
|
||||
| **RETURNS** | A dictionary mapping attributes to integer counts. ~~Dict[int, int]~~ |
|
||||
|
||||
## Doc.get_lca_matrix {#get_lca_matrix tag="method"}
|
||||
|
||||
|
@ -261,9 +261,9 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
|
|||
> # array([[0, 1, 1, 1], [1, 1, 1, 1], [1, 1, 2, 3], [1, 1, 3, 3]], dtype=int32)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | ----------------------------------------------- |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Doc`. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | The lowest common ancestor matrix of the `Doc`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
|
||||
|
||||
## Doc.to_array {#to_array tag="method"}
|
||||
|
||||
|
@ -288,10 +288,10 @@ Returns a 2D array with one row per token and one column per attribute (when
|
|||
> np_array = doc.to_array("POS")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- |
|
||||
| `attr_ids` | list or int or string | A list of attributes (int IDs or string names) or a single attribute (int ID or string name) |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="uint64"]` or `numpy.ndarray[ndim=1, dtype="uint64"]` | The exported attributes as a numpy array. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ |
|
||||
| **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ |
|
||||
|
||||
## Doc.from_array {#from_array tag="method"}
|
||||
|
||||
|
@ -310,15 +310,17 @@ array of attributes.
|
|||
> assert doc[0].pos_ == doc2[0].pos_
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | ------------------------------------------------------------------------- |
|
||||
| `attrs` | list | A list of attribute ID ints. |
|
||||
| `array` | `numpy.ndarray[ndim=2, dtype="int32"]` | The attribute values to load. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | Itself. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------- |
|
||||
| `attrs` | A list of attribute ID ints. ~~List[int]~~ |
|
||||
| `array` | The attribute values to load. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Doc` itself. ~~Doc~~ |
|
||||
|
||||
## Doc.from_docs {#from_docs tag="staticmethod"}
|
||||
|
||||
<!-- TODO: When was this added? -->
|
||||
|
||||
Concatenate multiple `Doc` objects to form a new one. Raises an error if the
|
||||
`Doc` objects do not all share the same `Vocab`.
|
||||
|
||||
|
@ -337,12 +339,12 @@ Concatenate multiple `Doc` objects to form a new one. Raises an error if the
|
|||
> [str(ent) for doc in docs for ent in doc.ents]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
|
||||
| `docs` | list | A list of `Doc` objects. |
|
||||
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
|
||||
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
|
||||
| **RETURNS** | `Doc` | The new `Doc` object that is containing the other docs or `None`, if `docs` is empty or `None`. |
|
||||
| Name | Description |
|
||||
| ------------------- | ----------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | A list of `Doc` objects. ~~List[Doc]~~ |
|
||||
| `ensure_whitespace` | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. ~~bool~~ |
|
||||
| `attrs` | Optional list of attribute ID ints or attribute name strings. ~~Optional[List[Union[str, int]]]~~ |
|
||||
| **RETURNS** | The new `Doc` object containing the other docs, or `None` if `docs` is empty or `None`. ~~Optional[Doc]~~ |
|
||||
|
||||
## Doc.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
|
@ -354,11 +356,11 @@ Save the current state to a directory.
|
|||
> doc.to_disk("/path/to/doc")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Doc.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -372,12 +374,12 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> doc = Doc(Vocab()).from_disk("/path/to/doc")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Doc` object. ~~Doc~~ |
|
||||
|
||||
## Doc.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -390,11 +392,11 @@ Serialize, i.e. export the document contents to a binary string.
|
|||
> doc_bytes = doc.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | A losslessly serialized copy of the `Doc`, including all annotations. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | A losslessly serialized copy of the `Doc`, including all annotations. ~~bytes~~ |
|
||||
|
||||
## Doc.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -410,12 +412,12 @@ Deserialize, i.e. import the document contents from a binary string.
|
|||
> assert doc.text == doc2.text
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `data` | bytes | The string to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | The `Doc` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `data` | The string to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Doc` object. ~~Doc~~ |
|
||||
|
||||
## Doc.retokenize {#retokenize tag="contextmanager" new="2.1"}
|
||||
|
||||
|
@ -433,9 +435,9 @@ invalidated, although they may accidentally continue to work.
|
|||
> retokenizer.merge(doc[0:2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | ---------------- |
|
||||
| **RETURNS** | `Retokenizer` | The retokenizer. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| **RETURNS** | The retokenizer. ~~Retokenizer~~ |
|
||||
|
||||
### Retokenizer.merge {#retokenizer.merge tag="method"}
|
||||
|
||||
|
@ -454,10 +456,10 @@ dictionary mapping attribute names to values as the `"_"` key.
|
|||
> retokenizer.merge(doc[2:4], attrs=attrs)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------ | -------------------------------------- |
|
||||
| `span` | `Span` | The span to merge. |
|
||||
| `attrs` | dict | Attributes to set on the merged token. |
|
||||
| Name | Description |
|
||||
| ------- | --------------------------------------------------------------------- |
|
||||
| `span` | The span to merge. ~~Span~~ |
|
||||
| `attrs` | Attributes to set on the merged token. ~~Dict[Union[str, int], Any]~~ |
|
||||
|
||||
### Retokenizer.split {#retokenizer.split tag="method"}
|
||||
|
||||
|
@ -488,33 +490,12 @@ underlying lexeme (if they're context-independent lexical attributes like
|
|||
> retokenizer.split(doc[3], ["New", "York"], heads=heads, attrs=attrs)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `token` | `Token` | The token to split. |
|
||||
| `orths` | list | The verbatim text of the split tokens. Needs to match the text of the original token. |
|
||||
| `heads` | list | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. |
|
||||
| `attrs` | dict | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. |
|
||||
|
||||
## Doc.merge {#merge tag="method"}
|
||||
|
||||
Retokenize the document, such that the span at `doc.text[start_idx : end_idx]`
|
||||
is merged into a single token. If `start_idx` and `end_idx` do not mark start
|
||||
and end token boundaries, the document remains unchanged.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> doc = nlp("Los Angeles start.")
|
||||
> doc.merge(0, len("Los Angeles"), "NNP", "Los Angeles", "GPE")
|
||||
> assert [t.text for t in doc] == ["Los Angeles", "start", "."]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `start_idx` | int | The character index of the start of the slice to merge. |
|
||||
| `end_idx` | int | The character index after the end of the slice to merge. |
|
||||
| `**attributes` | - | Attributes to assign to the merged token. By default, attributes are inherited from the syntactic root token of the span. |
|
||||
| **RETURNS** | `Token` | The newly merged token, or `None` if the start and end indices did not fall at token boundaries |
|
||||
| Name | Description |
|
||||
| ------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `token` | The token to split. ~~Token~~ |
|
||||
| `orths` | The verbatim text of the split tokens. Needs to match the text of the original token. ~~List[str]~~ |
|
||||
| `heads` | List of `token` or `(token, subtoken)` tuples specifying the tokens to attach the newly split subtokens to. ~~List[Union[Token, Tuple[Token, int]]]~~ |
|
||||
| `attrs` | Attributes to set on all split tokens. Attribute names mapped to list of per-token attribute values. ~~Dict[Union[str, int], List[Any]]~~ |
|
||||
|
||||
## Doc.ents {#ents tag="property" model="NER"}
|
||||
|
||||
|
@ -531,9 +512,9 @@ objects, if the entity recognizer has been applied.
|
|||
> assert ents[0].text == "Mr. Best"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------ |
|
||||
| **RETURNS** | tuple | Entities in the document, one `Span` per entity. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------- |
|
||||
| **RETURNS** | Entities in the document, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
||||
|
||||
## Doc.noun_chunks {#noun_chunks tag="property" model="parser"}
|
||||
|
||||
|
@ -552,9 +533,9 @@ relative clauses.
|
|||
> assert chunks[1].text == "another phrase"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------ | ---------------------------- |
|
||||
| **YIELDS** | `Span` | Noun chunks in the document. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------- |
|
||||
| **YIELDS** | Noun chunks in the document. ~~Span~~ |
|
||||
|
||||
## Doc.sents {#sents tag="property" model="parser"}
|
||||
|
||||
|
@ -572,9 +553,9 @@ will be unavailable.
|
|||
> assert [s.root.text for s in sents] == ["is", "'s"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------ | -------------------------- |
|
||||
| **YIELDS** | `Span` | Sentences in the document. |
|
||||
| Name | Description |
|
||||
| ---------- | ----------------------------------- |
|
||||
| **YIELDS** | Sentences in the document. ~~Span~~ |
|
||||
|
||||
## Doc.has_vector {#has_vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -587,9 +568,9 @@ A boolean value indicating whether a word vector is associated with the object.
|
|||
> assert doc.has_vector
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------------ |
|
||||
| **RETURNS** | bool | Whether the document has a vector data attached. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------- |
|
||||
| **RETURNS** | Whether the document has vector data attached. ~~bool~~ |
|
||||
|
||||
## Doc.vector {#vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -604,9 +585,9 @@ vectors.
|
|||
> assert doc.vector.shape == (300,)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | ------------------------------------------------------- |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the document's semantics. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | A 1-dimensional array representing the document's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Doc.vector_norm {#vector_norm tag="property" model="vectors"}
|
||||
|
||||
|
@ -622,32 +603,32 @@ The L2 norm of the document's vector representation.
|
|||
> assert doc1.vector_norm != doc2.vector_norm
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------- |
|
||||
| **RETURNS** | float | The L2 norm of the vector representation. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | str | A string representation of the document text. |
|
||||
| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||
| `vocab` | `Vocab` | The store of lexical types. |
|
||||
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | str | Language of the document's vocabulary. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
|
||||
| `is_nered` <Tag variant="new">2.1</Tag> | bool | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. |
|
||||
| `sentiment` | float | The document's positivity/negativity score, if available. |
|
||||
| `user_hooks` | dict | A dictionary that allows customization of the `Doc`'s properties. |
|
||||
| `user_token_hooks` | dict | A dictionary that allows customization of properties of `Token` children. |
|
||||
| `user_span_hooks` | dict | A dictionary that allows customization of properties of `Span` children. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Name | Description |
|
||||
| --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | A string representation of the document text. ~~str~~ |
|
||||
| `text_with_ws` | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. ~~str~~ |
|
||||
| `mem` | The document's local memory heap, for all C data it owns. ~~cymem.Pool~~ |
|
||||
| `vocab` | The store of lexical types. ~~Vocab~~ |
|
||||
| `tensor` <Tag variant="new">2</Tag> | Container for dense vector representations. ~~numpy.ndarray~~ |
|
||||
| `cats` <Tag variant="new">2</Tag> | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. ~~Dict[str, float]~~ |
|
||||
| `user_data` | A generic storage area, for user custom data. ~~Dict[str, Any]~~ |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~int~~ |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | Language of the document's vocabulary. ~~str~~ |
|
||||
| `is_tagged` | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. ~~bool~~ |
|
||||
| `is_parsed` | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. ~~bool~~ |
|
||||
| `is_sentenced` | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. ~~bool~~ |
|
||||
| `is_nered` <Tag variant="new">2.1</Tag> | A flag indicating that named entities have been set. Will return `True` if the `Doc` is empty, or if _any_ of the tokens has an entity tag set, even if the others are unknown. ~~bool~~ |
|
||||
| `sentiment` | The document's positivity/negativity score, if available. ~~float~~ |
|
||||
| `user_hooks` | A dictionary that allows customization of the `Doc`'s properties. ~~Dict[str, Callable]~~ |
|
||||
| `user_token_hooks` | A dictionary that allows customization of properties of `Token` children. ~~Dict[str, Callable]~~ |
|
||||
| `user_span_hooks` | A dictionary that allows customization of properties of `Span` children. ~~Dict[str, Callable]~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -44,11 +44,11 @@ Create a `DocBin` object to hold serialized annotations.
|
|||
> doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------------- | --------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `attrs` | `Iterable[str]` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
|
||||
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
|
||||
| `docs` | `Iterable[Doc]` | `Doc` objects to add on initialization. |
|
||||
| Argument | Description |
|
||||
| ----------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `attrs` | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. ~~Iterable[str]~~ |
|
||||
| `store_user_data` | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. ~~bool~~ |
|
||||
| `docs` | `Doc` objects to add on initialization. ~~Iterable[Doc]~~ |
|
||||
|
||||
## DocBin.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -63,9 +63,9 @@ Get the number of `Doc` objects that were added to the `DocBin`.
|
|||
> assert len(doc_bin) == 1
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------- |
|
||||
| **RETURNS** | int | The number of `Doc`s added to the `DocBin`. |
|
||||
| Argument | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| **RETURNS** | The number of `Doc`s added to the `DocBin`. ~~int~~ |
|
||||
|
||||
## DocBin.add {#add tag="method"}
|
||||
|
||||
|
@ -79,9 +79,9 @@ Add a `Doc`'s annotations to the `DocBin` for serialization.
|
|||
> doc_bin.add(doc)
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to add. |
|
||||
| Argument | Description |
|
||||
| -------- | -------------------------------- |
|
||||
| `doc` | The `Doc` object to add. ~~Doc~~ |
|
||||
|
||||
## DocBin.get_docs {#get_docs tag="method"}
|
||||
|
||||
|
@ -93,15 +93,15 @@ Recover `Doc` objects from the annotations, using the given vocab.
|
|||
> docs = list(doc_bin.get_docs(nlp.vocab))
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ---------- | ------- | ------------------ |
|
||||
| `vocab` | `Vocab` | The shared vocab. |
|
||||
| **YIELDS** | `Doc` | The `Doc` objects. |
|
||||
| Argument | Description |
|
||||
| ---------- | --------------------------- |
|
||||
| `vocab` | The shared vocab. ~~Vocab~~ |
|
||||
| **YIELDS** | The `Doc` objects. ~~Doc~~ |
|
||||
|
||||
## DocBin.merge {#merge tag="method"}
|
||||
|
||||
Extend the annotations of this `DocBin` with the annotations from another. Will
|
||||
raise an error if the pre-defined attrs of the two `DocBin`s don't match.
|
||||
raise an error if the pre-defined `attrs` of the two `DocBin`s don't match.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -114,9 +114,9 @@ raise an error if the pre-defined attrs of the two `DocBin`s don't match.
|
|||
> assert len(doc_bin1) == 2
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | -------- | ------------------------------------------- |
|
||||
| `other` | `DocBin` | The `DocBin` to merge into the current bin. |
|
||||
| Argument | Description |
|
||||
| -------- | ------------------------------------------------------ |
|
||||
| `other` | The `DocBin` to merge into the current bin. ~~DocBin~~ |
|
||||
|
||||
## DocBin.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -130,9 +130,9 @@ Serialize the `DocBin`'s annotations to a bytestring.
|
|||
> doc_bin_bytes = doc_bin.to_bytes()
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| **RETURNS** | bytes | The serialized `DocBin`. |
|
||||
| Argument | Description |
|
||||
| ----------- | ---------------------------------- |
|
||||
| **RETURNS** | The serialized `DocBin`. ~~bytes~~ |
|
||||
|
||||
## DocBin.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -145,10 +145,10 @@ Deserialize the `DocBin`'s annotations from a bytestring.
|
|||
> new_doc_bin = DocBin().from_bytes(doc_bin_bytes)
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ------------ | -------- | ---------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||
| Argument | Description |
|
||||
| ------------ | -------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| **RETURNS** | The loaded `DocBin`. ~~DocBin~~ |
|
||||
|
||||
## DocBin.to_disk {#to_disk tag="method" new="3"}
|
||||
|
||||
|
@ -164,9 +164,9 @@ and the result can be used as the input data for
|
|||
> doc_bin.to_disk("./data.spacy")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | ------------ | ----------------------------------------------------- |
|
||||
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||
| Argument | Description |
|
||||
| -------- | -------------------------------------------------------------------------- |
|
||||
| `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ |
|
||||
|
||||
## DocBin.from_disk {#from_disk tag="method" new="3"}
|
||||
|
||||
|
@ -178,7 +178,7 @@ Load a serialized `DocBin` from a file. Typically uses the `.spacy` extension.
|
|||
> doc_bin = DocBin().from_disk("./data.spacy")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ------------ | ----------------------------------------------------- |
|
||||
| `path` | str / `Path` | The file path, typically with the `.spacy` extension. |
|
||||
| **RETURNS** | `DocBin` | The loaded `DocBin`. |
|
||||
| Argument | Description |
|
||||
| ----------- | -------------------------------------------------------------------------- |
|
||||
| `path` | The file path, typically with the `.spacy` extension. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The loaded `DocBin`. ~~DocBin~~ |
|
||||
|
|
|
@ -40,14 +40,13 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("entity_linker", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ---------------- | -------------------------------------------------------- | --------------------------------------------------------------------------- | ------------------------------------------------------ |
|
||||
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` |
|
||||
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` |
|
||||
| `incl_context` | bool | Whether or not to include the local context in the model. | `True` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) |
|
||||
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. | An empty KnowledgeBase with `entity_vector_length` 64. |
|
||||
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. | Built-in dictionary-lookup function. |
|
||||
| Setting | Description |
|
||||
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ |
|
||||
| `kb` | The [`KnowledgeBase`](/api/kb). Defaults to [EmptyKB](/api/architectures#EmptyKB), a function returning an empty `KnowledgeBase` with an `entity_vector_length` of `64`. ~~KnowledgeBase~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
||||
|
@ -66,7 +65,7 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
|||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
>
|
||||
> # Construction via add_pipe with custom KB and candidate generation
|
||||
> config = {"kb_loader": {"@assets": "my_kb.v1"}, "get_candidates": {"@assets": "my_candidates.v1"},}
|
||||
> config = {"kb": {"@assets": "my_kb.v1"}}
|
||||
> entity_linker = nlp.add_pipe("entity_linker", config=config)
|
||||
>
|
||||
> # Construction from class
|
||||
|
@ -76,22 +75,20 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py
|
|||
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
[`nlp.add_pipe`](/api/language#add_pipe). Note that both the internal
|
||||
`KnowledgeBase` as well as the Candidate generator can be customized by
|
||||
providing custom registered functions.
|
||||
|
||||
Note that both the internal KB as well as the Candidate generator can be
|
||||
customized by providing custom registered functions.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | -------------------------------------------------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| Name | Description |
|
||||
| ---------------- | --------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `kb_loader` | `Callable[[Vocab], KnowledgeBase]` | Function that creates a [`KnowledgeBase`](/api/kb) from a `Vocab` instance. |
|
||||
| `get_candidates` | `Callable[[KnowledgeBase, "Span"], Iterable[Candidate]]` | Function that generates plausible candidates for a given `Span` object. |
|
||||
| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. |
|
||||
| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. |
|
||||
| `incl_context` | bool | Whether or not to include the local context in the model. |
|
||||
| `kb` | The [`KnowledgeBase`](/api/kb). ~~KnowledgeBase~~ |
|
||||
| `labels_discard` | NER labels that will automatically get a `"NIL"` prediction. ~~Iterable[str]~~ |
|
||||
| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. ~~bool~~ |
|
||||
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
||||
|
||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -111,10 +108,10 @@ delegate to the [`predict`](/api/entitylinker#predict) and
|
|||
> processed = entity_linker(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## EntityLinker.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -133,12 +130,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EntityLinker.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -158,13 +155,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = entity_linker.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/dependencyparser#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## EntityLinker.predict {#predict tag="method"}
|
||||
|
||||
|
@ -179,10 +176,10 @@ if there is no prediction.
|
|||
> kb_ids = entity_linker.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ------------------------------------------------------------ |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | `List[str]` | The predicted KB identifiers for the entities in the `docs`. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The predicted KB identifiers for the entities in the `docs`. ~~List[str]~~ |
|
||||
|
||||
## EntityLinker.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -197,10 +194,10 @@ entities.
|
|||
> entity_linker.set_annotations([doc1, doc2], kb_ids)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `kb_ids` | `List[str]` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | --------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `kb_ids` | The knowledge base identifiers for the entities in the docs, predicted by `EntityLinker.predict`. ~~List[str]~~ |
|
||||
|
||||
## EntityLinker.update {#update tag="method"}
|
||||
|
||||
|
@ -216,15 +213,15 @@ pipe's entity linking model and context encoder. Delegates to
|
|||
> losses = entity_linker.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## EntityLinker.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -237,9 +234,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = entity_linker.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## EntityLinker.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -254,9 +251,9 @@ context, the original parameters are restored.
|
|||
> entity_linker.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## EntityLinker.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -269,11 +266,11 @@ Serialize the pipe to disk.
|
|||
> entity_linker.to_disk("/path/to/entity_linker")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## EntityLinker.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -286,12 +283,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> entity_linker.from_disk("/path/to/entity_linker")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `EntityLinker` object. ~~EntityLinker~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -41,11 +41,11 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("ner", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------- |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. | `100` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [TransitionBasedParser](/api/architectures#TransitionBasedParser) |
|
||||
| Setting | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/ner.pyx
|
||||
|
@ -72,14 +72,14 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `moves` | `List[str]` | A list of transition names. Inferred from the data if not provided. |
|
||||
| _keyword-only_ | | |
|
||||
| `update_with_oracle_cut_size` | int | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. |
|
||||
| Name | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. ~~Optional[List[str]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. `100` is a good default. ~~int~~ |
|
||||
|
||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -100,10 +100,10 @@ and all pipeline components are applied to the `Doc` in order. Both
|
|||
> processed = ner(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## EntityRecognizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -122,12 +122,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entityrecognizer#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `docs` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## EntityRecognizer.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -147,13 +147,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = ner.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/entityrecognizer#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## EntityRecognizer.predict {#predict tag="method"}
|
||||
|
||||
|
@ -167,10 +167,10 @@ modifying them.
|
|||
> scores = ner.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | `List[StateClass]` | List of `syntax.StateClass` objects. `syntax.StateClass` is a helper class for the parse state (internal). |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | A list of helper objects for the parse state (internal). ~~List[StateClass]~~ |
|
||||
|
||||
## EntityRecognizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -184,10 +184,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> ner.set_annotations([doc1, doc2], scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ------------------ | ---------------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | `List[StateClass]` | The scores to set, produced by `EntityRecognizer.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `EntityRecognizer.predict`. These are internal helper objects for the parse state. ~~List[StateClass]~~ |
|
||||
|
||||
## EntityRecognizer.update {#update tag="method"}
|
||||
|
||||
|
@ -203,15 +203,15 @@ model. Delegates to [`predict`](/api/entityrecognizer#predict) and
|
|||
> losses = ner.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/entityrecognizer#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## EntityRecognizer.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -226,11 +226,11 @@ predicted scores.
|
|||
> loss, d_loss = ner.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | `List[StateClass]` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. ~~StateClass~~ |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## EntityRecognizer.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -242,10 +242,10 @@ Score a batch of examples.
|
|||
> scores = ner.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | ------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## EntityRecognizer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -258,9 +258,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = ner.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## EntityRecognizer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -275,9 +275,9 @@ context, the original parameters are restored.
|
|||
> ner.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## EntityRecognizer.add_label {#add_label tag="method"}
|
||||
|
||||
|
@ -290,10 +290,10 @@ Add a new label to the pipe.
|
|||
> ner.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## EntityRecognizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -306,11 +306,11 @@ Serialize the pipe to disk.
|
|||
> ner.to_disk("/path/to/ner")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -323,12 +323,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> ner.from_disk("/path/to/ner")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `EntityRecognizer` object. ~~EntityRecognizer~~ |
|
||||
|
||||
## EntityRecognizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -341,11 +341,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `EntityRecognizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `EntityRecognizer` object. ~~bytes~~ |
|
||||
|
||||
## EntityRecognizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -359,12 +359,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> ner.from_bytes(ner_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityRecognizer` | The `EntityRecognizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `EntityRecognizer` object. ~~EntityRecognizer~~ |
|
||||
|
||||
## EntityRecognizer.labels {#labels tag="property"}
|
||||
|
||||
|
@ -377,9 +377,9 @@ The labels currently added to the component.
|
|||
> assert "MY_LABEL" in ner.labels
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -34,12 +34,12 @@ how the component should be configured. You can override its settings via the
|
|||
> nlp.add_pipe("entity_ruler", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| --------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `phrase_matcher_attr` | str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. | `None` |
|
||||
| `validate` | bool | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). | `False` |
|
||||
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. | `False` |
|
||||
| `ent_id_sep` | str | Separator used internally for entity IDs. | `"||"` |
|
||||
| Setting | Description |
|
||||
| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
||||
| `validate` | Whether patterns should be validated (passed to the `Matcher` and `PhraseMatcher`). Defaults to `False`. ~~bool~~ |
|
||||
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
|
||||
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entityruler.py
|
||||
|
@ -63,16 +63,16 @@ be a token pattern (list) or a phrase pattern (string). For example:
|
|||
> ruler = EntityRuler(nlp, overwrite_ents=True)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------- | ---------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | `Language` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. |
|
||||
| `name` <Tag variant="new">3</Tag> | str | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. |
|
||||
| _keyword-only_ | | |
|
||||
| `phrase_matcher_attr` | int / str | Optional attribute name match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. |
|
||||
| `validate` | bool | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. |
|
||||
| `overwrite_ents` | bool | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. |
|
||||
| `ent_id_sep` | str | Separator used internally for entity IDs. Defaults to `"||"`. |
|
||||
| `patterns` | iterable | Optional patterns to load in on initialization. |
|
||||
| Name | Description |
|
||||
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | The shared nlp object to pass the vocab to the matchers and process phrase patterns. ~~Language~~ |
|
||||
| `name` <Tag variant="new">3</Tag> | Instance name of the current pipeline component. Typically passed in automatically from the factory when the component is added. Used to disable the current entity ruler while creating phrase patterns with the nlp object. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `phrase_matcher_attr` | Optional attribute name to match on for the internal [`PhraseMatcher`](/api/phrasematcher), e.g. `LOWER` to match on the lowercase token text. Defaults to `None`. ~~Optional[Union[int, str]]~~ |
|
||||
| `validate` | Whether patterns should be validated, passed to Matcher and PhraseMatcher as `validate`. Defaults to `False`. ~~bool~~ |
|
||||
| `overwrite_ents` | If existing entities are present, e.g. entities added by the model, overwrite them by matches if necessary. Defaults to `False`. ~~bool~~ |
|
||||
| `ent_id_sep` | Separator used internally for entity IDs. Defaults to `"||"`. ~~str~~ |
|
||||
| `patterns` | Optional patterns to load in on initialization. ~~Optional[List[Dict[str, Union[str, List[dict]]]]]~~ |
|
||||
|
||||
## EntityRuler.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -87,9 +87,9 @@ The number of all patterns added to the entity ruler.
|
|||
> assert len(ruler) == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------- |
|
||||
| **RETURNS** | int | The number of patterns. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------- |
|
||||
| **RETURNS** | The number of patterns. ~~int~~ |
|
||||
|
||||
## EntityRuler.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -104,10 +104,10 @@ Whether a label is present in the patterns.
|
|||
> assert not "PERSON" in ruler
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------- |
|
||||
| `label` | str | The label to check. |
|
||||
| **RETURNS** | bool | Whether the entity ruler contains the label. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------- |
|
||||
| `label` | The label to check. ~~str~~ |
|
||||
| **RETURNS** | Whether the entity ruler contains the label. ~~bool~~ |
|
||||
|
||||
## EntityRuler.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -130,10 +130,10 @@ is chosen.
|
|||
> assert ents == [("Apple", "ORG")]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with added entities, if available. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
||||
| **RETURNS** | The modified `Doc` with added entities, if available. ~~Doc~~ |
|
||||
|
||||
## EntityRuler.add_patterns {#add_patterns tag="method"}
|
||||
|
||||
|
@ -152,9 +152,9 @@ of dicts) or a phrase pattern (string). For more details, see the usage guide on
|
|||
> ruler.add_patterns(patterns)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | -------------------- |
|
||||
| `patterns` | list | The patterns to add. |
|
||||
| Name | Description |
|
||||
| ---------- | ---------------------------------------------------------------- |
|
||||
| `patterns` | The patterns to add. ~~List[Dict[str, Union[str, List[dict]]]]~~ |
|
||||
|
||||
## EntityRuler.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -171,9 +171,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a
|
|||
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Description |
|
||||
| ------ | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
|
||||
## EntityRuler.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -190,10 +190,10 @@ configuration.
|
|||
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | ---------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ |
|
||||
|
||||
## EntityRuler.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -206,9 +206,9 @@ Serialize the entity ruler patterns to a bytestring.
|
|||
> ruler_bytes = ruler.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| **RETURNS** | bytes | The serialized patterns. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------- |
|
||||
| **RETURNS** | The serialized patterns. ~~bytes~~ |
|
||||
|
||||
## EntityRuler.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -222,40 +222,40 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> ruler.from_bytes(ruler_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ---------------------------------- |
|
||||
| `bytes_data` | bytes | The bytestring to load. |
|
||||
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
|
||||
| Name | Description |
|
||||
| ------------ | -------------------------------------------------- |
|
||||
| `bytes_data` | The bytestring to load. ~~bytes~~ |
|
||||
| **RETURNS** | The modified `EntityRuler` object. ~~EntityRuler~~ |
|
||||
|
||||
## EntityRuler.labels {#labels tag="property"}
|
||||
|
||||
All labels present in the match patterns.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------ |
|
||||
| **RETURNS** | tuple | The string labels. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------- |
|
||||
| **RETURNS** | The string labels. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## EntityRuler.ent_ids {#ent_ids tag="property" new="2.2.2"}
|
||||
|
||||
All entity ids present in the match patterns `id` properties.
|
||||
All entity IDs present in the `id` properties of the match patterns.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------- |
|
||||
| **RETURNS** | tuple | The string ent_ids. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------- |
|
||||
| **RETURNS** | The string IDs. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## EntityRuler.patterns {#patterns tag="property"}
|
||||
|
||||
Get all patterns that were added to the entity ruler.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------- |
|
||||
| **RETURNS** | list | The original patterns, one dictionary per pattern. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | The original patterns, one dictionary per pattern. ~~List[Dict[str, Union[str, dict]]]~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | ------------------------------------- | ---------------------------------------------------------------- |
|
||||
| `matcher` | [`Matcher`](/api/matcher) | The underlying matcher used to process token patterns. |
|
||||
| `phrase_matcher` | [`PhraseMatcher`](/api/phrasematcher) | The underlying phrase matcher, used to process phrase patterns. |
|
||||
| `token_patterns` | dict | The token patterns present in the entity ruler, keyed by label. |
|
||||
| `phrase_patterns` | dict | The phrase patterns present in the entity ruler, keyed by label. |
|
||||
| Name | Description |
|
||||
| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
|
||||
| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ |
|
||||
| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]]~~ |
|
||||
| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |
|
||||
|
|
|
@ -8,9 +8,9 @@ new: 3.0
|
|||
|
||||
An `Example` holds the information for one training instance. It stores two
|
||||
`Doc` objects: one for holding the gold-standard reference data, and one for
|
||||
holding the predictions of the pipeline. An [`Alignment`](#alignment-object)
|
||||
object stores the alignment between these two documents, as they can differ in
|
||||
tokenization.
|
||||
holding the predictions of the pipeline. An
|
||||
[`Alignment`](/api/example#alignment-object) object stores the alignment between
|
||||
these two documents, as they can differ in tokenization.
|
||||
|
||||
## Example.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
@ -31,12 +31,12 @@ both documents.
|
|||
> example = Example(predicted, reference)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ----------- | ------------------------------------------------------------------------------------------------ |
|
||||
| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. |
|
||||
| `reference` | `Doc` | The document containing gold-standard annotations. Can not be `None`. |
|
||||
| _keyword-only_ | | |
|
||||
| `alignment` | `Alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
|
||||
| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ |
|
||||
| _keyword-only_ | |
|
||||
| `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
|
||||
|
||||
## Example.from_dict {#from_dict tag="classmethod"}
|
||||
|
||||
|
@ -56,11 +56,11 @@ see the [training format documentation](/api/data-formats#dict-input).
|
|||
> example = Example.from_dict(predicted, {"words": token_ref, "tags": tags_ref})
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------- | ----------------------------------------------------------------- |
|
||||
| `predicted` | `Doc` | The document containing (partial) predictions. Can not be `None`. |
|
||||
| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. |
|
||||
| **RETURNS** | `Example` | The newly constructed object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------- |
|
||||
| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
|
||||
| `example_dict` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ |
|
||||
| **RETURNS** | The newly constructed object. ~~Example~~ |
|
||||
|
||||
## Example.text {#text tag="property"}
|
||||
|
||||
|
@ -72,12 +72,14 @@ The text of the `predicted` document in this `Example`.
|
|||
> raw_text = example.text
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------- |
|
||||
| **RETURNS** | str | The text of the `predicted` document. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | The text of the `predicted` document. ~~str~~ |
|
||||
|
||||
## Example.predicted {#predicted tag="property"}
|
||||
|
||||
The `Doc` holding the predictions. Occasionally also referred to as `example.x`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -86,14 +88,15 @@ The text of the `predicted` document in this `Example`.
|
|||
> set_annotations(docs, predictions)
|
||||
> ```
|
||||
|
||||
The `Doc` holding the predictions. Occassionally also refered to as `example.x`.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------------------- |
|
||||
| **RETURNS** | `Doc` | The document containing (partial) predictions. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The document containing (partial) predictions. ~~Doc~~ |
|
||||
|
||||
## Example.reference {#reference tag="property"}
|
||||
|
||||
The `Doc` holding the gold-standard annotations. Occasionally also referred to
|
||||
as `example.y`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -102,15 +105,15 @@ The `Doc` holding the predictions. Occassionally also refered to as `example.x`.
|
|||
> gold_labels[i][j] = eg.reference.cats.get(label, 0.0)
|
||||
> ```
|
||||
|
||||
The `Doc` holding the gold-standard annotations. Occassionally also refered to
|
||||
as `example.y`.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------- |
|
||||
| **RETURNS** | `Doc` | The document containing gold-standard annotations. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------- |
|
||||
| **RETURNS** | The document containing gold-standard annotations. ~~Doc~~ |
|
||||
|
||||
## Example.alignment {#alignment tag="property"}
|
||||
|
||||
The [`Alignment`](/api/example#alignment-object) object mapping the tokens of
|
||||
the `predicted` document to those of the `reference` document.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -122,15 +125,15 @@ as `example.y`.
|
|||
> assert list(alignment.y2x.data) == [[0], [1], [2], [2]]
|
||||
> ```
|
||||
|
||||
The `Alignment` object mapping the tokens of the `predicted` document to those
|
||||
of the `reference` document.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------- | -------------------------------------------------- |
|
||||
| **RETURNS** | `Alignment` | The document containing gold-standard annotations. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------- |
|
||||
| **RETURNS** | The object mapping the tokens of the `predicted` document to those of the `reference` document. ~~Alignment~~ |
|
||||
|
||||
## Example.get_aligned {#get_aligned tag="method"}
|
||||
|
||||
Get the aligned view of a certain token attribute, denoted by its int ID or
|
||||
string name.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -141,17 +144,18 @@ of the `reference` document.
|
|||
> assert example.get_aligned("TAG", as_string=True) == ["VERB", "DET", "NOUN"]
|
||||
> ```
|
||||
|
||||
Get the aligned view of a certain token attribute, denoted by its int ID or
|
||||
string name.
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ----------- | -------------------------- | ------------------------------------------------------------------ | ------- |
|
||||
| `field` | int or str | Attribute ID or string name | |
|
||||
| `as_string` | bool | Whether or not to return the list of values as strings. | `False` |
|
||||
| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `field` | Attribute ID or string name. ~~Union[int, str]~~ |
|
||||
| `as_string` | Whether or not to return the list of values as strings. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | List of integer values, or string values if `as_string` is `True`. ~~Union[List[int], List[str]]~~ |
|
||||
|
||||
## Example.get_aligned_parse {#get_aligned_parse tag="method"}
|
||||
|
||||
Get the aligned view of the dependency parse. If `projectivize` is set to
|
||||
`True`, non-projective dependency trees are made projective through the
|
||||
Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -161,17 +165,16 @@ string name.
|
|||
> assert proj_heads == [3, 2, 3, 0, 3]
|
||||
> ```
|
||||
|
||||
Get the aligned view of the dependency parse. If `projectivize` is set to
|
||||
`True`, non-projective dependency trees are made projective through the
|
||||
Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| -------------- | -------------------------- | ------------------------------------------------------------------ | ------- |
|
||||
| `projectivize` | bool | Whether or not to projectivize the dependency trees | `True` |
|
||||
| **RETURNS** | `List[int]` or `List[str]` | List of integer values, or string values if `as_string` is `True`. | |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `projectivize` | Whether or not to projectivize the dependency trees. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | List of integer values, or string values if `as_string` is `True`. ~~Union[List[int], List[str]]~~ |
|
||||
|
||||
## Example.get_aligned_ner {#get_aligned_ner tag="method"}
|
||||
|
||||
Get the aligned view of the NER
|
||||
[BILUO](/usage/linguistic-features#accessing-ner) tags.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -184,15 +187,16 @@ Pseudo-Projective Dependency Parsing algorithm by Nivre and Nilsson (2005).
|
|||
> assert ner_tags == ["B-PERSON", "L-PERSON", "O", "O", "U-LOC"]
|
||||
> ```
|
||||
|
||||
Get the aligned view of the NER
|
||||
[BILUO](/usage/linguistic-features#accessing-ner) tags.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------- | ----------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | `List[str]` | List of BILUO values, denoting whether tokens are part of an NER annotation or not. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | List of BILUO values, denoting whether tokens are part of an NER annotation or not. ~~List[str]~~ |
|
||||
|
||||
## Example.get_aligned_spans_y2x {#get_aligned_spans_y2x tag="method"}
|
||||
|
||||
Get the aligned view of any set of [`Span`](/api/span) objects defined over
|
||||
[`Example.reference`](/api/example#reference). The resulting span indices will
|
||||
align to the tokenization in [`Example.predicted`](/api/example#predicted).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -207,17 +211,19 @@ Get the aligned view of the NER
|
|||
> assert [(ent.start, ent.end) for ent in ents_y2x] == [(0, 1)]
|
||||
> ```
|
||||
|
||||
Get the aligned view of any set of [`Span`](/api/span) objects defined over
|
||||
`example.reference`. The resulting span indices will align to the tokenization
|
||||
in `example.predicted`.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | --------------------------------------------------------------- |
|
||||
| `y_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. |
|
||||
| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------- |
|
||||
| `y_spans` | `Span` objects aligned to the tokenization of `reference`. ~~Iterable[Span]~~ |
|
||||
| **RETURNS** | `Span` objects aligned to the tokenization of `predicted`. ~~List[Span]~~ |
|
||||
|
||||
## Example.get_aligned_spans_x2y {#get_aligned_spans_x2y tag="method"}
|
||||
|
||||
Get the aligned view of any set of [`Span`](/api/span) objects defined over
|
||||
[`Example.predicted`](/api/example#predicted). The resulting span indices will
|
||||
align to the tokenization in [`Example.reference`](/api/example#reference). This
|
||||
method is particularly useful to assess the accuracy of predicted entities
|
||||
against the original gold-standard annotation.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -232,15 +238,10 @@ in `example.predicted`.
|
|||
> assert [(ent.start, ent.end) for ent in ents_x2y] == [(0, 2)]
|
||||
> ```
|
||||
|
||||
Get the aligned view of any set of [`Span`](/api/span) objects defined over
|
||||
`example.predicted`. The resulting span indices will align to the tokenization
|
||||
in `example.reference`. This method is particularly useful to assess the
|
||||
accuracy of predicted entities against the original gold-standard annotation.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | --------------------------------------------------------------- |
|
||||
| `x_spans` | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.predicted`. |
|
||||
| **RETURNS** | `Iterable[Span]` | `Span` objects aligned to the tokenization of `self.reference`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------- |
|
||||
| `x_spans` | `Span` objects aligned to the tokenization of `predicted`. ~~Iterable[Span]~~ |
|
||||
| **RETURNS** | `Span` objects aligned to the tokenization of `reference`. ~~List[Span]~~ |
|
||||
|
||||
## Example.to_dict {#to_dict tag="method"}
|
||||
|
||||
|
@ -253,12 +254,14 @@ reference annotation contained in this `Example`.
|
|||
> eg_dict = example.to_dict()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | `Dict[str, Any]` | Dictionary representation of the reference annotation. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------- |
|
||||
| **RETURNS** | Dictionary representation of the reference annotation. ~~Dict[str, Any]~~ |
|
||||
|
||||
## Example.split_sents {#split_sents tag="method"}
|
||||
|
||||
Split one `Example` into multiple `Example` objects, one for each sentence.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
|
@ -271,11 +274,9 @@ reference annotation contained in this `Example`.
|
|||
> assert split_examples[1].text == "had lots of fun"
|
||||
> ```
|
||||
|
||||
Split one `Example` into multiple `Example` objects, one for each sentence.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ---------------------------------------------------------- |
|
||||
| **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------- |
|
||||
| **RETURNS** | List of `Example` objects, one for each original sentence. ~~List[Example]~~ |
|
||||
|
||||
## Alignment {#alignment-object new="3"}
|
||||
|
||||
|
@ -283,10 +284,10 @@ Calculate alignment tables between two tokenizations.
|
|||
|
||||
### Alignment attributes {#alignment-attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | -------------------------------------------------- | ---------------------------------------------------------- |
|
||||
| `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. |
|
||||
| `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. |
|
||||
| Name | Description |
|
||||
| ----- | --------------------------------------------------------------------- |
|
||||
| `x2y` | The `Ragged` object holding the alignment from `x` to `y`. ~~Ragged~~ |
|
||||
| `y2x` | The `Ragged` object holding the alignment from `y` to `x`. ~~Ragged~~ |
|
||||
|
||||
<Infobox title="Important note" variant="warning">
|
||||
|
||||
|
@ -314,8 +315,8 @@ tokenizations add up to the same string. For example, you'll be able to align
|
|||
|
||||
### Alignment.from_strings {#classmethod tag="function"}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------- | ----------------------------------------------- |
|
||||
| `A` | list | String values of candidate tokens to align. |
|
||||
| `B` | list | String values of reference tokens to align. |
|
||||
| **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| `A` | String values of candidate tokens to align. ~~List[str]~~ |
|
||||
| `B` | String values of reference tokens to align. ~~List[str]~~ |
|
||||
| **RETURNS** | An `Alignment` object describing the alignment. ~~Alignment~~ |
|
||||
|
|
|
@ -9,7 +9,7 @@ new: 2.2
|
|||
---
|
||||
|
||||
The `KnowledgeBase` object provides a method to generate
|
||||
[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
|
||||
[`Candidate`](/api/kb/#candidate) objects, which are plausible external
|
||||
identifiers given a certain textual mention. Each such `Candidate` holds
|
||||
information from the relevant KB entities, such as its frequency in text and
|
||||
possible aliases. Each entity in the knowledge base also has a pretrained entity
|
||||
|
@ -27,18 +27,18 @@ Create the knowledge base.
|
|||
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------------- | ------- | ---------------------------------------- |
|
||||
| `vocab` | `Vocab` | A `Vocab` object. |
|
||||
| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
|
||||
| Name | Description |
|
||||
| ---------------------- | ------------------------------------------------ |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `entity_vector_length` | Length of the fixed-size entity vectors. ~~int~~ |
|
||||
|
||||
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
|
||||
|
||||
The length of the fixed-size entity vectors in the knowledge base.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------------- |
|
||||
| **RETURNS** | int | Length of the fixed-size entity vectors. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| **RETURNS** | Length of the fixed-size entity vectors. ~~int~~ |
|
||||
|
||||
## KnowledgeBase.add_entity {#add_entity tag="method"}
|
||||
|
||||
|
@ -53,11 +53,11 @@ vector, which should be of length
|
|||
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | ----------------------------------------------- |
|
||||
| `entity` | str | The unique entity identifier |
|
||||
| `freq` | float | The frequency of the entity in a typical corpus |
|
||||
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------- |
|
||||
| `entity` | The unique entity identifier. ~~str~~ |
|
||||
| `freq` | The frequency of the entity in a typical corpus. ~~float~~ |
|
||||
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
|
||||
|
||||
## KnowledgeBase.set_entities {#set_entities tag="method"}
|
||||
|
||||
|
@ -70,11 +70,11 @@ frequency and entity vector for each entity.
|
|||
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | -------- | --------------------------------- |
|
||||
| `entity_list` | iterable | List of unique entity identifiers |
|
||||
| `freq_list` | iterable | List of entity frequencies |
|
||||
| `vector_list` | iterable | List of entity vectors |
|
||||
| Name | Description |
|
||||
| ------------- | ---------------------------------------------------------------- |
|
||||
| `entity_list` | List of unique entity identifiers. ~~Iterable[Union[str, int]]~~ |
|
||||
| `freq_list` | List of entity frequencies. ~~Iterable[int]~~ |
|
||||
| `vector_list` | List of entity vectors. ~~Iterable[numpy.ndarray]~~ |
|
||||
|
||||
## KnowledgeBase.add_alias {#add_alias tag="method"}
|
||||
|
||||
|
@ -90,11 +90,11 @@ should not exceed 1.
|
|||
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | -------- | -------------------------------------------------- |
|
||||
| `alias` | str | The textual mention or alias |
|
||||
| `entities` | iterable | The potential entities that the alias may refer to |
|
||||
| `probabilities` | iterable | The prior probabilities of each entity |
|
||||
| Name | Description |
|
||||
| --------------- | --------------------------------------------------------------------------------- |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| `entities` | The potential entities that the alias may refer to. ~~Iterable[Union[str, int]]~~ |
|
||||
| `probabilities` | The prior probabilities of each entity. ~~Iterable[float]~~ |
|
||||
|
||||
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -106,9 +106,9 @@ Get the total number of entities in the knowledge base.
|
|||
> total_entities = len(kb)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------- |
|
||||
| **RETURNS** | int | The number of entities in the knowledge base. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------- |
|
||||
| **RETURNS** | The number of entities in the knowledge base. ~~int~~ |
|
||||
|
||||
## KnowledgeBase.get_entity_strings {#get_entity_strings tag="method"}
|
||||
|
||||
|
@ -120,9 +120,9 @@ Get a list of all entity IDs in the knowledge base.
|
|||
> all_entities = kb.get_entity_strings()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------- |
|
||||
| **RETURNS** | list | The list of entities in the knowledge base. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------- |
|
||||
| **RETURNS** | The list of entities in the knowledge base. ~~List[str]~~ |
|
||||
|
||||
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
|
||||
|
||||
|
@ -134,9 +134,9 @@ Get the total number of aliases in the knowledge base.
|
|||
> total_aliases = kb.get_size_aliases()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------- |
|
||||
| **RETURNS** | int | The number of aliases in the knowledge base. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------- |
|
||||
| **RETURNS** | The number of aliases in the knowledge base. ~~int~~ |
|
||||
|
||||
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
|
||||
|
||||
|
@ -148,14 +148,14 @@ Get a list of all aliases in the knowledge base.
|
|||
> all_aliases = kb.get_alias_strings()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| **RETURNS** | list | The list of aliases in the knowledge base. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------- |
|
||||
| **RETURNS** | The list of aliases in the knowledge base. ~~List[str]~~ |
|
||||
|
||||
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb/#candidate_init).
|
||||
of type [`Candidate`](/api/kb/#candidate).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -163,10 +163,10 @@ of type [`Candidate`](/api/kb/#candidate_init).
|
|||
> candidates = kb.get_candidates("Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ---------------------------------------- |
|
||||
| `alias` | str | The textual mention or alias |
|
||||
| **RETURNS** | iterable | The list of relevant `Candidate` objects |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------- |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| **RETURNS** | The list of relevant `Candidate` objects. ~~List[Candidate]~~ |
|
||||
|
||||
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
||||
|
||||
|
@ -178,10 +178,10 @@ Given a certain entity ID, retrieve its pretrained entity vector.
|
|||
> vector = kb.get_vector("Q42")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ----------------- |
|
||||
| `entity` | str | The entity ID |
|
||||
| **RETURNS** | vector | The entity vector |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------ |
|
||||
| `entity` | The entity ID. ~~str~~ |
|
||||
| **RETURNS** | The entity vector. ~~numpy.ndarray~~ |
|
||||
|
||||
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
|
||||
|
||||
|
@ -194,11 +194,11 @@ probability of the fact that the mention links to the entity ID.
|
|||
> probability = kb.get_prior_prob("Q42", "Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------- |
|
||||
| `entity` | str | The entity ID |
|
||||
| `alias` | str | The textual mention or alias |
|
||||
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------- |
|
||||
| `entity` | The entity ID. ~~str~~ |
|
||||
| `alias` | The textual mention or alias. ~~str~~ |
|
||||
| **RETURNS** | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||
|
||||
## KnowledgeBase.dump {#dump tag="method"}
|
||||
|
||||
|
@ -210,9 +210,9 @@ Save the current state of the knowledge base to a directory.
|
|||
> kb.dump(loc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Description |
|
||||
| ----- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `loc` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
|
||||
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
|
||||
|
||||
|
@ -229,12 +229,20 @@ Restore the state of the knowledge base from a given directory. Note that the
|
|||
> kb.load_bulk("/path/to/kb")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `loc` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The modified `KnowledgeBase` object. ~~KnowledgeBase~~ |
|
||||
|
||||
## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
|
||||
## Candidate {#candidate tag="class"}
|
||||
|
||||
A `Candidate` object refers to a textual mention (alias) that may or may not be
|
||||
resolved to a specific entity from a `KnowledgeBase`. This will be used as input
|
||||
for the entity linking algorithm which will disambiguate the various candidates
|
||||
to the correct one. Each candidate `(alias, entity)` pair is assigned to a
|
||||
certain prior probability.
|
||||
|
||||
### Candidate.\_\_init\_\_ {#candidate-init tag="method"}
|
||||
|
||||
Construct a `Candidate` object. Usually this constructor is not called directly,
|
||||
but instead these objects are returned by the
|
||||
|
@ -247,22 +255,22 @@ but instead these objects are returned by the
|
|||
> candidate = Candidate(kb, entity_hash, entity_freq, entity_vector, alias_hash, prior_prob)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | --------------- | -------------------------------------------------------------- |
|
||||
| `kb` | `KnowledgeBase` | The knowledge base that defined this candidate. |
|
||||
| `entity_hash` | int | The hash of the entity's KB ID. |
|
||||
| `entity_freq` | float | The entity frequency as recorded in the KB. |
|
||||
| `alias_hash` | int | The hash of the textual mention or alias. |
|
||||
| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
|
||||
| Name | Description |
|
||||
| ------------- | ------------------------------------------------------------------------- |
|
||||
| `kb` | The knowledge base that defined this candidate. ~~KnowledgeBase~~ |
|
||||
| `entity_hash` | The hash of the entity's KB ID. ~~int~~ |
|
||||
| `entity_freq` | The entity frequency as recorded in the KB. ~~float~~ |
|
||||
| `alias_hash` | The hash of the textual mention or alias. ~~int~~ |
|
||||
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||
|
||||
## Candidate attributes {#candidate_attributes}
|
||||
### Candidate attributes {#candidate-attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | -------------------------------------------------------------- |
|
||||
| `entity` | int | The entity's unique KB identifier |
|
||||
| `entity_` | str | The entity's unique KB identifier |
|
||||
| `alias` | int | The alias or textual mention |
|
||||
| `alias_` | str | The alias or textual mention |
|
||||
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
|
||||
| `entity_freq` | long | The frequency of the entity in a typical corpus |
|
||||
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||
| Name | Description |
|
||||
| --------------- | ------------------------------------------------------------------------ |
|
||||
| `entity` | The entity's unique KB identifier. ~~int~~ |
|
||||
| `entity_` | The entity's unique KB identifier. ~~str~~ |
|
||||
| `alias` | The alias or textual mention. ~~int~~ |
|
||||
| `alias_` | The alias or textual mention. ~~str~~ |
|
||||
| `prior_prob` | The prior probability of the `alias` referring to the `entity`. ~~float~~ |
|
||||
| `entity_freq` | The frequency of the entity in a typical corpus. ~~float~~ |
|
||||
| `entity_vector` | The pretrained vector of the entity. ~~numpy.ndarray~~ |
|
||||
|
|
|
@ -32,13 +32,13 @@ Initialize a `Language` object.
|
|||
> nlp = Language(Vocab())
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ----------- | ------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | `Vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. |
|
||||
| _keyword-only_ | | |
|
||||
| `max_length` | int | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. |
|
||||
| `meta` | dict | Custom meta data for the `Language` class. Is written to by models to add model meta data. |
|
||||
| `create_tokenizer` | `Callable` | Optional function that receives the `nlp` object and returns a tokenizer. |
|
||||
| Name | Description |
|
||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | A `Vocab` object. If `True`, a vocab is created using the default language data settings. ~~Vocab~~ |
|
||||
| _keyword-only_ | |
|
||||
| `max_length` | Maximum number of characters allowed in a single text. Defaults to `10 ** 6`. ~~int~~ |
|
||||
| `meta` | Custom meta data for the `Language` class. Is written to by models to add model meta data. ~~dict~~ |
|
||||
| `create_tokenizer` | Optional function that receives the `nlp` object and returns a tokenizer. ~~Callable[[Language], Callable[[str], Doc]]~~ |
|
||||
|
||||
## Language.from_config {#from_config tag="classmethod"}
|
||||
|
||||
|
@ -58,14 +58,14 @@ model under the hood based on its [`config.cfg`](/api/data-formats#config).
|
|||
> nlp = Language.from_config(config)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `config` | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | The loaded config. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `config` | The loaded config. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | `Iterable[str]` | List of pipeline component names to disable. |
|
||||
| `auto_fill` | bool | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. |
|
||||
| `validate` | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||
| **RETURNS** | `Language` | The initialized object. |
|
||||
| `disable` | List of pipeline component names to disable. ~~Iterable[str]~~ |
|
||||
| `auto_fill` | Whether to automatically fill in missing values in the config, based on defaults and function argument annotations. Defaults to `True`. ~~bool~~ |
|
||||
| `validate` | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The initialized object. ~~Language~~ |
|
||||
|
||||
## Language.component {#component tag="classmethod" new="3"}
|
||||
|
||||
|
@ -94,16 +94,14 @@ decorator. For more details and examples, see the
|
|||
> Language.component("my_component2", func=my_component)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | The name of the component factory. |
|
||||
| _keyword-only_ | | |
|
||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis).. |
|
||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||
| `func` | `Optional[Callable]` | Optional function if not used a a decorator. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | The name of the component factory. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
|
||||
|
||||
## Language.factory {#factory tag="classmethod"}
|
||||
|
||||
|
@ -141,17 +139,17 @@ examples, see the
|
|||
> )
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | The name of the component factory. |
|
||||
| _keyword-only_ | | |
|
||||
| `default_config` | `Dict[str, any]` | The default config, describing the default values of the factory arguments. |
|
||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||
| `func` | `Optional[Callable]` | Optional function if not used a a decorator. |
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | The name of the component factory. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `scores` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
|
||||
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
|
||||
## Language.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -165,13 +163,13 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
|
|||
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ----------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `text` | str | The text to be processed. |
|
||||
| _keyword-only_ | | |
|
||||
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
||||
| **RETURNS** | [`Doc`](/api/doc) | A container for accessing the annotations. |
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `text` | The text to be processed. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| **RETURNS** | A container for accessing the annotations. ~~Doc~~ |
|
||||
|
||||
## Language.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -186,17 +184,17 @@ more efficient than processing texts one-by-one.
|
|||
> assert doc.is_parsed
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------------ | ----------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts` | `Iterable[str]` | A sequence of strings. |
|
||||
| _keyword-only_ | | |
|
||||
| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. |
|
||||
| `batch_size` | int | The number of texts to buffer. |
|
||||
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| `cleanup` | bool | If `True`, unneeded strings are freed to control memory use. Experimental. |
|
||||
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
||||
| `n_process` <Tag variant="new">2.2.2</Tag> | int | Number of processors to use, only supported in Python 3. Defaults to `1`. |
|
||||
| **YIELDS** | `Doc` | Documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `texts` | A sequence of strings. ~~Iterable[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `as_tuples` | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. ~~bool~~ |
|
||||
| `batch_size` | The number of texts to buffer. ~~int~~ |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||
| `cleanup` | If `True`, unneeded strings are freed to control memory use. Experimental. ~~bool~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `n_process` <Tag variant="new">2.2.2</Tag> | Number of processors to use. Defaults to `1`. ~~int~~ |
|
||||
| **YIELDS** | Documents in the order of the original text. ~~Doc~~ |
|
||||
|
||||
## Language.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -225,12 +223,12 @@ tuples of `Doc` and `GoldParse` objects.
|
|||
> optimizer = nlp.begin_training(get_examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Optional[Callable[[], Iterable[Example]]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Language.resume_training {#resume_training tag="method,experimental" new="3"}
|
||||
|
||||
|
@ -248,11 +246,11 @@ a batch of [Example](/api/example) objects.
|
|||
> nlp.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/language#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Language.update {#update tag="method"}
|
||||
|
||||
|
@ -282,15 +280,15 @@ and custom registered functions if needed. See the
|
|||
> nlp.update([example], sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Dictionary to update with the loss, keyed by pipeline component. |
|
||||
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Language.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
|
@ -305,14 +303,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
|||
> losses = nlp.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Dictionary to update with the loss, keyed by pipeline component. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Language.evaluate {#evaluate tag="method"}
|
||||
|
||||
|
@ -328,20 +326,19 @@ objects instead of tuples of `Doc` and `GoldParse` objects.
|
|||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> scores = nlp.evaluate(examples, verbose=True)
|
||||
> scores = nlp.evaluate(examples)
|
||||
> print(scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------------------------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| _keyword-only_ | | |
|
||||
| `verbose` | bool | Print debugging information. |
|
||||
| `batch_size` | int | The batch size to use. |
|
||||
| `scorer` | `Scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. |
|
||||
| `component_cfg` | `Dict[str, dict]` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. |
|
||||
| `scorer_cfg` | `Dict[str, Any]` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. |
|
||||
| **RETURNS** | `Dict[str, Union[float, dict]]` | A dictionary of evaluation scores. |
|
||||
| Name | Description |
|
||||
| --------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to evaluate on. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The batch size to use. ~~int~~ |
|
||||
| `scorer` | Optional [`Scorer`](/api/scorer) to use. If not passed in, a new one will be created. ~~Optional[Scorer]~~ |
|
||||
| `component_cfg` | Optional dictionary of keyword arguments for components, keyed by component names. Defaults to `None`. ~~Optional[Dict[str, Dict[str, Any]]]~~ |
|
||||
| `scorer_cfg` | Optional dictionary of keyword arguments for the `Scorer`. Defaults to `None`. ~~Optional[Dict[str, Any]]~~ |
|
||||
| **RETURNS** | A dictionary of evaluation scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Language.use_params {#use_params tag="contextmanager, method"}
|
||||
|
||||
|
@ -356,9 +353,9 @@ their original weights after the block.
|
|||
> nlp.to_disk("/tmp/checkpoint")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | --------------------------------------------- |
|
||||
| `params` | dict | A dictionary of parameters keyed by model ID. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------------ |
|
||||
| `params` | A dictionary of parameters keyed by model ID. ~~dict~~ |
|
||||
|
||||
## Language.create_pipe {#create_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -380,14 +377,14 @@ To create a component and add it to the pipeline, you should always use
|
|||
> parser = nlp.create_pipe("parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | str | Name of the registered component factory. |
|
||||
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
|
||||
| _keyword-only_ | | |
|
||||
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. |
|
||||
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||
| **RETURNS** | callable | The pipeline component. |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.add_pipe {#add_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -423,19 +420,19 @@ component, adds it to the pipeline and returns it.
|
|||
> nlp.add_pipe("ner", source=source_nlp)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | str | Name of the registered component factory. |
|
||||
| `name` | str | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. |
|
||||
| _keyword-only_ | | |
|
||||
| `before` | str / int | Component name or index to insert component directly before. |
|
||||
| `after` | str / int | Component name or index to insert component directly after. |
|
||||
| `first` | bool | Insert component first / not first in the pipeline. |
|
||||
| `last` | bool | Insert component last / not last in the pipeline. |
|
||||
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. |
|
||||
| `source` <Tag variant="new">3</Tag> | `Language` | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. |
|
||||
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||
| **RETURNS** <Tag variant="new">3</Tag> | callable | The pipeline component. |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory_name` | Name of the registered component factory. ~~str~~ |
|
||||
| `name` | Optional unique name of pipeline component instance. If not set, the factory name is used. An error is raised if the name already exists in the pipeline. ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `before` | Component name or index to insert component directly before. ~~Optional[Union[str, int]]~~ |
|
||||
| `after` | Component name or index to insert component directly after. ~~Optional[Union[str, int]]~~ |
|
||||
| `first` | Insert component first / not first in the pipeline. ~~Optional[bool]~~ |
|
||||
| `last` | Insert component last / not last in the pipeline. ~~Optional[bool]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for this component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `source` <Tag variant="new">3</Tag> | Optional source model to copy component from. If a source is provided, the `factory_name` is interpreted as the name of the component in the source pipeline. Make sure that the vocab, vectors and settings of the source model match the target model. ~~Optional[Language]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.has_factory {#has_factory tag="classmethod" new="3"}
|
||||
|
||||
|
@ -459,10 +456,10 @@ the `Language` base class, available to all subclasses.
|
|||
> assert not Language.has_factory("component")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------------------------------- |
|
||||
| `name` | str | Name of the pipeline factory to check. |
|
||||
| **RETURNS** | bool | Whether a factory of that name is registered on the class. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------- |
|
||||
| `name` | Name of the pipeline factory to check. ~~str~~ |
|
||||
| **RETURNS** | Whether a factory of that name is registered on the class. ~~bool~~ |
|
||||
|
||||
## Language.has_pipe {#has_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -481,10 +478,10 @@ Check whether a component is present in the pipeline. Equivalent to
|
|||
> assert nlp.has_pipe("my_component")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------------- |
|
||||
| `name` | str | Name of the pipeline component to check. |
|
||||
| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------- |
|
||||
| `name` | Name of the pipeline component to check. ~~str~~ |
|
||||
| **RETURNS** | Whether a component of that name exists in the pipeline. ~~bool~~ |
|
||||
|
||||
## Language.get_pipe {#get_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -497,28 +494,37 @@ Get a pipeline component for a given component name.
|
|||
> custom_component = nlp.get_pipe("custom_component")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | -------------------------------------- |
|
||||
| `name` | str | Name of the pipeline component to get. |
|
||||
| **RETURNS** | callable | The pipeline component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| `name` | Name of the pipeline component to get. ~~str~~ |
|
||||
| **RETURNS** | The pipeline component. ~~Callable[[Doc], Doc]~~ |
|
||||
|
||||
## Language.replace_pipe {#replace_pipe tag="method" new="2"}
|
||||
|
||||
Replace a component in the pipeline.
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
As of v3.0, the `Language.replace_pipe` method doesn't take callables anymore
|
||||
and instead expects the **name of a component factory** registered using
|
||||
[`@Language.component`](/api/language#component) or
|
||||
[`@Language.factory`](/api/language#factory).
|
||||
|
||||
</Infobox>
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp.replace_pipe("parser", "my_custom_parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the component to replace. |
|
||||
| `component` | callable | The pipeline component to insert. |
|
||||
| _keyword-only_ | | |
|
||||
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. |
|
||||
| `validate` <Tag variant="new">3</Tag> | bool | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. |
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | Name of the component to replace. ~~str~~ |
|
||||
| `component` | The factory name of the component to insert. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config parameters to use for the new component. Will be merged with the `default_config` specified by the component factory. ~~Optional[Dict[str, Any]]~~ |
|
||||
| `validate` <Tag variant="new">3</Tag> | Whether to validate the component config and arguments against the types expected by the factory. Defaults to `True`. ~~bool~~ |
|
||||
|
||||
## Language.rename_pipe {#rename_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -533,10 +539,10 @@ added to the pipeline, you can also use the `name` argument on
|
|||
> nlp.rename_pipe("parser", "spacy_parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | -------------------------------- |
|
||||
| `old_name` | str | Name of the component to rename. |
|
||||
| `new_name` | str | New name of the component. |
|
||||
| Name | Description |
|
||||
| ---------- | ---------------------------------------- |
|
||||
| `old_name` | Name of the component to rename. ~~str~~ |
|
||||
| `new_name` | New name of the component. ~~str~~ |
|
||||
|
||||
## Language.remove_pipe {#remove_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -550,10 +556,10 @@ component function.
|
|||
> assert name == "parser"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------------------- |
|
||||
| `name` | str | Name of the component to remove. |
|
||||
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------ |
|
||||
| `name` | Name of the component to remove. ~~str~~ |
|
||||
| **RETURNS** | A `(name, component)` tuple of the removed component. ~~Tuple[str, Callable[[Doc], Doc]]~~ |
|
||||
|
||||
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
|
||||
|
||||
|
@ -589,12 +595,12 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | | |
|
||||
| `disable` | str / list | Name(s) of pipeline components to disable. |
|
||||
| `enable` | str / list | Name(s) of pipeline components that will not be disabled. |
|
||||
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
|
||||
|
||||
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
|
||||
|
||||
|
@ -613,10 +619,10 @@ information about the component and its default provided by the
|
|||
> print(factory_meta.default_config)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ------------------ |
|
||||
| `name` | str | The factory name. |
|
||||
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------- |
|
||||
| `name` | The factory name. ~~str~~ |
|
||||
| **RETURNS** | The factory meta. ~~FactoryMeta~~ |
|
||||
|
||||
## Language.get_pipe_meta {#get_pipe_meta tag="method" new="3"}
|
||||
|
||||
|
@ -636,10 +642,10 @@ contains the information about the component and its default provided by the
|
|||
> print(factory_meta.default_config)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ---------------------------- |
|
||||
| `name` | str | The pipeline component name. |
|
||||
| **RETURNS** | [`FactoryMeta`](#factorymeta) | The factory meta. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------ |
|
||||
| `name` | The pipeline component name. ~~str~~ |
|
||||
| **RETURNS** | The factory meta. ~~FactoryMeta~~ |
|
||||
|
||||
## Language.analyze_pipes {#analyze_pipes tag="method" new="3"}
|
||||
|
||||
|
@ -725,12 +731,12 @@ token.ent_iob, token.ent_type
|
|||
|
||||
</Accordion>
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `keys` | `List[str]` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. |
|
||||
| `pretty` | bool | Pretty-print the results as a table. Defaults to `False`. |
|
||||
| **RETURNS** | dict | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `keys` | The values to display in the table. Corresponds to attributes of the [`FactoryMeta`](/api/language#factorymeta). Defaults to `["assigns", "requires", "scores", "retokenizes"]`. ~~List[str]~~ |
|
||||
| `pretty` | Pretty-print the results as a table. Defaults to `False`. ~~bool~~ |
|
||||
| **RETURNS** | Dictionary containing the pipe analysis, keyed by `"summary"` (component meta by pipe), `"problems"` (attribute names by pipe) and `"attrs"` (pipes that assign and require an attribute, keyed by attribute). ~~Optional[Dict[str, Any]]~~ |
|
||||
|
||||
## Language.meta {#meta tag="property"}
|
||||
|
||||
|
@ -744,9 +750,9 @@ data of the model. The `Language.meta` is also what's serialized as the
|
|||
> print(nlp.meta)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------- |
|
||||
| **RETURNS** | dict | The meta data. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------- |
|
||||
| **RETURNS** | The meta data. ~~Dict[str, Any]~~ |
|
||||
|
||||
## Language.config {#config tag="property" new="3"}
|
||||
|
||||
|
@ -765,9 +771,9 @@ subclass of the built-in `dict`. It supports the additional methods `to_disk`
|
|||
> print(nlp.config.to_str())
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | ----------- |
|
||||
| **RETURNS** | [`Config`](https://thinc.ai/docs/api-config#config) | The config. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------- |
|
||||
| **RETURNS** | The config. ~~Config~~ |
|
||||
|
||||
## Language.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
|
@ -780,11 +786,11 @@ the model**.
|
|||
> nlp.to_disk("/path/to/models")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Language.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -806,12 +812,12 @@ loaded object.
|
|||
> nlp = English().from_disk("/path/to/en_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Language` | The modified `Language` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Language` object. ~~Language~~ |
|
||||
|
||||
## Language.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -823,11 +829,11 @@ Serialize the current state to a binary string.
|
|||
> nlp_bytes = nlp.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Language` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Language` object. ~~bytes~~ |
|
||||
|
||||
## Language.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -845,35 +851,35 @@ available to the loaded object.
|
|||
> nlp2.from_bytes(nlp_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Language` | The `Language` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Language` object. ~~Language~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------------- | ---------------------- | ---------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | A container for the lexical types. |
|
||||
| `tokenizer` | `Tokenizer` | The tokenizer. |
|
||||
| `make_doc` | `Callable` | Callable that takes a string and returns a `Doc`. |
|
||||
| `pipeline` | `List[str, Callable]` | List of `(name, component)` tuples describing the current processing pipeline, in order. |
|
||||
| `pipe_names` <Tag variant="new">2</Tag> | `List[str]` | List of pipeline component names, in order. |
|
||||
| `pipe_labels` <Tag variant="new">2.2</Tag> | `Dict[str, List[str]]` | List of labels set by the pipeline components, if available, keyed by component name. |
|
||||
| `pipe_factories` <Tag variant="new">2.2</Tag> | `Dict[str, str]` | Dictionary of pipeline component names, mapped to their factory names. |
|
||||
| `factories` | `Dict[str, Callable]` | All available factory functions, keyed by name. |
|
||||
| `factory_names` <Tag variant="new">3</Tag> | `List[str]` | List of all available factory names. |
|
||||
| `path` <Tag variant="new">2</Tag> | `Path` | Path to the model data directory, if a model is loaded. Otherwise `None`. |
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A container for the lexical types. ~~Vocab~~ |
|
||||
| `tokenizer` | The tokenizer. ~~Tokenizer~~ |
|
||||
| `make_doc` | Callable that takes a string and returns a `Doc`. ~~Callable[[str], Doc]~~ |
|
||||
| `pipeline` | List of `(name, component)` tuples describing the current processing pipeline, in order. ~~List[Tuple[str, Callable[[Doc], Doc]]]~~ |
|
||||
| `pipe_names` <Tag variant="new">2</Tag> | List of pipeline component names, in order. ~~List[str]~~ |
|
||||
| `pipe_labels` <Tag variant="new">2.2</Tag> | List of labels set by the pipeline components, if available, keyed by component name. ~~Dict[str, List[str]]~~ |
|
||||
| `pipe_factories` <Tag variant="new">2.2</Tag> | Dictionary of pipeline component names, mapped to their factory names. ~~Dict[str, str]~~ |
|
||||
| `factories` | All available factory functions, keyed by name. ~~Dict[str, Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
| `factory_names` <Tag variant="new">3</Tag> | List of all available factory names. ~~List[str]~~ |
|
||||
| `path` <Tag variant="new">2</Tag> | Path to the model data directory, if a model is loaded. Otherwise `None`. ~~Optional[Path]~~ |
|
||||
|
||||
## Class attributes {#class-attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
||||
| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
||||
| `default_config` | dict | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). |
|
||||
| Name | Description |
|
||||
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `Defaults` | Settings, data and factory methods for creating the `nlp` object and processing pipeline. ~~Defaults~~ |
|
||||
| `lang` | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). ~~str~~ |
|
||||
| `default_config` | Base [config](/usage/training#config) to use for [Language.config](/api/language#config). Defaults to [`default_config.cfg`](https://github.com/explosion/spaCy/tree/develop/spacy/default_config.cfg). ~~Config~~ |
|
||||
|
||||
## Defaults {#defaults}
|
||||
|
||||
|
@ -907,16 +913,16 @@ customize the default language data:
|
|||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `stop_words` | List of stop words, used for `Token.is_stop`.<br />**Example:** [`stop_words.py`][stop_words.py] |
|
||||
| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.<br />**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] |
|
||||
| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.<br />**Example:** [`puncutation.py`][punctuation.py] |
|
||||
| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.<br />**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] |
|
||||
| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.<br />**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] |
|
||||
| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.<br />**Example:** [`lex_attrs.py`][lex_attrs.py] |
|
||||
| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).<br />**Example:** [`syntax_iterators.py`][syntax_iterators.py]. |
|
||||
| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}.`.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] |
|
||||
| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] |
|
||||
| --------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `stop_words` | List of stop words, used for `Token.is_stop`.<br />**Example:** [`stop_words.py`][stop_words.py] ~~Set[str]~~ |
|
||||
| `tokenizer_exceptions` | Tokenizer exception rules, string mapped to list of token attributes.<br />**Example:** [`de/tokenizer_exceptions.py`][de/tokenizer_exceptions.py] ~~Dict[str, List[dict]]~~ |
|
||||
| `prefixes`, `suffixes`, `infixes` | Prefix, suffix and infix rules for the default tokenizer.<br />**Example:** [`punctuation.py`][punctuation.py] ~~Optional[List[Union[str, Pattern]]]~~ |
|
||||
| `token_match` | Optional regex for matching strings that should never be split, overriding the infix rules.<br />**Example:** [`fr/tokenizer_exceptions.py`][fr/tokenizer_exceptions.py] ~~Optional[Pattern]~~ |
|
||||
| `url_match` | Regular expression for matching URLs. Prefixes and suffixes are removed before applying the match.<br />**Example:** [`tokenizer_exceptions.py`][tokenizer_exceptions.py] ~~Optional[Pattern]~~ |
|
||||
| `lex_attr_getters` | Custom functions for setting lexical attributes on tokens, e.g. `like_num`.<br />**Example:** [`lex_attrs.py`][lex_attrs.py] ~~Dict[int, Callable[[str], Any]]~~ |
|
||||
| `syntax_iterators` | Functions that compute views of a `Doc` object based on its syntax. At the moment, only used for [noun chunks](/usage/linguistic-features#noun-chunks).<br />**Example:** [`syntax_iterators.py`][syntax_iterators.py]. ~~Dict[str, Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
|
||||
| `writing_system` | Information about the language's writing system, available via `Vocab.writing_system`. Defaults to: `{"direction": "ltr", "has_case": True, "has_letters": True}`.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Dict[str, Any]~~ |
|
||||
| `config` | Default [config](/usage/training#config) added to `nlp.config`. This can include references to custom tokenizers or lemmatizers.<br />**Example:** [`zh/__init__.py`][zh/__init__.py] ~~Config~~ |
|
||||
|
||||
[stop_words.py]:
|
||||
https://github.com/explosion/spaCy/tree/master/spacy/lang/en/stop_words.py
|
||||
|
@ -963,12 +969,12 @@ provided by the [`@Language.component`](/api/language#component) or
|
|||
component is defined and stored on the `Language` class for each component
|
||||
instance and factory instance.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `factory` | str | The name of the registered component factory. |
|
||||
| `default_config` | `Dict[str, Any]` | The default config, describing the default values of the factory arguments. |
|
||||
| `assigns` | `Iterable[str]` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `requires` | `Iterable[str]` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `retokenizes` | bool | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `scores` | `Iterable[str]` | All scores set by the components if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). |
|
||||
| `default_score_weights` | `Dict[str, float]` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. |
|
||||
| Name | Description |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `factory` | The name of the registered component factory. ~~str~~ |
|
||||
| `default_config` | The default config, describing the default values of the factory arguments. ~~Dict[str, Any]~~ |
|
||||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `scores` | All scores set by the component if it's trainable, e.g. `["ents_f", "ents_r", "ents_p"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. ~~Dict[str, float]~~ |
|
||||
|
|
|
@ -36,11 +36,9 @@ tags is available in the pipeline and runs _before_ the lemmatizer.
|
|||
The default config is defined by the pipeline component factory and describes
|
||||
how the component should be configured. You can override its settings via the
|
||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||
[`config.cfg` for training](/usage/training#config).
|
||||
|
||||
For examples of the lookups data formats used by the lookup and rule-based
|
||||
lemmatizers, see the
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) repo.
|
||||
[`config.cfg` for training](/usage/training#config). For examples of the lookups
|
||||
data formats used by the lookup and rule-based lemmatizers, see
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -49,12 +47,12 @@ lemmatizers, see the
|
|||
> nlp.add_pipe("lemmatizer", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ----------- | ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------- |
|
||||
| `mode` | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. | `"lookup"` |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from `spacy-lookups-data`. | `None` |
|
||||
| `overwrite` | bool | Whether to overwrite existing lemmas. | `False` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Not yet implemented:** the model to use. | `None` |
|
||||
| Setting | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||
| `lookups` | The lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. If `None`, default tables are loaded from [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data). Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
| `overwrite` | Whether to overwrite existing lemmas. Defaults to `False`. ~~bool~~ |
|
||||
| `model` | **Not yet implemented:** the model to use. ~~Model~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/lemmatizer.py
|
||||
|
@ -77,15 +75,15 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | [`Vocab`](/api/vocab) | The vocab. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model (not yet implemented). |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | **Not yet implemented:** The model to use. ~~Model~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | | |
|
||||
| mode | str | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. |
|
||||
| lookups | [`Lookups`](/api/lookups) | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. |
|
||||
| overwrite | bool | Whether to overwrite existing lemmas. |
|
||||
| `mode` | The lemmatizer mode, e.g. `"lookup"` or `"rule"`. Defaults to `"lookup"`. ~~str~~ |
|
||||
| `lookups` | A lookups object containing the tables such as `"lemma_rules"`, `"lemma_index"`, `"lemma_exc"` and `"lemma_lookup"`. Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
| `overwrite` | Whether to overwrite existing lemmas. ~~bool~~ |
|
||||
|
||||
## Lemmatizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -102,10 +100,10 @@ and all pipeline components are applied to the `Doc` in order.
|
|||
> processed = lemmatizer(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## Lemmatizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -121,12 +119,12 @@ applied to the `Doc` in order.
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Lemmatizer.lookup_lemmatize {#lookup_lemmatize tag="method"}
|
||||
|
||||
|
@ -134,39 +132,39 @@ Lemmatize a token using a lookup-based approach. If no lemma is found, the
|
|||
original string is returned. Languages can provide a
|
||||
[lookup table](/usage/adding-languages#lemmatizer) via the `Lookups`.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | ------------------------------------- |
|
||||
| `token` | [`Token`](/api/token) | The token to lemmatize. |
|
||||
| **RETURNS** | `List[str]` | A list containing one or more lemmas. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| `token` | The token to lemmatize. ~~Token~~ |
|
||||
| **RETURNS** | A list containing one or more lemmas. ~~List[str]~~ |
|
||||
|
||||
## Lemmatizer.rule_lemmatize {#rule_lemmatize tag="method"}
|
||||
|
||||
Lemmatize a token using a rule-based approach. Typically relies on POS tags.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | ------------------------------------- |
|
||||
| `token` | [`Token`](/api/token) | The token to lemmatize. |
|
||||
| **RETURNS** | `List[str]` | A list containing one or more lemmas. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| `token` | The token to lemmatize. ~~Token~~ |
|
||||
| **RETURNS** | A list containing one or more lemmas. ~~List[str]~~ |
|
||||
|
||||
## Lemmatizer.is_base_form {#is_base_form tag="method"}
|
||||
|
||||
Check whether we're dealing with an uninflected paradigm, so we can avoid
|
||||
lemmatization entirely.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `token` | [`Token`](/api/token) | The token to analyze. |
|
||||
| **RETURNS** | bool | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| `token` | The token to analyze. ~~Token~~ |
|
||||
| **RETURNS** | Whether the token's attributes (e.g., part-of-speech tag, morphological features) describe a base form. ~~bool~~ |
|
||||
|
||||
## Lemmatizer.get_lookups_config {#get_lookups_config tag="classmethod"}
|
||||
|
||||
Returns the lookups configuration settings for a given mode for use in
|
||||
[`Lemmatizer.load_lookups`](#load_lookups).
|
||||
[`Lemmatizer.load_lookups`](/api/lemmatizer#load_lookups).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------------- |
|
||||
| `mode` | str | The lemmatizer mode. |
|
||||
| **RETURNS** | dict | The lookups configuration settings for this mode. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `mode` | The lemmatizer mode. ~~str~~ |
|
||||
| **RETURNS** | The lookups configuration settings for this mode. Includes the keys `"required_tables"` and `"optional_tables"`, mapped to a list of table string names. ~~Dict[str, List[str]]~~ |
|
||||
|
||||
## Lemmatizer.load_lookups {#load_lookups tag="classmethod"}
|
||||
|
||||
|
@ -174,12 +172,12 @@ Load and validate lookups tables. If the provided lookups is `None`, load the
|
|||
default lookups tables according to the language and mode settings. Confirm that
|
||||
all required tables for the language and mode are present.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------- | ---------------------------------------------------------------------------- |
|
||||
| `lang` | str | The language. |
|
||||
| `mode` | str | The lemmatizer mode. |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The provided lookups, may be `None` if the default lookups should be loaded. |
|
||||
| **RETURNS** | [`Lookups`](/api/lookups) | The lookups object. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | The language. ~~str~~ |
|
||||
| `mode` | The lemmatizer mode. ~~str~~ |
|
||||
| `lookups` | The provided lookups, may be `None` if the default lookups should be loaded. ~~Optional[Lookups]~~ |
|
||||
| **RETURNS** | The lookups. ~~Lookups~~ |
|
||||
|
||||
## Lemmatizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -192,11 +190,11 @@ Serialize the pipe to disk.
|
|||
> lemmatizer.to_disk("/path/to/lemmatizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Lemmatizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -209,12 +207,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> lemmatizer.from_disk("/path/to/lemmatizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Lemmatizer` | The modified `Lemmatizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Lemmatizer` object. ~~Lemmatizer~~ |
|
||||
|
||||
## Lemmatizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -227,11 +225,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Lemmatizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Lemmatizer` object. ~~bytes~~ |
|
||||
|
||||
## Lemmatizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -245,27 +243,20 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> lemmatizer.from_bytes(lemmatizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Lemmatizer` | The `Lemmatizer` object. |
|
||||
|
||||
## Lemmatizer.mode {#mode tag="property"}
|
||||
|
||||
The lemmatizer mode.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------- |
|
||||
| **RETURNS** | `str` | The lemmatizer mode. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Lemmatizer` object. ~~Lemmatizer~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | --------------------------------- | ------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `lookups` | [`Lookups`](/api/lookups) | The lookups object. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). ~~Vocab~~ |
|
||||
| `lookups` | The lookups object. ~~Lookups~~ |
|
||||
| `mode` | The lemmatizer mode. ~~str~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -13,10 +13,10 @@ lemmatization depends on the part-of-speech tag).
|
|||
|
||||
Create a `Lexeme` object.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | -------------------------- |
|
||||
| `vocab` | `Vocab` | The parent vocabulary. |
|
||||
| `orth` | int | The orth id of the lexeme. |
|
||||
| Name | Description |
|
||||
| ------- | ---------------------------------- |
|
||||
| `vocab` | The parent vocabulary. ~~Vocab~~ |
|
||||
| `orth` | The orth id of the lexeme. ~~int~~ |
|
||||
|
||||
## Lexeme.set_flag {#set_flag tag="method"}
|
||||
|
||||
|
@ -29,10 +29,10 @@ Change the value of a boolean flag.
|
|||
> nlp.vocab["spaCy"].set_flag(COOL_FLAG, True)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | ------------------------------------ |
|
||||
| `flag_id` | int | The attribute ID of the flag to set. |
|
||||
| `value` | bool | The new value of the flag. |
|
||||
| Name | Description |
|
||||
| --------- | -------------------------------------------- |
|
||||
| `flag_id` | The attribute ID of the flag to set. ~~int~~ |
|
||||
| `value` | The new value of the flag. ~~bool~~ |
|
||||
|
||||
## Lexeme.check_flag {#check_flag tag="method"}
|
||||
|
||||
|
@ -46,10 +46,10 @@ Check the value of a boolean flag.
|
|||
> assert nlp.vocab["spaCy"].check_flag(MY_LIBRARY) == True
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------- |
|
||||
| `flag_id` | int | The attribute ID of the flag to query. |
|
||||
| **RETURNS** | bool | The value of the flag. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------- |
|
||||
| `flag_id` | The attribute ID of the flag to query. ~~int~~ |
|
||||
| **RETURNS** | The value of the flag. ~~bool~~ |
|
||||
|
||||
## Lexeme.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
@ -65,10 +65,10 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
|||
> assert apple_orange == orange_apple
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------------------------------------- |
|
||||
| other | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. |
|
||||
| **RETURNS** | float | A scalar similarity score. Higher is more similar. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| other | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
|
||||
| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
|
||||
|
||||
## Lexeme.has_vector {#has_vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -81,9 +81,9 @@ A boolean value indicating whether a word vector is associated with the lexeme.
|
|||
> assert apple.has_vector
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------------------- |
|
||||
| **RETURNS** | bool | Whether the lexeme has a vector data attached. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------- |
|
||||
| **RETURNS** | Whether the lexeme has vector data attached. ~~bool~~ |
|
||||
|
||||
## Lexeme.vector {#vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -97,9 +97,9 @@ A real-valued meaning representation.
|
|||
> assert apple.vector.shape == (300,)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | ----------------------------------------------------- |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A 1D numpy array representing the lexeme's semantics. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------ |
|
||||
| **RETURNS** | A 1-dimensional array representing the lexeme's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Lexeme.vector_norm {#vector_norm tag="property" model="vectors"}
|
||||
|
||||
|
@ -115,50 +115,50 @@ The L2 norm of the lexeme's vector representation.
|
|||
> assert apple.vector_norm != pasta.vector_norm
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------- |
|
||||
| **RETURNS** | float | The L2 norm of the vector representation. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | `Vocab` | The lexeme's vocabulary. |
|
||||
| `text` | str | Verbatim text content. |
|
||||
| `orth` | int | ID of the verbatim text content. |
|
||||
| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `rank` | int | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors. |
|
||||
| `flags` | int | Container of the lexeme's binary flags. |
|
||||
| `norm` | int | The lexemes's norm, i.e. a normalized form of the lexeme text. |
|
||||
| `norm_` | str | The lexemes's norm, i.e. a normalized form of the lexeme text. |
|
||||
| `lower` | int | Lowercase form of the word. |
|
||||
| `lower_` | str | Lowercase form of the word. |
|
||||
| `shape` | int | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
|
||||
| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
|
||||
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||
| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
|
||||
| `suffix_` | str | Length-N substring from the start of the word. Defaults to `N=3`. |
|
||||
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
|
||||
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. |
|
||||
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
|
||||
| `is_lower` | bool | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. |
|
||||
| `is_upper` | bool | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. |
|
||||
| `is_title` | bool | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. |
|
||||
| `is_punct` | bool | Is the lexeme punctuation? |
|
||||
| `is_left_punct` | bool | Is the lexeme a left punctuation mark, e.g. `(`? |
|
||||
| `is_right_punct` | bool | Is the lexeme a right punctuation mark, e.g. `)`? |
|
||||
| `is_space` | bool | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. |
|
||||
| `is_bracket` | bool | Is the lexeme a bracket? |
|
||||
| `is_quote` | bool | Is the lexeme a quotation mark? |
|
||||
| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the lexeme a currency symbol? |
|
||||
| `like_url` | bool | Does the lexeme resemble a URL? |
|
||||
| `like_num` | bool | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. |
|
||||
| `like_email` | bool | Does the lexeme resemble an email address? |
|
||||
| `is_oov` | bool | Does the lexeme have a word vector? |
|
||||
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
|
||||
| `lang` | int | Language of the parent vocabulary. |
|
||||
| `lang_` | str | Language of the parent vocabulary. |
|
||||
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
|
||||
| `cluster` | int | Brown cluster ID. |
|
||||
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
|
||||
| Name | Description |
|
||||
| -------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The lexeme's vocabulary. ~~Vocab~~ |
|
||||
| `text` | Verbatim text content. ~~str~~ |
|
||||
| `orth` | ID of the verbatim text content. ~~int~~ |
|
||||
| `orth_` | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
|
||||
| `rank` | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
||||
| `flags` | Container of the lexeme's binary flags. ~~int~~ |
|
||||
| `norm` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~int~~ |
|
||||
| `norm_` | The lexeme's norm, i.e. a normalized form of the lexeme text. ~~str~~ |
|
||||
| `lower` | Lowercase form of the word. ~~int~~ |
|
||||
| `lower_` | Lowercase form of the word. ~~str~~ |
|
||||
| `shape` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
|
||||
| `shape_` | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
|
||||
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
|
||||
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
|
||||
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
|
||||
| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
|
||||
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
|
||||
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. ~~bool~~ |
|
||||
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
|
||||
| `is_lower` | Is the lexeme in lowercase? Equivalent to `lexeme.text.islower()`. ~~bool~~ |
|
||||
| `is_upper` | Is the lexeme in uppercase? Equivalent to `lexeme.text.isupper()`. ~~bool~~ |
|
||||
| `is_title` | Is the lexeme in titlecase? Equivalent to `lexeme.text.istitle()`. ~~bool~~ |
|
||||
| `is_punct` | Is the lexeme punctuation? ~~bool~~ |
|
||||
| `is_left_punct` | Is the lexeme a left punctuation mark, e.g. `(`? ~~bool~~ |
|
||||
| `is_right_punct` | Is the lexeme a right punctuation mark, e.g. `)`? ~~bool~~ |
|
||||
| `is_space` | Does the lexeme consist of whitespace characters? Equivalent to `lexeme.text.isspace()`. ~~bool~~ |
|
||||
| `is_bracket` | Is the lexeme a bracket? ~~bool~~ |
|
||||
| `is_quote` | Is the lexeme a quotation mark? ~~bool~~ |
|
||||
| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the lexeme a currency symbol? ~~bool~~ |
|
||||
| `like_url` | Does the lexeme resemble a URL? ~~bool~~ |
|
||||
| `like_num` | Does the lexeme represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
|
||||
| `like_email` | Does the lexeme resemble an email address? ~~bool~~ |
|
||||
| `is_oov` | Is the lexeme out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
|
||||
| `is_stop` | Is the lexeme part of a "stop list"? ~~bool~~ |
|
||||
| `lang` | Language of the parent vocabulary. ~~int~~ |
|
||||
| `lang_` | Language of the parent vocabulary. ~~str~~ |
|
||||
| `prob` | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). ~~float~~ |
|
||||
| `cluster` | Brown cluster ID. ~~int~~ |
|
||||
| `sentiment` | A scalar value indicating the positivity or negativity of the lexeme. ~~float~~ |
|
||||
|
|
|
@ -24,10 +24,6 @@ Create a `Lookups` object.
|
|||
> lookups = Lookups()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------- | ----------------------------- |
|
||||
| **RETURNS** | `Lookups` | The newly constructed object. |
|
||||
|
||||
## Lookups.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
Get the current number of tables in the lookups.
|
||||
|
@ -39,9 +35,9 @@ Get the current number of tables in the lookups.
|
|||
> assert len(lookups) == 0
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------ |
|
||||
| **RETURNS** | int | The number of tables in the lookups. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------- |
|
||||
| **RETURNS** | The number of tables in the lookups. ~~int~~ |
|
||||
|
||||
## Lookups.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -56,10 +52,10 @@ Check if the lookups contain a table of a given name. Delegates to
|
|||
> assert "some_table" in lookups
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------- |
|
||||
| `name` | str | Name of the table. |
|
||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------- |
|
||||
| `name` | Name of the table. ~~str~~ |
|
||||
| **RETURNS** | Whether a table of that name is in the lookups. ~~bool~~ |
|
||||
|
||||
## Lookups.tables {#tables tag="property"}
|
||||
|
||||
|
@ -73,9 +69,9 @@ Get the names of all tables in the lookups.
|
|||
> assert lookups.tables == ["some_table"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------- |
|
||||
| **RETURNS** | list | Names of the tables in the lookups. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------- |
|
||||
| **RETURNS** | Names of the tables in the lookups. ~~List[str]~~ |
|
||||
|
||||
## Lookups.add_table {#add_table tag="method"}
|
||||
|
||||
|
@ -89,11 +85,11 @@ exists.
|
|||
> lookups.add_table("some_table", {"foo": "bar"})
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ---------------------------------- |
|
||||
| `name` | str | Unique name of the table. |
|
||||
| `data` | dict | Optional data to add to the table. |
|
||||
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `name` | Unique name of the table. ~~str~~ |
|
||||
| `data` | Optional data to add to the table. ~~dict~~ |
|
||||
| **RETURNS** | The newly added table. ~~Table~~ |
|
||||
|
||||
## Lookups.get_table {#get_table tag="method"}
|
||||
|
||||
|
@ -108,10 +104,10 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
|
|||
> assert table["foo"] == "bar"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ------------------ |
|
||||
| `name` | str | Name of the table. |
|
||||
| **RETURNS** | [`Table`](/api/lookups#table) | The table. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------- |
|
||||
| `name` | Name of the table. ~~str~~ |
|
||||
| **RETURNS** | The table. ~~Table~~ |
|
||||
|
||||
## Lookups.remove_table {#remove_table tag="method"}
|
||||
|
||||
|
@ -126,10 +122,10 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
|
|||
> assert "some_table" not in lookups
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ---------------------------- |
|
||||
| `name` | str | Name of the table to remove. |
|
||||
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------ |
|
||||
| `name` | Name of the table to remove. ~~str~~ |
|
||||
| **RETURNS** | The removed table. ~~Table~~ |
|
||||
|
||||
## Lookups.has_table {#has_table tag="method"}
|
||||
|
||||
|
@ -144,10 +140,10 @@ Check if the lookups contain a table of a given name. Equivalent to
|
|||
> assert lookups.has_table("some_table")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------- |
|
||||
| `name` | str | Name of the table. |
|
||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------- |
|
||||
| `name` | Name of the table. ~~str~~ |
|
||||
| **RETURNS** | Whether a table of that name is in the lookups. ~~bool~~ |
|
||||
|
||||
## Lookups.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -159,9 +155,9 @@ Serialize the lookups to a bytestring.
|
|||
> lookup_bytes = lookups.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------- |
|
||||
| **RETURNS** | bytes | The serialized lookups. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------- |
|
||||
| **RETURNS** | The serialized lookups. ~~bytes~~ |
|
||||
|
||||
## Lookups.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -175,10 +171,10 @@ Load the lookups from a bytestring.
|
|||
> lookups.from_bytes(lookup_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | --------- | ---------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `Lookups` | The loaded lookups. |
|
||||
| Name | Description |
|
||||
| ------------ | -------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| **RETURNS** | The loaded lookups. ~~Lookups~~ |
|
||||
|
||||
## Lookups.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -191,9 +187,9 @@ which will be created if it doesn't exist.
|
|||
> lookups.to_disk("/path/to/lookups")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Description |
|
||||
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
|
||||
## Lookups.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -208,10 +204,10 @@ the file doesn't exist.
|
|||
> lookups.from_disk("/path/to/lookups")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Lookups` | The loaded lookups. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The loaded lookups. ~~Lookups~~ |
|
||||
|
||||
## Table {#table tag="class, ordereddict"}
|
||||
|
||||
|
@ -236,9 +232,9 @@ Initialize a new table.
|
|||
> assert table["foo"] == "bar"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---- | ---------------------------------- |
|
||||
| `name` | str | Optional table name for reference. |
|
||||
| Name | Description |
|
||||
| ------ | ------------------------------------------ |
|
||||
| `name` | Optional table name for reference. ~~str~~ |
|
||||
|
||||
### Table.from_dict {#table.from_dict tag="classmethod"}
|
||||
|
||||
|
@ -252,11 +248,11 @@ Initialize a new table from a dict.
|
|||
> table = Table.from_dict(data, name="some_table")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ---------------------------------- |
|
||||
| `data` | dict | The dictionary. |
|
||||
| `name` | str | Optional table name for reference. |
|
||||
| **RETURNS** | `Table` | The newly constructed object. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------ |
|
||||
| `data` | The dictionary. ~~dict~~ |
|
||||
| `name` | Optional table name for reference. ~~str~~ |
|
||||
| **RETURNS** | The newly constructed object. ~~Table~~ |
|
||||
|
||||
### Table.set {#table.set tag="method"}
|
||||
|
||||
|
@ -272,10 +268,10 @@ Set a new key / value pair. String keys will be hashed. Same as
|
|||
> assert table["foo"] == "bar"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | --------- | ----------- |
|
||||
| `key` | str / int | The key. |
|
||||
| `value` | - | The value. |
|
||||
| Name | Description |
|
||||
| ------- | ---------------------------- |
|
||||
| `key` | The key. ~~Union[str, int]~~ |
|
||||
| `value` | The value. |
|
||||
|
||||
### Table.to_bytes {#table.to_bytes tag="method"}
|
||||
|
||||
|
@ -287,9 +283,9 @@ Serialize the table to a bytestring.
|
|||
> table_bytes = table.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------- |
|
||||
| **RETURNS** | bytes | The serialized table. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------- |
|
||||
| **RETURNS** | The serialized table. ~~bytes~~ |
|
||||
|
||||
### Table.from_bytes {#table.from_bytes tag="method"}
|
||||
|
||||
|
@ -303,15 +299,15 @@ Load a table from a bytestring.
|
|||
> table.from_bytes(table_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------- | ----------------- |
|
||||
| `bytes_data` | bytes | The data to load. |
|
||||
| **RETURNS** | `Table` | The loaded table. |
|
||||
| Name | Description |
|
||||
| ------------ | --------------------------- |
|
||||
| `bytes_data` | The data to load. ~~bytes~~ |
|
||||
| **RETURNS** | The loaded table. ~~Table~~ |
|
||||
|
||||
### Attributes {#table-attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------- | ----------------------------------------------------- |
|
||||
| `name` | str | Table name. |
|
||||
| `default_size` | int | Default size of bloom filters if no data is provided. |
|
||||
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `name` | Table name. ~~str~~ |
|
||||
| `default_size` | Default size of bloom filters if no data is provided. ~~int~~ |
|
||||
| `bloom` | The bloom filters. ~~preshed.BloomFilter~~ |
|
||||
|
|
|
@ -30,20 +30,20 @@ pattern keys correspond to a number of
|
|||
[`Token` attributes](/api/token#attributes). The supported attributes for
|
||||
rule-based matching are:
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `ORTH` | str | The exact verbatim text of a token. |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
|
||||
| `LOWER` | str | The lowercase form of the token text. |
|
||||
| `LENGTH` | int | The length of the token text. |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
||||
| `ENT_TYPE` | str | The token's entity label. |
|
||||
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| `OP` | str | Operator or quantifier to determine how often to match a token pattern. |
|
||||
| Attribute | Description |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
|
||||
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|
||||
| `LENGTH` | The length of the token text. ~~int~~ |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
|
||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
||||
| `OP` | Operator or quantifier to determine how often to match a token pattern. ~~str~~ |
|
||||
|
||||
Operators and quantifiers define **how often** a token pattern should be
|
||||
matched:
|
||||
|
@ -75,11 +75,11 @@ it compares to another value.
|
|||
> ]
|
||||
> ```
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
|
||||
| `IN` | any | Attribute value is member of a list. |
|
||||
| `NOT_IN` | any | Attribute value is _not_ member of a list. |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
|
||||
| Attribute | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||
|
||||
## Matcher.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
@ -95,10 +95,10 @@ string where an integer is expected) or unexpected property names.
|
|||
> matcher = Matcher(nlp.vocab)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
|
||||
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate all patterns added to this matcher. |
|
||||
| Name | Description |
|
||||
| --------------------------------------- | ----------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
||||
| `validate` <Tag variant="new">2.1</Tag> | Validate all patterns added to this matcher. ~~bool~~ |
|
||||
|
||||
## Matcher.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -116,10 +116,10 @@ Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
|
|||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
|
||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doclike` | The `Doc` or `Span` to match over. ~~Union[Doc, Span]~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
||||
|
||||
## Matcher.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -134,13 +134,13 @@ Match a stream of documents, yielding them in turn.
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------------- | -------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | iterable | A stream of documents or spans. |
|
||||
| `batch_size` | int | The number of documents to accumulate into a working set. |
|
||||
| `return_matches` <Tag variant="new">2.1</Tag> | bool | Yield the match lists along with the docs, making results `(doc, matches)` tuples. |
|
||||
| `as_tuples` | bool | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. |
|
||||
| **YIELDS** | `Doc` | Documents, in order. |
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | A stream of documents or spans. ~~Iterable[Union[Doc, Span]]~~ |
|
||||
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
|
||||
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
|
||||
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
|
||||
| **YIELDS** | Documents, in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
|
||||
|
||||
## Matcher.\_\_len\_\_ {#len tag="method" new="2"}
|
||||
|
||||
|
@ -157,9 +157,9 @@ patterns.
|
|||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------- |
|
||||
| **RETURNS** | int | The number of rules. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The number of rules. ~~int~~ |
|
||||
|
||||
## Matcher.\_\_contains\_\_ {#contains tag="method" new="2"}
|
||||
|
||||
|
@ -174,10 +174,10 @@ Check whether the matcher contains rules for a match ID.
|
|||
> assert "Rule" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------- |
|
||||
| `key` | str | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------- |
|
||||
| `key` | The match ID. ~~str~~ |
|
||||
| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
|
||||
|
||||
## Matcher.add {#add tag="method" new="2"}
|
||||
|
||||
|
@ -217,13 +217,13 @@ patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `patterns` | `List[List[dict]]` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
| _keyword-only_ | | |
|
||||
| `on_match` | callable / `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `greedy` <Tag variant="new">3</Tag> | str | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. |
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | An ID for the thing you're matching. ~~str~~ |
|
||||
| `patterns` | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. ~~List[List[Dict[str, Any]]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ |
|
||||
| `greedy` <Tag variant="new">3</Tag> | Optional filter for greedy matches. Can either be `"FIRST"` or `"LONGEST"`. ~~Optional[str]~~ |
|
||||
|
||||
## Matcher.remove {#remove tag="method" new="2"}
|
||||
|
||||
|
@ -239,9 +239,9 @@ exist.
|
|||
> assert "Rule" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ---- | ------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| Name | Description |
|
||||
| ----- | --------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
|
||||
## Matcher.get {#get tag="method" new="2"}
|
||||
|
||||
|
@ -255,7 +255,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
|
|||
> on_match, patterns = matcher.get("Rule")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
| **RETURNS** | The rule, as an `(on_match, patterns)` tuple. ~~Tuple[Optional[Callable], List[List[dict]]]~~ |
|
||||
|
|
|
@ -1,142 +0,0 @@
|
|||
---
|
||||
title: MorphAnalysis
|
||||
tag: class
|
||||
source: spacy/tokens/morphanalysis.pyx
|
||||
---
|
||||
|
||||
Stores a single morphological analysis.
|
||||
|
||||
## MorphAnalysis.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize a MorphAnalysis object from a UD FEATS string or a dictionary of
|
||||
morphological features.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.tokens import MorphAnalysis
|
||||
>
|
||||
> feats = "Feat1=Val1|Feat2=Val2"
|
||||
> m = MorphAnalysis(nlp.vocab, feats)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------------------ | --------------------------- |
|
||||
| `vocab` | `Vocab` | The vocab. |
|
||||
| `features` | `Union[Dict, str]` | The morphological features. |
|
||||
|
||||
## MorphAnalysis.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
Whether a feature/value pair is in the analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert "Feat1=Val1" in morph
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------- |
|
||||
| **RETURNS** | `bool` | Whether the feature/value pair is in the analysis. |
|
||||
|
||||
## MorphAnalysis.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
Iterate over the feature/value pairs in the analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val3|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert list(morph) == ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ----- | ------------------------------------- |
|
||||
| **YIELDS** | `str` | A feature/value pair in the analysis. |
|
||||
|
||||
## MorphAnalysis.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
Returns the number of features in the analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert len(morph) == 3
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------- |
|
||||
| **RETURNS** | `int` | The number of features in the analysis. |
|
||||
|
||||
## MorphAnalysis.\_\_str\_\_ {#str tag="method"}
|
||||
|
||||
Returns the morphological analysis in the UD FEATS string format.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert str(morph) == feats
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------- |
|
||||
| **RETURNS** | `str` | The analysis in UD FEATS format. |
|
||||
|
||||
## MorphAnalysis.get {#get tag="method"}
|
||||
|
||||
Retrieve values for a feature by field.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert morph.get("Feat1") == ["Val1", "Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ---------------------------------- |
|
||||
| `field` | `str` | The field to retrieve. |
|
||||
| **RETURNS** | `list` | A list of the individual features. |
|
||||
|
||||
## MorphAnalysis.to_dict {#to_dict tag="method"}
|
||||
|
||||
Produce a dict representation of the analysis, in the same format as the tag
|
||||
map.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"}
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ---------------------------------------- |
|
||||
| **RETURNS** | `dict` | The dict representation of the analysis. |
|
||||
|
||||
## MorphAnalysis.from_id {#from_id tag="classmethod"}
|
||||
|
||||
Create a morphological analysis from a given hash ID.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1|Feat2=Val2"
|
||||
> hash = nlp.vocab.strings[feats]
|
||||
> morph = MorphAnalysis.from_id(nlp.vocab, hash)
|
||||
> assert str(morph) == feats
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | -------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocab. |
|
||||
| `key` | `int` | The hash of the features string. |
|
|
@ -32,9 +32,9 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("morphologizer", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------- |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
|
||||
| Setting | Description |
|
||||
| ------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | The model to use. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx
|
||||
|
@ -42,7 +42,9 @@ https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/morphologizer.pyx
|
|||
|
||||
## Morphologizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Initialize the morphologizer.
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -59,18 +61,14 @@ Initialize the morphologizer.
|
|||
> morphologizer = Morphologizer(nlp.vocab, model)
|
||||
> ```
|
||||
|
||||
Create a new pipeline instance. In your application, you would normally use a
|
||||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `labels_morph` | dict | Mapping of morph + POS tags to morph labels. |
|
||||
| `labels_pos` | dict | Mapping of morph + POS tags to POS tags. |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `labels_morph` | Mapping of morph + POS tags to morph labels. ~~Dict[str, str]~~ |
|
||||
| `labels_pos` | Mapping of morph + POS tags to POS tags. ~~Dict[str, str]~~ |
|
||||
|
||||
## Morphologizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -90,10 +88,10 @@ delegate to the [`predict`](/api/morphologizer#predict) and
|
|||
> processed = morphologizer(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## Morphologizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -112,12 +110,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/morphologizer#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Morphologizer.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -138,13 +136,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = morphologizer.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Morphologizer.predict {#predict tag="method"}
|
||||
|
||||
|
@ -158,10 +156,10 @@ modifying them.
|
|||
> scores = morphologizer.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## Morphologizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -175,10 +173,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> morphologizer.set_annotations([doc1, doc2], scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ------------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `Morphologizer.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `Morphologizer.predict`. |
|
||||
|
||||
## Morphologizer.update {#update tag="method"}
|
||||
|
||||
|
@ -195,15 +193,15 @@ Delegates to [`predict`](/api/morphologizer#predict) and
|
|||
> losses = morphologizer.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Morphologizer.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -218,11 +216,11 @@ predicted scores.
|
|||
> loss, d_loss = morphologizer.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | - | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## Morphologizer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -235,9 +233,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = morphologizer.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Morphologizer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -252,9 +250,9 @@ context, the original parameters are restored.
|
|||
> morphologizer.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## Morphologizer.add_label {#add_label tag="method"}
|
||||
|
||||
|
@ -268,10 +266,10 @@ both `pos` and `morph`, the label should include the UPOS as the feature `POS`.
|
|||
> morphologizer.add_label("Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## Morphologizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -284,11 +282,11 @@ Serialize the pipe to disk.
|
|||
> morphologizer.to_disk("/path/to/morphologizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Morphologizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -301,12 +299,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> morphologizer.from_disk("/path/to/morphologizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Morphologizer` | The modified `Morphologizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Morphologizer` object. ~~Morphologizer~~ |
|
||||
|
||||
## Morphologizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -319,11 +317,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Morphologizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Morphologizer` object. ~~bytes~~ |
|
||||
|
||||
## Morphologizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -337,19 +335,19 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> morphologizer.from_bytes(morphologizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Morphologizer` | The `Morphologizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Morphologizer` object. ~~Morphologizer~~ |
|
||||
|
||||
## Morphologizer.labels {#labels tag="property"}
|
||||
|
||||
The labels currently added to the component in Universal Dependencies
|
||||
[FEATS format](https://universaldependencies.org/format.html#morphological-annotation).
|
||||
Note that even for a blank component, this will always include the internal
|
||||
empty label `_`. If POS features are used, the labels will include the
|
||||
The labels currently added to the component in the Universal Dependencies
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
format. Note that even for a blank component, this will always include the
|
||||
internal empty label `_`. If POS features are used, the labels will include the
|
||||
coarse-grained POS as the feature `POS`.
|
||||
|
||||
> #### Example
|
||||
|
@ -359,9 +357,9 @@ coarse-grained POS as the feature `POS`.
|
|||
> assert "Mood=Ind|POS=VERB|Tense=Past|VerbForm=Fin" in morphologizer.labels
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -7,7 +7,8 @@ source: spacy/morphology.pyx
|
|||
Store the possible morphological analyses for a language, and index them by
|
||||
hash. To save space on each token, tokens only know the hash of their
|
||||
morphological analysis, so queries of morphological attributes are delegated to
|
||||
this class.
|
||||
this class. See [`MorphAnalysis`](/api/morphology#morphanalysis) for the
|
||||
container storing a single morphological analysis.
|
||||
|
||||
## Morphology.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
@ -21,15 +22,17 @@ Create a Morphology object.
|
|||
> morphology = Morphology(strings)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------- | ----------------- |
|
||||
| `strings` | `StringStore` | The string store. |
|
||||
| Name | Description |
|
||||
| --------- | --------------------------------- |
|
||||
| `strings` | The string store. ~~StringStore~~ |
|
||||
|
||||
## Morphology.add {#add tag="method"}
|
||||
|
||||
Insert a morphological analysis in the morphology table, if not already present.
|
||||
The morphological analysis may be provided in the UD FEATS format as a string or
|
||||
in the tag map dictionary format. Returns the hash of the new analysis.
|
||||
The morphological analysis may be provided in the Universal Dependencies
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
format as a string or in the tag map dictionary format. Returns the hash of the
|
||||
new analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -39,9 +42,9 @@ in the tag map dictionary format. Returns the hash of the new analysis.
|
|||
> assert hash == nlp.vocab.strings[feats]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------------------ | --------------------------- |
|
||||
| `features` | `Union[Dict, str]` | The morphological features. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------ |
|
||||
| `features` | The morphological features. ~~Union[Dict, str]~~ |
|
||||
|
||||
## Morphology.get {#get tag="method"}
|
||||
|
||||
|
@ -53,16 +56,20 @@ in the tag map dictionary format. Returns the hash of the new analysis.
|
|||
> assert nlp.vocab.morphology.get(hash) == feats
|
||||
> ```
|
||||
|
||||
Get the FEATS string for the hash of the morphological analysis.
|
||||
Get the
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
string for the hash of the morphological analysis.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ---- | --------------------------------------- |
|
||||
| `morph` | int | The hash of the morphological analysis. |
|
||||
| Name | Description |
|
||||
| ------- | ----------------------------------------------- |
|
||||
| `morph` | The hash of the morphological analysis. ~~int~~ |
|
||||
|
||||
## Morphology.feats_to_dict {#feats_to_dict tag="staticmethod"}
|
||||
|
||||
Convert a string FEATS representation to a dictionary of features and values in
|
||||
the same format as the tag map.
|
||||
Convert a string
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
representation to a dictionary of features and values in the same format as the
|
||||
tag map.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -72,14 +79,16 @@ the same format as the tag map.
|
|||
> assert d == {"Feat1": "Val1", "Feat2": "Val2"}
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------------------------------ |
|
||||
| `feats` | str | The morphological features in Universal Dependencies FEATS format. |
|
||||
| **RETURNS** | dict | The morphological features as a dictionary. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `feats` | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
| **RETURNS** | The morphological features as a dictionary. ~~Dict[str, str]~~ |
|
||||
|
||||
## Morphology.dict_to_feats {#dict_to_feats tag="staticmethod"}
|
||||
|
||||
Convert a dictionary of features and values to a string FEATS representation.
|
||||
Convert a dictionary of features and values to a string
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
representation.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -89,15 +98,157 @@ Convert a dictionary of features and values to a string FEATS representation.
|
|||
> assert f == "Feat1=Val1|Feat2=Val2"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----------------- | --------------------------------------------------------------------- |
|
||||
| `feats_dict` | `Dict[str, Dict]` | The morphological features as a dictionary. |
|
||||
| **RETURNS** | str | The morphological features as in Universal Dependencies FEATS format. |
|
||||
| Name | Description |
|
||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
|
||||
| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ----- | -------------------------------------------- |
|
||||
| `FEATURE_SEP` | `str` | The FEATS feature separator. Default is `|`. |
|
||||
| `FIELD_SEP` | `str` | The FEATS field separator. Default is `=`. |
|
||||
| `VALUE_SEP` | `str` | The FEATS value separator. Default is `,`. |
|
||||
| Name | Description |
|
||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `FEATURE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) feature separator. Default is `\|`. ~~str~~ |
|
||||
| `FIELD_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) field separator. Default is `=`. ~~str~~ |
|
||||
| `VALUE_SEP` | The [FEATS](https://universaldependencies.org/format.html#morphological-annotation) value separator. Default is `,`. ~~str~~ |
|
||||
|
||||
## MorphAnalysis {#morphanalysis tag="class" source="spacy/tokens/morphanalysis.pyx"}
|
||||
|
||||
Stores a single morphological analysis.
|
||||
|
||||
### MorphAnalysis.\_\_init\_\_ {#morphanalysis-init tag="method"}
|
||||
|
||||
Initialize a MorphAnalysis object from a Universal Dependencies
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
string or a dictionary of morphological features.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.tokens import MorphAnalysis
|
||||
>
|
||||
> feats = "Feat1=Val1|Feat2=Val2"
|
||||
> m = MorphAnalysis(nlp.vocab, feats)
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | ---------------------------------------------------------- |
|
||||
| `vocab` | The vocab. ~~Vocab~~ |
|
||||
| `features` | The morphological features. ~~Union[Dict[str, str], str]~~ |
|
||||
|
||||
### MorphAnalysis.\_\_contains\_\_ {#morphanalysis-contains tag="method"}
|
||||
|
||||
Whether a feature/value pair is in the analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert "Feat1=Val1" in morph
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | Whether the feature/value pair is contained in the analysis. ~~bool~~ |
|
||||
|
||||
### MorphAnalysis.\_\_iter\_\_ {#morphanalysis-iter tag="method"}
|
||||
|
||||
Iterate over the feature/value pairs in the analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val3|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert list(morph) == ["Feat1=Val1", "Feat1=Val3", "Feat2=Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------------------- |
|
||||
| **YIELDS** | A feature/value pair in the analysis. ~~str~~ |
|
||||
|
||||
### MorphAnalysis.\_\_len\_\_ {#morphanalysis-len tag="method"}
|
||||
|
||||
Returns the number of features in the analysis.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert len(morph) == 3
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------- |
|
||||
| **RETURNS** | The number of features in the analysis. ~~int~~ |
|
||||
|
||||
### MorphAnalysis.\_\_str\_\_ {#morphanalysis-str tag="method"}
|
||||
|
||||
Returns the morphological analysis in the Universal Dependencies
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
string format.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert str(morph) == feats
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| **RETURNS** | The analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
|
||||
### MorphAnalysis.get {#morphanalysis-get tag="method"}
|
||||
|
||||
Retrieve values for a feature by field.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert morph.get("Feat1") == ["Val1", "Val2"]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| `field` | The field to retrieve. ~~str~~ |
|
||||
| **RETURNS** | A list of the individual features. ~~List[str]~~ |
|
||||
|
||||
### MorphAnalysis.to_dict {#morphanalysis-to_dict tag="method"}
|
||||
|
||||
Produce a dict representation of the analysis, in the same format as the tag
|
||||
map.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1,Val2|Feat2=Val2"
|
||||
> morph = MorphAnalysis(nlp.vocab, feats)
|
||||
> assert morph.to_dict() == {"Feat1": "Val1,Val2", "Feat2": "Val2"}
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| **RETURNS** | The dict representation of the analysis. ~~Dict[str, str]~~ |
|
||||
|
||||
### MorphAnalysis.from_id {#morphanalysis-from_id tag="classmethod"}
|
||||
|
||||
Create a morphological analysis from a given hash ID.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> feats = "Feat1=Val1|Feat2=Val2"
|
||||
> hash = nlp.vocab.strings[feats]
|
||||
> morph = MorphAnalysis.from_id(nlp.vocab, hash)
|
||||
> assert str(morph) == feats
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ------- | ---------------------------------------- |
|
||||
| `vocab` | The vocab. ~~Vocab~~ |
|
||||
| `key` | The hash of the features string. ~~int~~ |
|
||||
|
|
|
@ -36,11 +36,11 @@ be shown.
|
|||
> matcher = PhraseMatcher(nlp.vocab)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | --------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. |
|
||||
| `attr` <Tag variant="new">2.1</Tag> | int / str | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. |
|
||||
| `validate` <Tag variant="new">2.1</Tag> | bool | Validate patterns added to the matcher. |
|
||||
| Name | Description |
|
||||
| --------------------------------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The vocabulary object, which must be shared with the documents the matcher will operate on. ~~Vocab~~ |
|
||||
| `attr` <Tag variant="new">2.1</Tag> | The token attribute to match on. Defaults to `ORTH`, i.e. the verbatim token text. ~~Union[int, str]~~ |
|
||||
| `validate` <Tag variant="new">2.1</Tag> | Validate patterns added to the matcher. ~~bool~~ |
|
||||
|
||||
## PhraseMatcher.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -57,10 +57,10 @@ Find all token sequences matching the supplied patterns on the `Doc`.
|
|||
> matches = matcher(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The document to match over. |
|
||||
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------- |
|
||||
| `doc` | The document to match over. ~~Doc~~ |
|
||||
| **RETURNS** | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. The `match_id` is the ID of the added match pattern. ~~List[Tuple[int, int, int]]~~ |
|
||||
|
||||
<Infobox title="Note on retrieving the string representation of the match_id" variant="warning">
|
||||
|
||||
|
@ -87,11 +87,13 @@ Match a stream of documents, yielding them in turn.
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | -------- | --------------------------------------------------------- |
|
||||
| `docs` | iterable | A stream of documents. |
|
||||
| `batch_size` | int | The number of documents to accumulate into a working set. |
|
||||
| **YIELDS** | `Doc` | Documents, in order. |
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| `batch_size` | The number of documents to accumulate into a working set. ~~int~~ |
|
||||
| `return_matches` <Tag variant="new">2.1</Tag> | Yield the match lists along with the docs, making results `(doc, matches)` tuples. ~~bool~~ |
|
||||
| `as_tuples` | Interpret the input stream as `(doc, context)` tuples, and yield `(result, context)` tuples out. If both `return_matches` and `as_tuples` are `True`, the output will be a sequence of `((doc, matches), context)` tuples. ~~bool~~ |
|
||||
| **YIELDS** | Documents and optional matches or context in order. ~~Union[Doc, Tuple[Doc, Any], Tuple[Tuple[Doc, Any], Any]]~~ |
|
||||
|
||||
## PhraseMatcher.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -108,9 +110,9 @@ patterns.
|
|||
> assert len(matcher) == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------- |
|
||||
| **RETURNS** | int | The number of rules. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The number of rules. ~~int~~ |
|
||||
|
||||
## PhraseMatcher.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -125,10 +127,10 @@ Check whether the matcher contains rules for a match ID.
|
|||
> assert "OBAMA" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------- |
|
||||
| `key` | str | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------- |
|
||||
| `key` | The match ID. ~~str~~ |
|
||||
| **RETURNS** | Whether the matcher contains rules for this match ID. ~~bool~~ |
|
||||
|
||||
## PhraseMatcher.add {#add tag="method"}
|
||||
|
||||
|
@ -165,12 +167,12 @@ patterns = [nlp("health care reform"), nlp("healthcare reform")]
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `docs` | list | `Doc` objects of the phrases to match. |
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | An ID for the thing you're matching. ~~str~~ |
|
||||
| `docs` | `Doc` objects of the phrases to match. ~~List[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `on_match` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. ~~Optional[Callable[[Matcher, Doc, int, List[tuple]], Any]]~~ |
|
||||
|
||||
## PhraseMatcher.remove {#remove tag="method" new="2.2"}
|
||||
|
||||
|
@ -187,6 +189,6 @@ does not exist.
|
|||
> assert "OBAMA" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ---- | ------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| Name | Description |
|
||||
| ----- | --------------------------------- |
|
||||
| `key` | The ID of the match rule. ~~str~~ |
|
||||
|
|
|
@ -45,12 +45,12 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `**cfg` | | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
|
||||
| Name | Description |
|
||||
| ------- | ------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], Any]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| `**cfg` | Additional config parameters and settings. Will be available as the dictionary `Pipe.cfg` and is serialized with the component. |
|
||||
|
||||
## Pipe.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -70,10 +70,10 @@ and all pipeline components are applied to the `Doc` in order. Both
|
|||
> processed = pipe(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## Pipe.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -91,12 +91,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/pipe#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------- |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Pipe.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -116,13 +116,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = pipe.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/pipe#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Pipe.predict {#predict tag="method"}
|
||||
|
||||
|
@ -142,10 +142,10 @@ This method needs to be overwritten with your own custom `predict` method.
|
|||
> scores = pipe.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## Pipe.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -166,10 +166,10 @@ method.
|
|||
> pipe.set_annotations(docs, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ---------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `Pipe.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------ |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `Pipe.predict`. |
|
||||
|
||||
## Pipe.update {#update tag="method"}
|
||||
|
||||
|
@ -184,15 +184,15 @@ predictions and gold-standard annotations, and update the component's model.
|
|||
> losses = pipe.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/pipe#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Pipe.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
|
@ -208,14 +208,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
|||
> losses = pipe.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Pipe.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -230,11 +230,11 @@ predicted scores.
|
|||
> loss, d_loss = ner.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## Pipe.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -246,10 +246,10 @@ Score a batch of examples.
|
|||
> scores = pipe.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | --------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, e.g. produced by the [`Scorer`](/api/scorer). |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, e.g. produced by the [`Scorer`](/api/scorer). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Pipe.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -263,26 +263,9 @@ Create an optimizer for the pipeline component. Defaults to
|
|||
> optimizer = pipe.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
|
||||
## Pipe.add_label {#add_label tag="method"}
|
||||
|
||||
Add a new label to the pipe. It's possible to extend pretrained models with new
|
||||
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> pipe = nlp.add_pipe("your_custom_pipe")
|
||||
> pipe.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Pipe.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -297,9 +280,26 @@ context, the original parameters are restored.
|
|||
> pipe.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## Pipe.add_label {#add_label tag="method"}
|
||||
|
||||
Add a new label to the pipe. It's possible to extend pretrained models with new
|
||||
labels, but care should be taken to avoid the "catastrophic forgetting" problem.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> pipe = nlp.add_pipe("your_custom_pipe")
|
||||
> pipe.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## Pipe.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -312,11 +312,11 @@ Serialize the pipe to disk.
|
|||
> pipe.to_disk("/path/to/pipe")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Pipe.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -329,12 +329,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> pipe.from_disk("/path/to/pipe")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Pipe` | The modified pipe. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified pipe. ~~Pipe~~ |
|
||||
|
||||
## Pipe.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -347,11 +347,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the pipe. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the pipe. ~~bytes~~ |
|
||||
|
||||
## Pipe.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -365,21 +365,21 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> pipe.from_bytes(pipe_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Pipe` | The pipe. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The pipe. ~~Pipe~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | [`Vocab`](/api/vocab) | The shared vocabulary that's passed in on initialization. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model powering the component. |
|
||||
| `name` | str | The name of the component instance in the pipeline. Can be used in the losses. |
|
||||
| `cfg` | dict | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. |
|
||||
| Name | Description |
|
||||
| ------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | The shared vocabulary that's passed in on initialization. ~~Vocab~~ |
|
||||
| `model` | The model powering the component. ~~Model[List[Doc], Any]~~ |
|
||||
| `name` | The name of the component instance in the pipeline. Can be used in the losses. ~~str~~ |
|
||||
| `cfg` | Keyword arguments passed to [`Pipe.__init__`](/api/pipe#init). Will be serialized with the component. ~~Dict[str, Any]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -33,10 +33,10 @@ all other components.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with merged noun chunks. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
||||
| **RETURNS** | The modified `Doc` with merged noun chunks. ~~Doc~~ |
|
||||
|
||||
## merge_entities {#merge_entities tag="function"}
|
||||
|
||||
|
@ -63,10 +63,10 @@ components to the end of the pipeline and after all other components.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with merged entities. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
||||
| **RETURNS** | The modified `Doc` with merged entities. ~~Doc~~ |
|
||||
|
||||
## merge_subtokens {#merge_subtokens tag="function" new="2.1"}
|
||||
|
||||
|
@ -102,8 +102,8 @@ end of the pipeline and after all other components.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
||||
| `label` | The subtoken dependency label. Defaults to `"subtok"`. ~~str~~ |
|
||||
| **RETURNS** | The modified `Doc` with merged subtokens. ~~Doc~~ |
|
||||
|
|
|
@ -27,9 +27,9 @@ Create a new `Scorer`.
|
|||
> scorer = Scorer(nlp)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | Language | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. |
|
||||
| Name | Description |
|
||||
| ----- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | The pipeline to use for scoring, where each pipeline component may provide a scoring method. If none is provided, then a default pipeline for the multi-language code `xx` is constructed containing: `senter`, `tagger`, `morphologizer`, `parser`, `ner`, `textcat`. ~~Language~~ |
|
||||
|
||||
## Scorer.score {#score tag="method"}
|
||||
|
||||
|
@ -55,10 +55,10 @@ attribute being scored:
|
|||
> scores = scorer.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | --------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| **RETURNS** | `Dict` | A dictionary of scores. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | A dictionary of scores. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Scorer.score_tokenization {#score_tokenization tag="staticmethod" new="3"}
|
||||
|
||||
|
@ -74,10 +74,10 @@ Scores the tokenization:
|
|||
> scores = Scorer.score_tokenization(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | --------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| **RETURNS** | `Dict` | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | A dictionary containing the scores `token_acc`, `token_p`, `token_r`, `token_f`. ~~Dict[str, float]~~ |
|
||||
|
||||
## Scorer.score_token_attr {#score_token_attr tag="staticmethod" new="3"}
|
||||
|
||||
|
@ -90,18 +90,19 @@ Scores a single token attribute.
|
|||
> print(scores["pos_acc"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| `attr` | `str` | The attribute to score. |
|
||||
| _keyword-only_ | | |
|
||||
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
|
||||
| **RETURNS** | `Dict[str, float]` | A dictionary containing the score `{attr}_acc`. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute to score. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
|
||||
| **RETURNS** | A dictionary containing the score `{attr}_acc`. ~~Dict[str, float]~~ |
|
||||
|
||||
## Scorer.score_token_attr_per_feat {#score_token_attr_per_feat tag="staticmethod" new="3"}
|
||||
|
||||
Scores a single token attribute per feature for a token attribute in
|
||||
[UFEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
Scores a single token attribute per feature for a token attribute in the
|
||||
Universal Dependencies
|
||||
[FEATS](https://universaldependencies.org/format.html#morphological-annotation)
|
||||
format.
|
||||
|
||||
> #### Example
|
||||
|
@ -111,13 +112,13 @@ format.
|
|||
> print(scores["morph_per_feat"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| `attr` | `str` | The attribute to score. |
|
||||
| _keyword-only_ | | |
|
||||
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
|
||||
| **RETURNS** | `Dict` | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute to score. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
|
||||
| **RETURNS** | A dictionary containing the per-feature PRF scores under the key `{attr}_per_feat`. ~~Dict[str, Dict[str, float]]~~ |
|
||||
|
||||
## Scorer.score_spans {#score_spans tag="staticmethod" new="3"}
|
||||
|
||||
|
@ -130,13 +131,13 @@ Returns PRF scores for labeled or unlabeled spans.
|
|||
> print(scores["ents_f"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| `attr` | `str` | The attribute to score. |
|
||||
| _keyword-only_ | | |
|
||||
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. |
|
||||
| **RETURNS** | `Dict` | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute to score. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the `Span` objects for an individual `Doc`. ~~Callable[[Doc, str], Iterable[Span]]~~ |
|
||||
| **RETURNS** | A dictionary containing the PRF scores under the keys `{attr}_p`, `{attr}_r`, `{attr}_f` and the per-type PRF scores under `{attr}_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Scorer.score_deps {#score_deps tag="staticmethod" new="3"}
|
||||
|
||||
|
@ -159,16 +160,16 @@ Calculate the UAS, LAS, and LAS per type scores for dependency parses.
|
|||
> print(scores["dep_uas"], scores["dep_las"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| `attr` | `str` | The attribute containing the dependency label. |
|
||||
| _keyword-only_ | | |
|
||||
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. |
|
||||
| `head_attr` | `str` | The attribute containing the head token. |
|
||||
| `head_getter` | `callable` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. |
|
||||
| `ignore_labels` | `Tuple` | Labels to ignore while scoring (e.g., `punct`). |
|
||||
| **RETURNS** | `Dict` | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. |
|
||||
| Name | Description |
|
||||
| --------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute containing the dependency label. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(token, attr)` should return the value of the attribute for an individual `Token`. ~~Callable[[Token, str], Any]~~ |
|
||||
| `head_attr` | The attribute containing the head token. ~~str~~ |
|
||||
| `head_getter` | Defaults to `getattr`. If provided, `head_getter(token, attr)` should return the head for an individual `Token`. ~~Callable[[Token, str], Token]~~ |
|
||||
| `ignore_labels` | Labels to ignore while scoring (e.g. `"punct"`). ~~Iterable[str]~~ |
|
||||
| **RETURNS** | A dictionary containing the scores: `{attr}_uas`, `{attr}_las`, and `{attr}_las_per_type`. ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Scorer.score_cats {#score_cats tag="staticmethod" new="3"}
|
||||
|
||||
|
@ -195,13 +196,13 @@ depends on the scorer settings:
|
|||
> print(scores["cats_macro_auc"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The `Example` objects holding both the predictions and the correct gold-standard annotations. |
|
||||
| `attr` | `str` | The attribute to score. |
|
||||
| _keyword-only_ | | |
|
||||
| `getter` | `Callable` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. |
|
||||
| labels | `Iterable[str]` | The set of possible labels. Defaults to `[]`. |
|
||||
| `multi_label` | `bool` | Whether the attribute allows multiple labels. Defaults to `True`. |
|
||||
| `positive_label` | `str` | The positive label for a binary task with exclusive classes. Defaults to `None`. |
|
||||
| **RETURNS** | `Dict` | A dictionary containing the scores, with inapplicable scores as `None`. |
|
||||
| Name | Description |
|
||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The `Example` objects holding both the predictions and the correct gold-standard annotations. ~~Iterable[Example]~~ |
|
||||
| `attr` | The attribute to score. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `getter` | Defaults to `getattr`. If provided, `getter(doc, attr)` should return the cats for an individual `Doc`. ~~Callable[[Doc, str], Dict[str, float]]~~ |
|
||||
| `labels` | The set of possible labels. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `multi_label` | Whether the attribute allows multiple labels. Defaults to `True`. ~~bool~~ |
|
||||
| `positive_label` | The positive label for a binary task with exclusive classes. Defaults to `None`. ~~Optional[str]~~ |
|
||||
| **RETURNS** | A dictionary containing the scores, with inapplicable scores as `None`. ~~Dict[str, Optional[float]]~~ |
|
||||
|
|
|
@ -29,9 +29,9 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("senter", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------- | ----------------------------------- |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [Tagger](/api/architectures#Tagger) |
|
||||
| Setting | Description |
|
||||
| ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/senter.pyx
|
||||
|
@ -60,11 +60,11 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| Name | Description |
|
||||
| ------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
|
||||
## SentenceRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -85,10 +85,10 @@ and all pipeline components are applied to the `Doc` in order. Both
|
|||
> processed = senter(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## SentenceRecognizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -107,12 +107,12 @@ and [`pipe`](/api/sentencerecognizer#pipe) delegate to the
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## SentenceRecognizer.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -132,13 +132,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = senter.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/sentencerecognizer#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## SentenceRecognizer.predict {#predict tag="method"}
|
||||
|
||||
|
@ -152,10 +152,10 @@ modifying them.
|
|||
> scores = senter.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## SentenceRecognizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -169,10 +169,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> senter.set_annotations([doc1, doc2], scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ------------------------------------------------------------ |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `SentenceRecognizer.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------------------ |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `SentenceRecognizer.predict`. |
|
||||
|
||||
## SentenceRecognizer.update {#update tag="method"}
|
||||
|
||||
|
@ -189,15 +189,15 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
|
|||
> losses = senter.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/sentencerecognizer#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
|
@ -213,14 +213,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
|||
> losses = senter.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## SentenceRecognizer.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -235,11 +235,11 @@ predicted scores.
|
|||
> loss, d_loss = senter.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | - | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## SentenceRecognizer.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -251,10 +251,10 @@ Score a batch of examples.
|
|||
> scores = senter.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | ------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, float]~~ |
|
||||
|
||||
## SentenceRecognizer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -267,9 +267,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = senter.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## SentenceRecognizer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -284,9 +284,9 @@ context, the original parameters are restored.
|
|||
> senter.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## SentenceRecognizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -299,11 +299,11 @@ Serialize the pipe to disk.
|
|||
> senter.to_disk("/path/to/senter")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## SentenceRecognizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -316,12 +316,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> senter.from_disk("/path/to/senter")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | -------------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `SentenceRecognizer` | The modified `SentenceRecognizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `SentenceRecognizer` object. ~~SentenceRecognizer~~ |
|
||||
|
||||
## SentenceRecognizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -334,11 +334,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `SentenceRecognizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `SentenceRecognizer` object. ~~bytes~~ |
|
||||
|
||||
## SentenceRecognizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -352,12 +352,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> senter.from_bytes(senter_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | -------------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `SentenceRecognizer` | The `SentenceRecognizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `SentenceRecognizer` object. ~~SentenceRecognizer~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -28,9 +28,9 @@ how the component should be configured. You can override its settings via the
|
|||
> nlp.add_pipe("entity_ruler", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------------- | ----------- | ---------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. | `None` |
|
||||
| Setting | Description |
|
||||
| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults if not set. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/sentencizer.pyx
|
||||
|
@ -51,10 +51,10 @@ Initialize the sentencizer.
|
|||
> sentencizer = Sentencizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `punct_chars` | `List[str]` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. |
|
||||
| `punct_chars` | Optional custom list of punctuation characters that mark sentence ends. See below for defaults. ~~Optional[List[str]]~~ |
|
||||
|
||||
```python
|
||||
### punct_chars defaults
|
||||
|
@ -87,10 +87,10 @@ the component has been added to the pipeline using
|
|||
> assert len(list(doc.sents)) == 2
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with added sentence boundaries. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. ~~Doc~~ |
|
||||
| **RETURNS** | The modified `Doc` with added sentence boundaries. ~~Doc~~ |
|
||||
|
||||
## Sentencizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -106,12 +106,12 @@ applied to the `Doc` in order.
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------- |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Sentencizer.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -123,10 +123,10 @@ Score a batch of examples.
|
|||
> scores = sentencizer.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | ------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_spans`](/api/scorer#score_spans). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## Sentencizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -142,9 +142,9 @@ a file `sentencizer.json`. This also happens automatically when you save an
|
|||
> sentencizer.to_disk("/path/to/sentencizer.json")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Description |
|
||||
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a JSON file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
|
||||
## Sentencizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -159,10 +159,10 @@ added to its pipeline.
|
|||
> sentencizer.from_disk("/path/to/sentencizer.json")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The modified `Sentencizer` object. ~~Sentencizer~~ |
|
||||
|
||||
## Sentencizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -176,9 +176,9 @@ Serialize the sentencizer settings to a bytestring.
|
|||
> sentencizer_bytes = sentencizer.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------- |
|
||||
| **RETURNS** | bytes | The serialized data. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------ |
|
||||
| **RETURNS** | The serialized data. ~~bytes~~ |
|
||||
|
||||
## Sentencizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -192,7 +192,7 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> sentencizer.from_bytes(sentencizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ---------------------------------- |
|
||||
| `bytes_data` | bytes | The bytestring to load. |
|
||||
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
|
||||
| Name | Description |
|
||||
| ------------ | -------------------------------------------------- |
|
||||
| `bytes_data` | The bytestring to load. ~~bytes~~ |
|
||||
| **RETURNS** | The modified `Sentencizer` object. ~~Sentencizer~~ |
|
||||
|
|
|
@ -18,14 +18,14 @@ Create a Span object from the slice `doc[start : end]`.
|
|||
> assert [t.text for t in span] == ["it", "back", "!"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---------------------------------------- | --------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| `start` | int | The index of the first token of the span. |
|
||||
| `end` | int | The index of the first token after the span. |
|
||||
| `label` | int / str | A label to attach to the span, e.g. for named entities. As of v2.1, the label can also be a string. |
|
||||
| `kb_id` | int / str | A knowledge base ID to attach to the span, e.g. for named entities. The ID can be an integer or a string. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. |
|
||||
| Name | Description |
|
||||
| -------- | --------------------------------------------------------------------------------------- |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
| `start` | The index of the first token of the span. ~~int~~ |
|
||||
| `end` | The index of the first token after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
|
||||
| `kb_id` | A knowledge base ID to attach to the span, e.g. for named entities. ~~Union[str, int]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Span.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -39,10 +39,10 @@ Get a `Token` object.
|
|||
> assert span[1].text == "back"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------------------------------- |
|
||||
| `i` | int | The index of the token within the span. |
|
||||
| **RETURNS** | `Token` | The token at `span[i]`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------- |
|
||||
| `i` | The index of the token within the span. ~~int~~ |
|
||||
| **RETURNS** | The token at `span[i]`. ~~Token~~ |
|
||||
|
||||
Get a `Span` object.
|
||||
|
||||
|
@ -54,10 +54,10 @@ Get a `Span` object.
|
|||
> assert span[1:3].text == "back!"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | -------------------------------- |
|
||||
| `start_end` | tuple | The slice of the span to get. |
|
||||
| **RETURNS** | `Span` | The span at `span[start : end]`. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------- |
|
||||
| `start_end` | The slice of the span to get. ~~Tuple[int, int]~~ |
|
||||
| **RETURNS** | The span at `span[start : end]`. ~~Span~~ |
|
||||
|
||||
## Span.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
|
@ -71,9 +71,9 @@ Iterate over `Token` objects.
|
|||
> assert [t.text for t in span] == ["it", "back", "!"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ----------------- |
|
||||
| **YIELDS** | `Token` | A `Token` object. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------- |
|
||||
| **YIELDS** | A `Token` object. ~~Token~~ |
|
||||
|
||||
## Span.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -87,9 +87,9 @@ Get the number of tokens in the span.
|
|||
> assert len(span) == 3
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------- |
|
||||
| **RETURNS** | int | The number of tokens in the span. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------- |
|
||||
| **RETURNS** | The number of tokens in the span. ~~int~~ |
|
||||
|
||||
## Span.set_extension {#set_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -107,14 +107,14 @@ For details, see the documentation on
|
|||
> assert doc[1:4]._.has_city
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. |
|
||||
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
|
||||
| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. |
|
||||
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
|
||||
| `setter` | callable | Setter function that takes the `Span` and a value, and modifies the object. Is called when the user writes to the `Span._` attribute. |
|
||||
| `force` | bool | Force overwriting existing attribute. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `span._.my_attr`. ~~str~~ |
|
||||
| `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ |
|
||||
| `method` | Set a custom method on the object, for example `span._.compare(other_span)`. ~~Optional[Callable[[Span, ...], Any]]~~ |
|
||||
| `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Span], Any]]~~ |
|
||||
| `setter` | Setter function that takes the `Span` and a value, and modifies the object. Is called when the user writes to the `Span._` attribute. ~~Optional[Callable[[Span, Any], None]]~~ |
|
||||
| `force` | Force overwriting existing attribute. ~~bool~~ |
|
||||
|
||||
## Span.get_extension {#get_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -131,10 +131,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
|
|||
> assert extension == (False, None, None, None)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the extension. ~~str~~ |
|
||||
| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
|
||||
|
||||
## Span.has_extension {#has_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -148,10 +148,10 @@ Check whether an extension has been registered on the `Span` class.
|
|||
> assert Span.has_extension("is_city")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| `name` | str | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| `name` | Name of the extension to check. ~~str~~ |
|
||||
| **RETURNS** | Whether the extension has been registered. ~~bool~~ |
|
||||
|
||||
## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
|
||||
|
||||
|
@ -166,10 +166,10 @@ Remove a previously registered extension.
|
|||
> assert not Span.has_extension("is_city")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the extension. ~~str~~ |
|
||||
| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
|
||||
|
||||
## Span.char_span {#char_span tag="method" new="2.2.4"}
|
||||
|
||||
|
@ -184,14 +184,14 @@ the character indices don't map to a valid span.
|
|||
> assert span.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | --------------------------------------------------------------------- |
|
||||
| `start` | int | The index of the first character of the span. |
|
||||
| `end` | int | The index of the last character after the span. |
|
||||
| `label` | uint64 / str | A label to attach to the span, e.g. for named entities. |
|
||||
| `kb_id` | uint64 / str | An ID from a knowledge base to capture the meaning of a named entity. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype="float32"]` | A meaning representation of the span. |
|
||||
| **RETURNS** | `Span` | The newly constructed object or `None`. |
|
||||
| Name | Description |
|
||||
| ------------------------------------ | ----------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| **RETURNS** | The newly constructed object or `None`. ~~Optional[Span]~~ |
|
||||
|
||||
## Span.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
@ -209,10 +209,10 @@ using an average of word vectors.
|
|||
> assert apples_oranges == oranges_apples
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------------------------------------- |
|
||||
| `other` | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. |
|
||||
| **RETURNS** | float | A scalar similarity score. Higher is more similar. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
|
||||
| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
|
||||
|
||||
## Span.get_lca_matrix {#get_lca_matrix tag="method"}
|
||||
|
||||
|
@ -229,9 +229,9 @@ ancestor is found, e.g. if span excludes a necessary ancestor.
|
|||
> # array([[0, 0, 0], [0, 1, 2], [0, 2, 2]], dtype=int32)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------------------------------- | ------------------------------------------------ |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=2, dtype="int32"]` | The lowest common ancestor matrix of the `Span`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | The lowest common ancestor matrix of the `Span`. ~~numpy.ndarray[ndim=2, dtype=int32]~~ |
|
||||
|
||||
## Span.to_array {#to_array tag="method" new="2"}
|
||||
|
||||
|
@ -249,10 +249,10 @@ shape `(N, M)`, where `N` is the length of the document. The values will be
|
|||
> np_array = span.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | -------------------------------------------------------------------------------------------------------- |
|
||||
| `attr_ids` | list | A list of attribute ID ints. |
|
||||
| **RETURNS** | `numpy.ndarray[long, ndim=2]` | A feature matrix, with one row per word, and one column per attribute indicated in the input `attr_ids`. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `attr_ids` | A list of attributes (int IDs or string names) or a single attribute (int ID or string name). ~~Union[int, str, List[Union[int, str]]]~~ |
|
||||
| **RETURNS** | The exported attributes as a numpy array. ~~Union[numpy.ndarray[ndim=2, dtype=uint64], numpy.ndarray[ndim=1, dtype=uint64]]~~ |
|
||||
|
||||
## Span.ents {#ents tag="property" new="2.0.13" model="ner"}
|
||||
|
||||
|
@ -270,9 +270,9 @@ if the entity recognizer has been applied.
|
|||
> assert ents[0].text == "Mr. Best"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------- |
|
||||
| **RETURNS** | tuple | Entities in the span, one `Span` per entity. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------- |
|
||||
| **RETURNS** | Entities in the span, one `Span` per entity. ~~Tuple[Span, ...]~~ |
|
||||
|
||||
## Span.as_doc {#as_doc tag="method"}
|
||||
|
||||
|
@ -287,10 +287,10 @@ Create a new `Doc` object corresponding to the `Span`, with a copy of the data.
|
|||
> assert doc2.text == "New York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ----- | ---------------------------------------------------- |
|
||||
| `copy_user_data` | bool | Whether or not to copy the original doc's user data. |
|
||||
| **RETURNS** | `Doc` | A `Doc` object of the `Span`'s content. |
|
||||
| Name | Description |
|
||||
| ---------------- | ------------------------------------------------------------- |
|
||||
| `copy_user_data` | Whether or not to copy the original doc's user data. ~~bool~~ |
|
||||
| **RETURNS** | A `Doc` object of the `Span`'s content. ~~Doc~~ |
|
||||
|
||||
## Span.root {#root tag="property" model="parser"}
|
||||
|
||||
|
@ -309,9 +309,9 @@ taken.
|
|||
> assert new_york.root.text == "York"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------- |
|
||||
| **RETURNS** | `Token` | The root token. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------- |
|
||||
| **RETURNS** | The root token. ~~Token~~ |
|
||||
|
||||
## Span.conjuncts {#conjuncts tag="property" model="parser"}
|
||||
|
||||
|
@ -325,9 +325,9 @@ A tuple of tokens coordinated to `span.root`.
|
|||
> assert [t.text for t in apples_conjuncts] == ["oranges"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------- |
|
||||
| **RETURNS** | `tuple` | The coordinated tokens. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | The coordinated tokens. ~~Tuple[Token, ...]~~ |
|
||||
|
||||
## Span.lefts {#lefts tag="property" model="parser"}
|
||||
|
||||
|
@ -341,9 +341,9 @@ Tokens that are to the left of the span, whose heads are within the span.
|
|||
> assert lefts == ["New"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ------------------------------------ |
|
||||
| **YIELDS** | `Token` | A left-child of a token of the span. |
|
||||
| Name | Description |
|
||||
| ---------- | ---------------------------------------------- |
|
||||
| **YIELDS** | A left-child of a token of the span. ~~Token~~ |
|
||||
|
||||
## Span.rights {#rights tag="property" model="parser"}
|
||||
|
||||
|
@ -357,9 +357,9 @@ Tokens that are to the right of the span, whose heads are within the span.
|
|||
> assert rights == ["in"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ------------------------------------- |
|
||||
| **YIELDS** | `Token` | A right-child of a token of the span. |
|
||||
| Name | Description |
|
||||
| ---------- | ----------------------------------------------- |
|
||||
| **YIELDS** | A right-child of a token of the span. ~~Token~~ |
|
||||
|
||||
## Span.n_lefts {#n_lefts tag="property" model="parser"}
|
||||
|
||||
|
@ -373,9 +373,9 @@ the span.
|
|||
> assert doc[3:7].n_lefts == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------- |
|
||||
| **RETURNS** | int | The number of left-child tokens. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------- |
|
||||
| **RETURNS** | The number of left-child tokens. ~~int~~ |
|
||||
|
||||
## Span.n_rights {#n_rights tag="property" model="parser"}
|
||||
|
||||
|
@ -389,9 +389,9 @@ the span.
|
|||
> assert doc[2:4].n_rights == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------- |
|
||||
| **RETURNS** | int | The number of right-child tokens. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------- |
|
||||
| **RETURNS** | The number of right-child tokens. ~~int~~ |
|
||||
|
||||
## Span.subtree {#subtree tag="property" model="parser"}
|
||||
|
||||
|
@ -405,9 +405,9 @@ Tokens within the span and tokens which descend from them.
|
|||
> assert subtree == ["Give", "it", "back", "!"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ------------------------------------------------- |
|
||||
| **YIELDS** | `Token` | A token within the span, or a descendant from it. |
|
||||
| Name | Description |
|
||||
| ---------- | ----------------------------------------------------------- |
|
||||
| **YIELDS** | A token within the span, or a descendant from it. ~~Token~~ |
|
||||
|
||||
## Span.has_vector {#has_vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -420,9 +420,9 @@ A boolean value indicating whether a word vector is associated with the object.
|
|||
> assert doc[1:].has_vector
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------- |
|
||||
| **RETURNS** | bool | Whether the span has a vector data attached. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------- |
|
||||
| **RETURNS** | Whether the span has vector data attached. ~~bool~~ |
|
||||
|
||||
## Span.vector {#vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -437,9 +437,9 @@ vectors.
|
|||
> assert doc[1:].vector.shape == (300,)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | --------------------------------------------------- |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the span's semantics. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | A 1-dimensional array representing the span's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Span.vector_norm {#vector_norm tag="property" model="vectors"}
|
||||
|
||||
|
@ -454,31 +454,31 @@ The L2 norm of the span's vector representation.
|
|||
> assert doc[1:].vector_norm != doc[2:].vector_norm
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------- |
|
||||
| **RETURNS** | float | The L2 norm of the vector representation. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------------ | -------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The span's slice of the parent `Doc`'s tensor. |
|
||||
| `sent` | `Span` | The sentence span that this span is a part of. |
|
||||
| `start` | int | The token offset for the start of the span. |
|
||||
| `end` | int | The token offset for the end of the span. |
|
||||
| `start_char` | int | The character offset for the start of the span. |
|
||||
| `end_char` | int | The character offset for the end of the span. |
|
||||
| `text` | str | A string representation of the span text. |
|
||||
| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. |
|
||||
| `orth` | int | ID of the verbatim text content. |
|
||||
| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `label` | int | The hash value of the span's label. |
|
||||
| `label_` | str | The span's label. |
|
||||
| `lemma_` | str | The span's lemma. |
|
||||
| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. |
|
||||
| `kb_id_` | str | The knowledge base ID referred to by the span. |
|
||||
| `ent_id` | int | The hash value of the named entity the token is an instance of. |
|
||||
| `ent_id_` | str | The string ID of the named entity the token is an instance of. |
|
||||
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Name | Description |
|
||||
| --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
| `tensor` <Tag variant="new">2.1.7</Tag> | The span's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
|
||||
| `sent` | The sentence span that this span is a part of. ~~Span~~ |
|
||||
| `start` | The token offset for the start of the span. ~~int~~ |
|
||||
| `end` | The token offset for the end of the span. ~~int~~ |
|
||||
| `start_char` | The character offset for the start of the span. ~~int~~ |
|
||||
| `end_char` | The character offset for the end of the span. ~~int~~ |
|
||||
| `text` | A string representation of the span text. ~~str~~ |
|
||||
| `text_with_ws` | The text content of the span with a trailing whitespace character if the last token has one. ~~str~~ |
|
||||
| `orth` | ID of the verbatim text content. ~~int~~ |
|
||||
| `orth_` | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
|
||||
| `label` | The hash value of the span's label. ~~int~~ |
|
||||
| `label_` | The span's label. ~~str~~ |
|
||||
| `lemma_` | The span's lemma, formed by joining the lemmas (`token.lemma_`) of the tokens in the span. ~~str~~ |
|
||||
| `kb_id` | The hash value of the knowledge base ID referred to by the span. ~~int~~ |
|
||||
| `kb_id_` | The knowledge base ID referred to by the span. ~~str~~ |
|
||||
| `ent_id` | The hash value of the named entity the span is an instance of. ~~int~~ |
|
||||
| `ent_id_` | The string ID of the named entity the span is an instance of. ~~str~~ |
|
||||
| `sentiment` | A scalar value indicating the positivity or negativity of the span. ~~float~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
|
|
|
@ -19,9 +19,9 @@ Create the `StringStore`.
|
|||
> stringstore = StringStore(["apple", "orange"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------- | ------------------------------------------ |
|
||||
| `strings` | iterable | A sequence of strings to add to the store. |
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------------------------- |
|
||||
| `strings` | A sequence of strings to add to the store. ~~Optional[Iterable[str]]~~ |
|
||||
|
||||
## StringStore.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -34,9 +34,9 @@ Get the number of strings in the store.
|
|||
> assert len(stringstore) == 2
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------- |
|
||||
| **RETURNS** | int | The number of strings in the store. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| **RETURNS** | The number of strings in the store. ~~int~~ |
|
||||
|
||||
## StringStore.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -51,10 +51,10 @@ Retrieve a string from a given hash, or vice versa.
|
|||
> assert stringstore[apple_hash] == "apple"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | -------------------- | -------------------------- |
|
||||
| `string_or_id` | bytes, str or uint64 | The value to encode. |
|
||||
| **RETURNS** | str or int | The value to be retrieved. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------- |
|
||||
| `string_or_id` | The value to encode. ~~Union[bytes, str, int]~~ |
|
||||
| **RETURNS** | The value to be retrieved. ~~Union[str, int]~~ |
|
||||
|
||||
## StringStore.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -68,15 +68,15 @@ Check whether a string is in the store.
|
|||
> assert not "cherry" in stringstore
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------- |
|
||||
| `string` | str | The string to check. |
|
||||
| **RETURNS** | bool | Whether the store contains the string. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------- |
|
||||
| `string` | The string to check. ~~str~~ |
|
||||
| **RETURNS** | Whether the store contains the string. ~~bool~~ |
|
||||
|
||||
## StringStore.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
Iterate over the strings in the store, in order. Note that a newly initialized
|
||||
store will always include an empty string `''` at position `0`.
|
||||
store will always include an empty string `""` at position `0`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -86,9 +86,9 @@ store will always include an empty string `''` at position `0`.
|
|||
> assert all_strings == ["apple", "orange"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | ---------------------- |
|
||||
| **YIELDS** | str | A string in the store. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------ |
|
||||
| **YIELDS** | A string in the store. ~~str~~ |
|
||||
|
||||
## StringStore.add {#add tag="method" new="2"}
|
||||
|
||||
|
@ -105,10 +105,10 @@ Add a string to the `StringStore`.
|
|||
> assert stringstore["banana"] == banana_hash
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ------------------------ |
|
||||
| `string` | str | The string to add. |
|
||||
| **RETURNS** | uint64 | The string's hash value. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `string` | The string to add. ~~str~~ |
|
||||
| **RETURNS** | The string's hash value. ~~int~~ |
|
||||
|
||||
## StringStore.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
|
@ -120,9 +120,9 @@ Save the current state to a directory.
|
|||
> stringstore.to_disk("/path/to/strings")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Description |
|
||||
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
|
||||
## StringStore.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -135,10 +135,10 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> stringstore = StringStore().from_disk("/path/to/strings")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `StringStore` | The modified `StringStore` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The modified `StringStore` object. ~~StringStore~~ |
|
||||
|
||||
## StringStore.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -150,9 +150,9 @@ Serialize the current state to a binary string.
|
|||
> store_bytes = stringstore.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------ |
|
||||
| **RETURNS** | bytes | The serialized form of the `StringStore` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------- |
|
||||
| **RETURNS** | The serialized form of the `StringStore` object. ~~bytes~~ |
|
||||
|
||||
## StringStore.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -166,10 +166,10 @@ Load state from a binary string.
|
|||
> new_store = StringStore().from_bytes(store_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | ------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `StringStore` | The `StringStore` object. |
|
||||
| Name | Description |
|
||||
| ------------ | ----------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| **RETURNS** | The `StringStore` object. ~~StringStore~~ |
|
||||
|
||||
## Utilities {#util}
|
||||
|
||||
|
@ -184,7 +184,7 @@ Get a 64-bit hash for a given string.
|
|||
> assert hash_string("apple") == 8566208034543834098
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ------------------- |
|
||||
| `string` | str | The string to hash. |
|
||||
| **RETURNS** | uint64 | The hash. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------- |
|
||||
| `string` | The string to hash. ~~str~~ |
|
||||
| **RETURNS** | The hash. ~~int~~ |
|
||||
|
|
|
@ -28,10 +28,10 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("tagger", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------- |
|
||||
| `set_morphology` | bool | Whether to set morphological features. | `False` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). | [Tagger](/api/architectures#Tagger) |
|
||||
| Setting | Description |
|
||||
| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `set_morphology` | Whether to set morphological features. Defaults to `False`. ~~bool~~ |
|
||||
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). Defaults to [Tagger](/api/architectures#Tagger). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tagger.pyx
|
||||
|
@ -58,13 +58,13 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `set_morphology` | bool | Whether to set morphological features. |
|
||||
| Name | Description |
|
||||
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | A model instance that predicts the tag probabilities. The output vectors should match the number of tags in size, and be normalized as probabilities (all scores between 0 and 1, with the rows summing to `1`). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `set_morphology` | Whether to set morphological features. ~~bool~~ |
|
||||
|
||||
## Tagger.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -84,10 +84,10 @@ and all pipeline components are applied to the `Doc` in order. Both
|
|||
> processed = tagger(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## Tagger.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -105,12 +105,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/tagger#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------ |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of texts to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | Processed documents in the order of the original text. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Tagger.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -130,13 +130,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = tagger.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tagger#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Tagger.predict {#predict tag="method"}
|
||||
|
||||
|
@ -150,10 +150,10 @@ modifying them.
|
|||
> scores = tagger.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## Tagger.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -167,10 +167,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> tagger.set_annotations([doc1, doc2], scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ------------------------------------------------ |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `Tagger.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------ |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `Tagger.predict`. |
|
||||
|
||||
## Tagger.update {#update tag="method"}
|
||||
|
||||
|
@ -187,15 +187,15 @@ Delegates to [`predict`](/api/tagger#predict) and
|
|||
> losses = tagger.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tagger#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. The value keyed by the model's name is updated. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Tagger.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
|
@ -211,14 +211,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
|||
> losses = tagger.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Tagger.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -233,11 +233,11 @@ predicted scores.
|
|||
> loss, d_loss = tagger.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | - | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## Tagger.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -249,10 +249,10 @@ Score a batch of examples.
|
|||
> scores = tagger.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_token_attr`](/api/scorer#score_token_attr) for the attributes `"pos"`, `"tag"` and `"lemma"`. ~~Dict[str, float]~~ |
|
||||
|
||||
## Tagger.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -265,9 +265,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = tagger.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Tagger.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -282,9 +282,9 @@ context, the original parameters are restored.
|
|||
> tagger.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## Tagger.add_label {#add_label tag="method"}
|
||||
|
||||
|
@ -297,10 +297,10 @@ Add a new label to the pipe.
|
|||
> tagger.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## Tagger.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -313,11 +313,11 @@ Serialize the pipe to disk.
|
|||
> tagger.to_disk("/path/to/tagger")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Tagger.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -330,12 +330,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> tagger.from_disk("/path/to/tagger")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Tagger` object. ~~Tagger~~ |
|
||||
|
||||
## Tagger.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -348,11 +348,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tagger` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Tagger` object. ~~bytes~~ |
|
||||
|
||||
## Tagger.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -366,12 +366,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> tagger.from_bytes(tagger_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tagger` | The `Tagger` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Tagger` object. ~~Tagger~~ |
|
||||
|
||||
## Tagger.labels {#labels tag="property"}
|
||||
|
||||
|
@ -384,9 +384,9 @@ The labels currently added to the component.
|
|||
> assert "MY_LABEL" in tagger.labels
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ---------------------------------- |
|
||||
| **RETURNS** | `Tuple[str]` | The labels added to the component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -35,10 +35,10 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("textcat", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| -------- | ------------------------------------------ | --------------------------------------------------------------------------------------- | ----------------------------------------------------- |
|
||||
| `labels` | `List[str]` | A list of categories to learn. If empty, the model infers the categories from the data. | `[]` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | A model instance that predicts scores for each category. | [TextCatEnsemble](/api/architectures#TextCatEnsemble) |
|
||||
| Setting | Description |
|
||||
| -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `labels` | A list of categories to learn. If empty, the model infers the categories from the data. Defaults to `[]`. ~~Iterable[str]~~ |
|
||||
| `model` | A model instance that predicts scores for each category. Defaults to [TextCatEnsemble](/api/architectures#TextCatEnsemble). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/textcat.py
|
||||
|
@ -65,13 +65,13 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| _keyword-only_ | | |
|
||||
| `labels` | `Iterable[str]` | The labels to use. |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| _keyword-only_ | |
|
||||
| `labels` | The labels to use. ~~Iterable[str]~~ |
|
||||
|
||||
## TextCategorizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -91,10 +91,10 @@ delegate to the [`predict`](/api/textcategorizer#predict) and
|
|||
> processed = textcat(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## TextCategorizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -113,12 +113,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/textcategorizer#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------- |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## TextCategorizer.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -138,13 +138,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = textcat.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------ |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/textcategorizer#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## TextCategorizer.predict {#predict tag="method"}
|
||||
|
||||
|
@ -158,10 +158,10 @@ modifying them.
|
|||
> scores = textcat.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## TextCategorizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -175,10 +175,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> textcat.set_annotations(docs, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | --------------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `TextCategorizer.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | --------------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `TextCategorizer.predict`. |
|
||||
|
||||
## TextCategorizer.update {#update tag="method"}
|
||||
|
||||
|
@ -195,15 +195,15 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
|
|||
> losses = textcat.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/textcategorizer#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
|
@ -219,14 +219,14 @@ the "catastrophic forgetting" problem. This feature is experimental.
|
|||
> losses = textcat.rehearse(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## TextCategorizer.get_loss {#get_loss tag="method"}
|
||||
|
||||
|
@ -241,11 +241,11 @@ predicted scores.
|
|||
> loss, d_loss = textcat.get_loss(examples, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------- | --------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The batch of examples. |
|
||||
| `scores` | - | Scores representing the model's predictions. |
|
||||
| **RETURNS** | `Tuple[float, float]` | The loss and the gradient, i.e. `(loss, gradient)`. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------- |
|
||||
| `examples` | The batch of examples. ~~Iterable[Example]~~ |
|
||||
| `scores` | Scores representing the model's predictions. |
|
||||
| **RETURNS** | The loss and the gradient, i.e. `(loss, gradient)`. ~~Tuple[float, float]~~ |
|
||||
|
||||
## TextCategorizer.score {#score tag="method" new="3"}
|
||||
|
||||
|
@ -257,12 +257,12 @@ Score a batch of examples.
|
|||
> scores = textcat.score(examples)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------------------- | ---------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | The examples to score. |
|
||||
| Name | Description |
|
||||
| ---------------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | The examples to score. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `positive_label` | str | Optional positive label. |
|
||||
| **RETURNS** | `Dict[str, Any]` | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). |
|
||||
| `positive_label` | Optional positive label. ~~Optional[str]~~ |
|
||||
| **RETURNS** | The scores, produced by [`Scorer.score_cats`](/api/scorer#score_cats). ~~Dict[str, Union[float, Dict[str, float]]]~~ |
|
||||
|
||||
## TextCategorizer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -275,25 +275,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = textcat.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
|
||||
## TextCategorizer.add_label {#add_label tag="method"}
|
||||
|
||||
Add a new label to the pipe.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| **RETURNS** | int | `0` if the label is already present, otherwise `1`. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -307,9 +291,25 @@ Modify the pipe's model, to use the given parameter values.
|
|||
> textcat.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## TextCategorizer.add_label {#add_label tag="method"}
|
||||
|
||||
Add a new label to the pipe.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> textcat = nlp.add_pipe("textcat")
|
||||
> textcat.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `label` | The label to add. ~~str~~ |
|
||||
| **RETURNS** | `0` if the label is already present, otherwise `1`. ~~int~~ |
|
||||
|
||||
## TextCategorizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -322,11 +322,11 @@ Serialize the pipe to disk.
|
|||
> textcat.to_disk("/path/to/textcat")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## TextCategorizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -339,12 +339,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> textcat.from_disk("/path/to/textcat")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ----------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `TextCategorizer` object. ~~TextCategorizer~~ |
|
||||
|
||||
## TextCategorizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -357,11 +357,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `TextCategorizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `TextCategorizer` object. ~~bytes~~ |
|
||||
|
||||
## TextCategorizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -375,12 +375,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> textcat.from_bytes(textcat_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ----------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `TextCategorizer` | The `TextCategorizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `TextCategorizer` object. ~~TextCategorizer~~ |
|
||||
|
||||
## TextCategorizer.labels {#labels tag="property"}
|
||||
|
||||
|
@ -393,9 +393,9 @@ The labels currently added to the component.
|
|||
> assert "MY_LABEL" in textcat.labels
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------------------- |
|
||||
| **RETURNS** | tuple | The labels added to the component. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The labels added to the component. ~~Tuple[str, ...]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -40,9 +40,9 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("tok2vec", config=config)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** `List[Floats2d]`. The model to use. | [HashEmbedCNN](/api/architectures#HashEmbedCNN) |
|
||||
| Setting | Description |
|
||||
| ------- | ------------------------------------------------------------------------------------------------------------------ |
|
||||
| `model` | The model to use. Defaults to [HashEmbedCNN](/api/architectures#HashEmbedCNN). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/tok2vec.py
|
||||
|
@ -69,11 +69,11 @@ Create a new pipeline instance. In your application, you would normally use a
|
|||
shortcut for this and instantiate the component using its string name and
|
||||
[`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------------------------------------------ | ------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| Name | Description |
|
||||
| ------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
|
||||
## Tok2Vec.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -95,10 +95,10 @@ pipeline components are applied to the `Doc` in order. Both
|
|||
> processed = tok2vec(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## Tok2Vec.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -116,12 +116,12 @@ and [`set_annotations`](/api/tok2vec#set_annotations) methods.
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------- |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Tok2Vec.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -141,13 +141,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = tok2vec.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/tok2vec#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Tok2Vec.predict {#predict tag="method"}
|
||||
|
||||
|
@ -161,10 +161,10 @@ modifying them.
|
|||
> scores = tok2vec.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## Tok2Vec.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -178,10 +178,10 @@ Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
|||
> tok2vec.set_annotations(docs, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `Tok2Vec.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `Tok2Vec.predict`. |
|
||||
|
||||
## Tok2Vec.update {#update tag="method"}
|
||||
|
||||
|
@ -197,15 +197,15 @@ Delegates to [`predict`](/api/tok2vec#predict).
|
|||
> losses = tok2vec.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects to learn from. |
|
||||
| Name | Description |
|
||||
| ----------------- | ---------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects to learn from. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/tok2vec#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Tok2Vec.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -218,9 +218,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = tok2vec.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -235,9 +235,9 @@ context, the original parameters are restored.
|
|||
> tok2vec.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## Tok2Vec.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -250,11 +250,11 @@ Serialize the pipe to disk.
|
|||
> tok2vec.to_disk("/path/to/tok2vec")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Tok2Vec.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -267,12 +267,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> tok2vec.from_disk("/path/to/tok2vec")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Tok2Vec` object. ~~Tok2Vec~~ |
|
||||
|
||||
## Tok2Vec.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -285,11 +285,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Tok2Vec` object. ~~bytes~~ |
|
||||
|
||||
## Tok2Vec.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -303,12 +303,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> tok2vec.from_bytes(tok2vec_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Tok2Vec` object. ~~Tok2Vec~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -17,11 +17,11 @@ Construct a `Token` object.
|
|||
> assert token.text == "Give"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ------- | ------------------------------------------- |
|
||||
| `vocab` | `Vocab` | A storage container for lexical types. |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| `offset` | int | The index of the token within the document. |
|
||||
| Name | Description |
|
||||
| -------- | --------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
| `offset` | The index of the token within the document. ~~int~~ |
|
||||
|
||||
## Token.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -35,9 +35,9 @@ The number of unicode characters in the token, i.e. `token.text`.
|
|||
> assert len(token) == 4
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------------------- |
|
||||
| **RETURNS** | int | The number of unicode characters in the token. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The number of unicode characters in the token. ~~int~~ |
|
||||
|
||||
## Token.set_extension {#set_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -55,14 +55,14 @@ For details, see the documentation on
|
|||
> assert doc[3]._.is_fruit
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. |
|
||||
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
|
||||
| `method` | callable | Set a custom method on the object, for example `token._.compare(other_token)`. |
|
||||
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
|
||||
| `setter` | callable | Setter function that takes the `Token` and a value, and modifies the object. Is called when the user writes to the `Token._` attribute. |
|
||||
| `force` | bool | Force overwriting existing attribute. |
|
||||
| Name | Description |
|
||||
| --------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the attribute to set by the extension. For example, `"my_attr"` will be available as `token._.my_attr`. ~~str~~ |
|
||||
| `default` | Optional default value of the attribute if no getter or method is defined. ~~Optional[Any]~~ |
|
||||
| `method` | Set a custom method on the object, for example `token._.compare(other_token)`. ~~Optional[Callable[[Token, ...], Any]]~~ |
|
||||
| `getter` | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. ~~Optional[Callable[[Token], Any]]~~ |
|
||||
| `setter` | Setter function that takes the `Token` and a value, and modifies the object. Is called when the user writes to the `Token._` attribute. ~~Optional[Callable[[Token, Any], None]]~~ |
|
||||
| `force` | Force overwriting existing attribute. ~~bool~~ |
|
||||
|
||||
## Token.get_extension {#get_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -79,10 +79,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
|
|||
> assert extension == (False, None, None, None)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the extension. ~~str~~ |
|
||||
| **RETURNS** | A `(default, method, getter, setter)` tuple of the extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
|
||||
|
||||
## Token.has_extension {#has_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -96,10 +96,10 @@ Check whether an extension has been registered on the `Token` class.
|
|||
> assert Token.has_extension("is_fruit")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| `name` | str | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| `name` | Name of the extension to check. ~~str~~ |
|
||||
| **RETURNS** | Whether the extension has been registered. ~~bool~~ |
|
||||
|
||||
## Token.remove_extension {#remove_extension tag="classmethod" new="2.0.11"}
|
||||
|
||||
|
@ -114,10 +114,10 @@ Remove a previously registered extension.
|
|||
> assert not Token.has_extension("is_fruit")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Name of the extension. ~~str~~ |
|
||||
| **RETURNS** | A `(default, method, getter, setter)` tuple of the removed extension. ~~Tuple[Optional[Any], Optional[Callable], Optional[Callable], Optional[Callable]]~~ |
|
||||
|
||||
## Token.check_flag {#check_flag tag="method"}
|
||||
|
||||
|
@ -132,10 +132,10 @@ Check the value of a boolean flag.
|
|||
> assert token.check_flag(IS_TITLE) == True
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------- |
|
||||
| `flag_id` | int | The attribute ID of the flag to check. |
|
||||
| **RETURNS** | bool | Whether the flag is set. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------- |
|
||||
| `flag_id` | The attribute ID of the flag to check. ~~int~~ |
|
||||
| **RETURNS** | Whether the flag is set. ~~bool~~ |
|
||||
|
||||
## Token.similarity {#similarity tag="method" model="vectors"}
|
||||
|
||||
|
@ -150,10 +150,10 @@ Compute a semantic similarity estimate. Defaults to cosine over vectors.
|
|||
> assert apples_oranges == oranges_apples
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------------------------------------- |
|
||||
| other | - | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. |
|
||||
| **RETURNS** | float | A scalar similarity score. Higher is more similar. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `other` | The object to compare with. By default, accepts `Doc`, `Span`, `Token` and `Lexeme` objects. ~~Union[Doc, Span, Token, Lexeme]~~ |
|
||||
| **RETURNS** | A scalar similarity score. Higher is more similar. ~~float~~ |
|
||||
|
||||
## Token.nbor {#nbor tag="method"}
|
||||
|
||||
|
@ -167,10 +167,10 @@ Get a neighboring token.
|
|||
> assert give_nbor.text == "it"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------------------- |
|
||||
| `i` | int | The relative position of the token to get. Defaults to `1`. |
|
||||
| **RETURNS** | `Token` | The token at position `self.doc[self.i+i]`. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------- |
|
||||
| `i` | The relative position of the token to get. Defaults to `1`. ~~int~~ |
|
||||
| **RETURNS** | The token at position `self.doc[self.i+i]`. ~~Token~~ |
|
||||
|
||||
## Token.is_ancestor {#is_ancestor tag="method" model="parser"}
|
||||
|
||||
|
@ -186,10 +186,10 @@ dependency tree.
|
|||
> assert give.is_ancestor(it)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------------- |
|
||||
| descendant | `Token` | Another token. |
|
||||
| **RETURNS** | bool | Whether this token is the ancestor of the descendant. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------- |
|
||||
| `descendant` | Another token. ~~Token~~ |
|
||||
| **RETURNS** | Whether this token is the ancestor of the descendant. ~~bool~~ |
|
||||
|
||||
## Token.ancestors {#ancestors tag="property" model="parser"}
|
||||
|
||||
|
@ -205,9 +205,9 @@ The rightmost token of this token's syntactic descendants.
|
|||
> assert [t.text for t in he_ancestors] == ["pleaded"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | --------------------------------------------------------------------- |
|
||||
| **YIELDS** | `Token` | A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------------------------- |
|
||||
| **YIELDS** | A sequence of ancestor tokens such that `ancestor.is_ancestor(self)`. ~~Token~~ |
|
||||
|
||||
## Token.conjuncts {#conjuncts tag="property" model="parser"}
|
||||
|
||||
|
@ -221,9 +221,9 @@ A tuple of coordinated tokens, not including the token itself.
|
|||
> assert [t.text for t in apples_conjuncts] == ["oranges"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------- |
|
||||
| **RETURNS** | `tuple` | The coordinated tokens. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | The coordinated tokens. ~~Tuple[Token, ...]~~ |
|
||||
|
||||
## Token.children {#children tag="property" model="parser"}
|
||||
|
||||
|
@ -237,9 +237,9 @@ A sequence of the token's immediate syntactic children.
|
|||
> assert [t.text for t in give_children] == ["it", "back", "!"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ------------------------------------------- |
|
||||
| **YIELDS** | `Token` | A child token such that `child.head==self`. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------- |
|
||||
| **YIELDS** | A child token such that `child.head == self`. ~~Token~~ |
|
||||
|
||||
## Token.lefts {#lefts tag="property" model="parser"}
|
||||
|
||||
|
@ -253,9 +253,9 @@ The leftward immediate children of the word, in the syntactic dependency parse.
|
|||
> assert lefts == ["New"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | -------------------------- |
|
||||
| **YIELDS** | `Token` | A left-child of the token. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------ |
|
||||
| **YIELDS** | A left-child of the token. ~~Token~~ |
|
||||
|
||||
## Token.rights {#rights tag="property" model="parser"}
|
||||
|
||||
|
@ -269,9 +269,9 @@ The rightward immediate children of the word, in the syntactic dependency parse.
|
|||
> assert rights == ["in"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | --------------------------- |
|
||||
| **YIELDS** | `Token` | A right-child of the token. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------- |
|
||||
| **YIELDS** | A right-child of the token. ~~Token~~ |
|
||||
|
||||
## Token.n_lefts {#n_lefts tag="property" model="parser"}
|
||||
|
||||
|
@ -285,9 +285,9 @@ dependency parse.
|
|||
> assert doc[3].n_lefts == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------- |
|
||||
| **RETURNS** | int | The number of left-child tokens. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------- |
|
||||
| **RETURNS** | The number of left-child tokens. ~~int~~ |
|
||||
|
||||
## Token.n_rights {#n_rights tag="property" model="parser"}
|
||||
|
||||
|
@ -301,9 +301,9 @@ dependency parse.
|
|||
> assert doc[3].n_rights == 1
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------- |
|
||||
| **RETURNS** | int | The number of right-child tokens. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------- |
|
||||
| **RETURNS** | The number of right-child tokens. ~~int~~ |
|
||||
|
||||
## Token.subtree {#subtree tag="property" model="parser"}
|
||||
|
||||
|
@ -317,9 +317,9 @@ A sequence containing the token and all the token's syntactic descendants.
|
|||
> assert [t.text for t in give_subtree] == ["Give", "it", "back", "!"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | -------------------------------------------------------------------------- |
|
||||
| **YIELDS** | `Token` | A descendant token such that `self.is_ancestor(token)` or `token == self`. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------------------------------ |
|
||||
| **YIELDS** | A descendant token such that `self.is_ancestor(token)` or `token == self`. ~~Token~~ |
|
||||
|
||||
## Token.is_sent_start {#is_sent_start tag="property" new="2"}
|
||||
|
||||
|
@ -334,9 +334,9 @@ unknown. Defaults to `True` for the first token in the `Doc`.
|
|||
> assert not doc[5].is_sent_start
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------ |
|
||||
| **RETURNS** | bool | Whether the token starts a sentence. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| **RETURNS** | Whether the token starts a sentence. ~~bool~~ |
|
||||
|
||||
## Token.has_vector {#has_vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -350,9 +350,9 @@ A boolean value indicating whether a word vector is associated with the token.
|
|||
> assert apples.has_vector
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------- |
|
||||
| **RETURNS** | bool | Whether the token has a vector data attached. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | Whether the token has vector data attached. ~~bool~~ |
|
||||
|
||||
## Token.vector {#vector tag="property" model="vectors"}
|
||||
|
||||
|
@ -367,9 +367,9 @@ A real-valued meaning representation.
|
|||
> assert apples.vector.shape == (300,)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------------- | ---------------------------------------------------- |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=1, dtype="float32"]` | A 1D numpy array representing the token's semantics. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| **RETURNS** | A 1-dimensional array representing the token's vector. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Token.vector_norm {#vector_norm tag="property" model="vectors"}
|
||||
|
||||
|
@ -386,80 +386,80 @@ The L2 norm of the token's vector representation.
|
|||
> assert apples.vector_norm != pasta.vector_norm
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------- |
|
||||
| **RETURNS** | float | The L2 norm of the vector representation. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------- |
|
||||
| **RETURNS** | The L2 norm of the vector representation. ~~float~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------------- | ----------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The parent document. |
|
||||
| `lex` <Tag variant="new">3</Tag> | [`Lexeme`](/api/lexeme) | The underlying lexeme. |
|
||||
| `sent` <Tag variant="new">2.0.12</Tag> | [`Span`](/api/span) | The sentence span that this token is a part of. |
|
||||
| `text` | str | Verbatim text content. |
|
||||
| `text_with_ws` | str | Text content, with trailing space character if present. |
|
||||
| `whitespace_` | str | Trailing space character if present. |
|
||||
| `orth` | int | ID of the verbatim text content. |
|
||||
| `orth_` | str | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
|
||||
| `tensor` <Tag variant="new">2.1.7</Tag> | `ndarray` | The token's slice of the parent `Doc`'s tensor. |
|
||||
| `head` | `Token` | The syntactic parent, or "governor", of this token. |
|
||||
| `left_edge` | `Token` | The leftmost token of this token's syntactic descendants. |
|
||||
| `right_edge` | `Token` | The rightmost token of this token's syntactic descendants. |
|
||||
| `i` | int | The index of the token within the parent document. |
|
||||
| `ent_type` | int | Named entity type. |
|
||||
| `ent_type_` | str | Named entity type. |
|
||||
| `ent_iob` | int | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. |
|
||||
| `ent_iob_` | str | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. |
|
||||
| `ent_kb_id` <Tag variant="new">2.2</Tag> | int | Knowledge base ID that refers to the named entity this token is a part of, if any. |
|
||||
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | str | Knowledge base ID that refers to the named entity this token is a part of, if any. |
|
||||
| `ent_id` | int | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
|
||||
| `ent_id_` | str | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. |
|
||||
| `lemma` | int | Base form of the token, with no inflectional suffixes. |
|
||||
| `lemma_` | str | Base form of the token, with no inflectional suffixes. |
|
||||
| `norm` | int | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
|
||||
| `norm_` | str | The token's norm, i.e. a normalized form of the token text. Usually set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions) or [norm exceptions](/usage/adding-languages#norm-exceptions). |
|
||||
| `lower` | int | Lowercase form of the token. |
|
||||
| `lower_` | str | Lowercase form of the token text. Equivalent to `Token.text.lower()`. |
|
||||
| `shape` | int | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
||||
| `shape_` | str | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
||||
| `prefix` | int | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. |
|
||||
| `prefix_` | str | A length-N substring from the start of the token. Defaults to `N=1`. |
|
||||
| `suffix` | int | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. |
|
||||
| `suffix_` | str | Length-N substring from the end of the token. Defaults to `N=3`. |
|
||||
| `is_alpha` | bool | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. |
|
||||
| `is_ascii` | bool | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. |
|
||||
| `is_digit` | bool | Does the token consist of digits? Equivalent to `token.text.isdigit()`. |
|
||||
| `is_lower` | bool | Is the token in lowercase? Equivalent to `token.text.islower()`. |
|
||||
| `is_upper` | bool | Is the token in uppercase? Equivalent to `token.text.isupper()`. |
|
||||
| `is_title` | bool | Is the token in titlecase? Equivalent to `token.text.istitle()`. |
|
||||
| `is_punct` | bool | Is the token punctuation? |
|
||||
| `is_left_punct` | bool | Is the token a left punctuation mark, e.g. `"("` ? |
|
||||
| `is_right_punct` | bool | Is the token a right punctuation mark, e.g. `")"` ? |
|
||||
| `is_space` | bool | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. |
|
||||
| `is_bracket` | bool | Is the token a bracket? |
|
||||
| `is_quote` | bool | Is the token a quotation mark? |
|
||||
| `is_currency` <Tag variant="new">2.0.8</Tag> | bool | Is the token a currency symbol? |
|
||||
| `like_url` | bool | Does the token resemble a URL? |
|
||||
| `like_num` | bool | Does the token represent a number? e.g. "10.9", "10", "ten", etc. |
|
||||
| `like_email` | bool | Does the token resemble an email address? |
|
||||
| `is_oov` | bool | Is the token out-of-vocabulary (i.e. does it not have a word vector)? |
|
||||
| `is_stop` | bool | Is the token part of a "stop list"? |
|
||||
| `pos` | int | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
|
||||
| `pos_` | str | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). |
|
||||
| `tag` | int | Fine-grained part-of-speech. |
|
||||
| `tag_` | str | Fine-grained part-of-speech. |
|
||||
| `morph` | `MorphAnalysis` | Morphological analysis. |
|
||||
| `morph_` | str | Morphological analysis in UD FEATS format. |
|
||||
| `dep` | int | Syntactic dependency relation. |
|
||||
| `dep_` | str | Syntactic dependency relation. |
|
||||
| `lang` | int | Language of the parent document's vocabulary. |
|
||||
| `lang_` | str | Language of the parent document's vocabulary. |
|
||||
| `prob` | float | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). |
|
||||
| `idx` | int | The character offset of the token within the parent document. |
|
||||
| `sentiment` | float | A scalar value indicating the positivity or negativity of the token. |
|
||||
| `lex_id` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
|
||||
| `rank` | int | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. |
|
||||
| `cluster` | int | Brown cluster ID. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Name | Description |
|
||||
| -------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | The parent document. ~~Doc~~ |
|
||||
| `lex` <Tag variant="new">3</Tag> | The underlying lexeme. ~~Lexeme~~ |
|
||||
| `sent` <Tag variant="new">2.0.12</Tag> | The sentence span that this token is a part of. ~~Span~~ |
|
||||
| `text` | Verbatim text content. ~~str~~ |
|
||||
| `text_with_ws` | Text content, with trailing space character if present. ~~str~~ |
|
||||
| `whitespace_` | Trailing space character if present. ~~str~~ |
|
||||
| `orth` | ID of the verbatim text content. ~~int~~ |
|
||||
| `orth_` | Verbatim text content (identical to `Token.text`). Exists mostly for consistency with the other attributes. ~~str~~ |
|
||||
| `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ |
|
||||
| `tensor` <Tag variant="new">2.1.7</Tag> | The token's slice of the parent `Doc`'s tensor. ~~numpy.ndarray~~ |
|
||||
| `head` | The syntactic parent, or "governor", of this token. ~~Token~~ |
|
||||
| `left_edge` | The leftmost token of this token's syntactic descendants. ~~Token~~ |
|
||||
| `right_edge` | The rightmost token of this token's syntactic descendants. ~~Token~~ |
|
||||
| `i` | The index of the token within the parent document. ~~int~~ |
|
||||
| `ent_type` | Named entity type. ~~int~~ |
|
||||
| `ent_type_` | Named entity type. ~~str~~ |
|
||||
| `ent_iob` | IOB code of named entity tag. `3` means the token begins an entity, `2` means it is outside an entity, `1` means it is inside an entity, and `0` means no entity tag is set. ~~int~~ |
|
||||
| `ent_iob_` | IOB code of named entity tag. "B" means the token begins an entity, "I" means it is inside an entity, "O" means it is outside an entity, and "" means no entity tag is set. ~~str~~ |
|
||||
| `ent_kb_id` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~int~~ |
|
||||
| `ent_kb_id_` <Tag variant="new">2.2</Tag> | Knowledge base ID that refers to the named entity this token is a part of, if any. ~~str~~ |
|
||||
| `ent_id` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~int~~ |
|
||||
| `ent_id_` | ID of the entity the token is an instance of, if any. Currently not used, but potentially for coreference resolution. ~~str~~ |
|
||||
| `lemma` | Base form of the token, with no inflectional suffixes. ~~int~~ |
|
||||
| `lemma_` | Base form of the token, with no inflectional suffixes. ~~str~~ |
|
||||
| `norm` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~int~~ |
|
||||
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
|
||||
| `lower` | Lowercase form of the token. ~~int~~ |
|
||||
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
|
||||
| `shape` | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
|
||||
| `shape_` | Transform of the token's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
|
||||
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
|
||||
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
|
||||
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
|
||||
| `suffix_` | Length-N substring from the end of the token. Defaults to `N=3`. ~~str~~ |
|
||||
| `is_alpha` | Does the token consist of alphabetic characters? Equivalent to `token.text.isalpha()`. ~~bool~~ |
|
||||
| `is_ascii` | Does the token consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in token.text)`. ~~bool~~ |
|
||||
| `is_digit` | Does the token consist of digits? Equivalent to `token.text.isdigit()`. ~~bool~~ |
|
||||
| `is_lower` | Is the token in lowercase? Equivalent to `token.text.islower()`. ~~bool~~ |
|
||||
| `is_upper` | Is the token in uppercase? Equivalent to `token.text.isupper()`. ~~bool~~ |
|
||||
| `is_title` | Is the token in titlecase? Equivalent to `token.text.istitle()`. ~~bool~~ |
|
||||
| `is_punct` | Is the token punctuation? ~~bool~~ |
|
||||
| `is_left_punct` | Is the token a left punctuation mark, e.g. `"("` ? ~~bool~~ |
|
||||
| `is_right_punct` | Is the token a right punctuation mark, e.g. `")"` ? ~~bool~~ |
|
||||
| `is_space` | Does the token consist of whitespace characters? Equivalent to `token.text.isspace()`. ~~bool~~ |
|
||||
| `is_bracket` | Is the token a bracket? ~~bool~~ |
|
||||
| `is_quote` | Is the token a quotation mark? ~~bool~~ |
|
||||
| `is_currency` <Tag variant="new">2.0.8</Tag> | Is the token a currency symbol? ~~bool~~ |
|
||||
| `like_url` | Does the token resemble a URL? ~~bool~~ |
|
||||
| `like_num` | Does the token represent a number? e.g. "10.9", "10", "ten", etc. ~~bool~~ |
|
||||
| `like_email` | Does the token resemble an email address? ~~bool~~ |
|
||||
| `is_oov` | Is the token out-of-vocabulary (i.e. does it not have a word vector)? ~~bool~~ |
|
||||
| `is_stop` | Is the token part of a "stop list"? ~~bool~~ |
|
||||
| `pos` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~int~~ |
|
||||
| `pos_` | Coarse-grained part-of-speech from the [Universal POS tag set](https://universaldependencies.org/docs/u/pos/). ~~str~~ |
|
||||
| `tag` | Fine-grained part-of-speech. ~~int~~ |
|
||||
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
|
||||
| `morph` | Morphological analysis. ~~MorphAnalysis~~ |
|
||||
| `morph_` | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
| `dep` | Syntactic dependency relation. ~~int~~ |
|
||||
| `dep_` | Syntactic dependency relation. ~~str~~ |
|
||||
| `lang` | Language of the parent document's vocabulary. ~~int~~ |
|
||||
| `lang_` | Language of the parent document's vocabulary. ~~str~~ |
|
||||
| `prob` | Smoothed log probability estimate of token's word type (context-independent entry in the vocabulary). ~~float~~ |
|
||||
| `idx` | The character offset of the token within the parent document. ~~int~~ |
|
||||
| `sentiment` | A scalar value indicating the positivity or negativity of the token. ~~float~~ |
|
||||
| `lex_id` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
||||
| `rank` | Sequential ID of the token's lexical type, used to index into tables, e.g. for word vectors. ~~int~~ |
|
||||
| `cluster` | Brown cluster ID. ~~int~~ |
|
||||
| `_` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). ~~Underscore~~ |
|
||||
|
|
|
@ -45,15 +45,15 @@ the
|
|||
> tokenizer = nlp.tokenizer
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | `Vocab` | A storage container for lexical types. |
|
||||
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
|
||||
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
|
||||
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
|
||||
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
|
||||
| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
|
||||
| `url_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. |
|
||||
| Name | Description |
|
||||
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | A storage container for lexical types. ~~Vocab~~ |
|
||||
| `rules` | Exceptions and special-cases for the tokenizer. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
|
||||
| `prefix_search` | A function matching the signature of `re.compile(string).search` to match prefixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
| `suffix_search` | A function matching the signature of `re.compile(string).search` to match suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
| `infix_finditer` | A function matching the signature of `re.compile(string).finditer` to find infixes. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
|
||||
| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
| `url_match` | A function matching the signature of `re.compile(string).match` to find token matches after considering prefixes and suffixes. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
|
||||
## Tokenizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -66,10 +66,10 @@ Tokenize a string.
|
|||
> assert len(tokens) == 4
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------- |
|
||||
| `string` | str | The string to tokenize. |
|
||||
| **RETURNS** | `Doc` | A container for linguistic annotations. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------- |
|
||||
| `string` | The string to tokenize. ~~str~~ |
|
||||
| **RETURNS** | A container for linguistic annotations. ~~Doc~~ |
|
||||
|
||||
## Tokenizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -83,40 +83,40 @@ Tokenize a stream of texts.
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ----- | ---------------------------------------------------------------------------- |
|
||||
| `texts` | - | A sequence of unicode texts. |
|
||||
| `batch_size` | int | The number of texts to accumulate in an internal buffer. Defaults to `1000`. |
|
||||
| **YIELDS** | `Doc` | A sequence of Doc objects, in order. |
|
||||
| Name | Description |
|
||||
| ------------ | ------------------------------------------------------------------------------------ |
|
||||
| `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
|
||||
| `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
|
||||
| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ |
|
||||
|
||||
## Tokenizer.find_infix {#find_infix tag="method"}
|
||||
|
||||
Find internal split points of the string.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | str | The string to split. |
|
||||
| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `string` | The string to split. ~~str~~ |
|
||||
| **RETURNS** | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. ~~List[Match]~~ |
|
||||
|
||||
## Tokenizer.find_prefix {#find_prefix tag="method"}
|
||||
|
||||
Find the length of a prefix that should be segmented from the string, or `None`
|
||||
if no prefix rules match.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------------------ |
|
||||
| `string` | str | The string to segment. |
|
||||
| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------ |
|
||||
| `string` | The string to segment. ~~str~~ |
|
||||
| **RETURNS** | The length of the prefix if present, otherwise `None`. ~~Optional[int]~~ |
|
||||
|
||||
## Tokenizer.find_suffix {#find_suffix tag="method"}
|
||||
|
||||
Find the length of a suffix that should be segmented from the string, or `None`
|
||||
if no suffix rules match.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------------------ |
|
||||
| `string` | str | The string to segment. |
|
||||
| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------ |
|
||||
| `string` | The string to segment. ~~str~~ |
|
||||
| **RETURNS** | The length of the suffix if present, otherwise `None`. ~~Optional[int]~~ |
|
||||
|
||||
## Tokenizer.add_special_case {#add_special_case tag="method"}
|
||||
|
||||
|
@ -134,10 +134,10 @@ and examples.
|
|||
> tokenizer.add_special_case("don't", case)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `string` | str | The string to specially tokenize. |
|
||||
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
|
||||
| Name | Description |
|
||||
| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | The string to specially tokenize. ~~str~~ |
|
||||
| `token_attrs` | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. ~~Iterable[Dict[int, str]]~~ |
|
||||
|
||||
## Tokenizer.explain {#explain tag="method"}
|
||||
|
||||
|
@ -153,10 +153,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
|
|||
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `string` | str | The string to tokenize with the debugging tokenizer |
|
||||
| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------- |
|
||||
| `string` | The string to tokenize with the debugging tokenizer. ~~str~~ |
|
||||
| **RETURNS** | A list of `(pattern_string, token_string)` tuples. ~~List[Tuple[str, str]]~~ |
|
||||
|
||||
## Tokenizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -169,11 +169,11 @@ Serialize the tokenizer to disk.
|
|||
> tokenizer.to_disk("/path/to/tokenizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Tokenizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -186,12 +186,12 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
|||
> tokenizer.from_disk("/path/to/tokenizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Tokenizer` object. ~~Tokenizer~~ |
|
||||
|
||||
## Tokenizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -204,11 +204,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the tokenizer to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tokenizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Tokenizer` object. ~~bytes~~ |
|
||||
|
||||
## Tokenizer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -223,23 +223,23 @@ it.
|
|||
> tokenizer.from_bytes(tokenizer_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tokenizer` | The `Tokenizer` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Tokenizer` object. ~~Tokenizer~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
|
||||
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
|
||||
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
|
||||
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
|
||||
| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an`re.MatchObject`or`None. |
|
||||
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
|
||||
| Name | Description |
|
||||
| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The vocab object of the parent `Doc`. ~~Vocab~~ |
|
||||
| `prefix_search` | A function to find segment boundaries from the start of a string. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
| `suffix_search` | A function to find segment boundaries from the end of a string. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
| `infix_finditer` | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) sequence of `re.MatchObject` objects. ~~Optional[Callable[[str], Iterator[Match]]]~~ |
|
||||
| `token_match` | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. ~~Optional[Callable[[str], Optional[Match]]]~~ |
|
||||
| `rules` | A dictionary of tokenizer exceptions and special cases. ~~Optional[Dict[str, List[Dict[int, str]]]]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -32,13 +32,13 @@ loaded in via [`Language.from_disk`](/api/language#from_disk).
|
|||
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | str / `Path` | Model to load, i.e. package name or path. |
|
||||
| _keyword-only_ | | |
|
||||
| `disable` | `List[str]` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| `config` <Tag variant="new">3</Tag> | `Dict[str, Any]` / [`Config`](https://thinc.ai/docs/api-config#config) | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. |
|
||||
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `name` | Model to load, i.e. package name or path. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). ~~List[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Optional config overrides, either as nested dict or dict keyed by section value in dot notation, e.g. `"components.name.value"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | A `Language` object with the loaded model. ~~Language~~ |
|
||||
|
||||
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
|
||||
and pipeline components from a model's `meta.json`, initializes the `Language`
|
||||
|
@ -65,10 +65,10 @@ Create a blank model of a given language class. This function is the twin of
|
|||
> nlp_de = spacy.blank("de") # equivalent to German()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
|
||||
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
||||
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. ~~str~~ |
|
||||
| **RETURNS** | An empty `Language` object of the appropriate subclass. ~~Language~~ |
|
||||
|
||||
### spacy.info {#spacy.info tag="function"}
|
||||
|
||||
|
@ -85,12 +85,12 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
|
|||
> markdown = spacy.info(markdown=True, silent=True)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---- | ------------------------------------------------ |
|
||||
| `model` | str | A model, i.e. a package name or path (optional). |
|
||||
| _keyword-only_ | | |
|
||||
| `markdown` | bool | Print information as Markdown. |
|
||||
| `silent` | bool | Don't print anything, just return. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------ |
|
||||
| `model` | A model, i.e. a package name or path (optional). ~~Optional[str]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `markdown` | Print information as Markdown. ~~bool~~ |
|
||||
| `silent` | Don't print anything, just return. ~~bool~~ |
|
||||
|
||||
### spacy.explain {#spacy.explain tag="function"}
|
||||
|
||||
|
@ -111,10 +111,10 @@ list of available terms, see
|
|||
> # world NN noun, singular or mass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------------- |
|
||||
| `term` | str | Term to explain. |
|
||||
| **RETURNS** | str | The explanation, or `None` if not found in the glossary. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------- |
|
||||
| `term` | Term to explain. ~~str~~ |
|
||||
| **RETURNS** | The explanation, or `None` if not found in the glossary. ~~Optional[str]~~ |
|
||||
|
||||
### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
|
||||
|
||||
|
@ -131,9 +131,9 @@ models.
|
|||
> nlp = spacy.load("en_core_web_sm")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------ |
|
||||
| **RETURNS** | bool | Whether the GPU was activated. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------- |
|
||||
| **RETURNS** | Whether the GPU was activated. ~~bool~~ |
|
||||
|
||||
### spacy.require_gpu {#spacy.require_gpu tag="function" new="2.0.14"}
|
||||
|
||||
|
@ -150,9 +150,9 @@ and _before_ loading any models.
|
|||
> nlp = spacy.load("en_core_web_sm")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------- |
|
||||
| **RETURNS** | bool | `True` |
|
||||
| Name | Description |
|
||||
| ----------- | --------------- |
|
||||
| **RETURNS** | `True` ~~bool~~ |
|
||||
|
||||
## displaCy {#displacy source="spacy/displacy"}
|
||||
|
||||
|
@ -175,16 +175,16 @@ browser. Will run a simple web server.
|
|||
> displacy.serve([doc1, doc2], style="dep")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
|
||||
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
|
||||
| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `page` | bool | Render markup as full HTML page. | `True` |
|
||||
| `minify` | bool | Minify HTML markup. | `False` |
|
||||
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
|
||||
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
|
||||
| `port` | int | Port to serve visualization. | `5000` |
|
||||
| `host` | str | Host to serve visualization. | `'0.0.0.0'` |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
||||
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
|
||||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
|
||||
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
|
||||
|
||||
### displacy.render {#displacy.render tag="method" new="2"}
|
||||
|
||||
|
@ -200,16 +200,16 @@ Render a dependency parse tree or named entity visualization.
|
|||
> html = displacy.render(doc, style="dep")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
|
||||
| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `page` | bool | Render markup as full HTML page. | `False` |
|
||||
| `minify` | bool | Minify HTML markup. | `False` |
|
||||
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
|
||||
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
|
||||
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
|
||||
| **RETURNS** | str | Rendered HTML markup. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span]], Doc, Span]~~ |
|
||||
| `style` | Visualization style, `"dep"` or `"ent"`. Defaults to `"dep"`. ~~str~~ |
|
||||
| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
|
||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
|
||||
| **RETURNS** | The rendered HTML markup. ~~str~~ |
|
||||
|
||||
### Visualizer options {#displacy_options}
|
||||
|
||||
|
@ -225,22 +225,22 @@ If a setting is not present in the options, the default value will be used.
|
|||
> displacy.serve(doc, style="dep", options=options)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemma's in a separate row below the token texts. | `False` |
|
||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` |
|
||||
| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` |
|
||||
| `font` | str | Font name or font family for all text. | `'Arial'` |
|
||||
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
|
||||
| `arrow_stroke` | int | Width of arrow path in px. | `2` |
|
||||
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
|
||||
| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
|
||||
| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
|
||||
| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
|
||||
| Name | Description |
|
||||
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
||||
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
|
||||
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ |
|
||||
| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ |
|
||||
| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ |
|
||||
| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ |
|
||||
| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ |
|
||||
| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ |
|
||||
|
||||
#### Named Entity Visualizer options {#displacy_options-ent}
|
||||
|
||||
|
@ -252,11 +252,11 @@ If a setting is not present in the options, the default value will be used.
|
|||
> displacy.serve(doc, style="ent", options=options)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
|
||||
| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
|
||||
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
|
||||
| `template` <Tag variant="new">2.2</Tag> | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
|
||||
| Name | Description |
|
||||
| --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `ents` | Entity types to highlight or `None` for all types (default). ~~Optional[List[str]]~~ |
|
||||
| `colors` | Color overrides. Entity types in uppercase should be mapped to color names or values. ~~Dict[str, str]~~ |
|
||||
| `template` <Tag variant="new">2.2</Tag> | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. See [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) for examples. ~~Optional[str]~~ |
|
||||
|
||||
By default, displaCy comes with colors for all entity types used by
|
||||
[spaCy models](/models). If you're using custom entity types, you can use the
|
||||
|
@ -359,13 +359,13 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
> get_length = null
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `seqs` | `Iterable[Any]` | The sequences to minibatch. |
|
||||
| `size` | `Iterable[int]` / int | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `tolerance` | float | What percentage of the size to allow batches to exceed. |
|
||||
| `discard_oversize` | bool | Whether to discard sequences that by themselves exceed the tolerated size. |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
| Name | Description |
|
||||
| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `seqs` | The sequences to minibatch. ~~Iterable[Any]~~ |
|
||||
| `size` | The target number of words per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
|
||||
| `tolerance` | What percentage of the size to allow batches to exceed. ~~float~~ |
|
||||
| `discard_oversize` | Whether to discard sequences that by themselves exceed the tolerated size. ~~bool~~ |
|
||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||
|
||||
#### batch_by_sequence.v1 {#batch_by_sequence tag="registered function"}
|
||||
|
||||
|
@ -380,10 +380,10 @@ themselves, or be discarded if `discard_oversize` is set to `True`. The argument
|
|||
|
||||
Create a batcher that creates batches of the specified size.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
| Name | Description |
|
||||
| ------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | The target number of items per batch. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
|
||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||
|
||||
#### batch_by_padded.v1 {#batch_by_padded tag="registered function"}
|
||||
|
||||
|
@ -403,12 +403,12 @@ sequences binned by length within a window. The padded size is defined as the
|
|||
maximum length of sequences within the batch multiplied by the number of
|
||||
sequences in the batch.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | `Iterable[int]` / int | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). |
|
||||
| `buffer` | int | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. |
|
||||
| `discard_oversize` | bool | Whether to discard sequences that are by themselves longer than the largest padded batch size. |
|
||||
| `get_length` | `Callable[[Any], int]` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. |
|
||||
| Name | Description |
|
||||
| ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `size` | The largest padded size to batch sequences into. Can also be a block referencing a schedule, e.g. [`compounding`](https://thinc.ai/docs/api-schedules/#compounding). ~~Union[int, Sequence[int]]~~ |
|
||||
| `buffer` | The number of sequences to accumulate before sorting by length. A larger buffer will result in more even sizing, but if the buffer is very large, the iteration order will be less random, which can result in suboptimal training. ~~int~~ |
|
||||
| `discard_oversize` | Whether to discard sequences that are by themselves longer than the largest padded batch size. ~~bool~~ |
|
||||
| `get_length` | Optional function that receives a sequence item and returns its length. Defaults to the built-in `len()` if not set. ~~Optional[Callable[[Any], int]]~~ |
|
||||
|
||||
## Training data and alignment {#gold source="spacy/gold"}
|
||||
|
||||
|
@ -436,11 +436,11 @@ single-token entity.
|
|||
> assert tags == ["O", "O", "U-LOC", "O"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. |
|
||||
| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
|
||||
| **RETURNS** | list | str strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. ~~Doc~~ |
|
||||
| `entities` | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, Union[str, int]]]~~ |
|
||||
| **RETURNS** | A list of strings, describing the [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
|
||||
|
||||
### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
|
||||
|
||||
|
@ -458,11 +458,11 @@ Encode per-token tags following the
|
|||
> assert entities == [(7, 13, "LOC")]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The document that the BILUO tags refer to. |
|
||||
| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. |
|
||||
| **RETURNS** | list | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doc` | The document that the BILUO tags refer to. ~~Doc~~ |
|
||||
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
|
||||
| **RETURNS** | A sequence of `(start, end, label)` triples. `start` and `end` will be character-offset integers denoting the slice into the original string. ~~List[Tuple[int, int, str]]~~ |
|
||||
|
||||
### gold.spans_from_biluo_tags {#spans_from_biluo_tags tag="function" new="2.1"}
|
||||
|
||||
|
@ -481,11 +481,11 @@ token-based tags, e.g. to overwrite the `doc.ents`.
|
|||
> doc.ents = spans_from_biluo_tags(doc, tags)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The document that the BILUO tags refer to. |
|
||||
| `entities` | iterable | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. |
|
||||
| **RETURNS** | list | A sequence of `Span` objects with added entity labels. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `doc` | The document that the BILUO tags refer to. ~~Doc~~ |
|
||||
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
|
||||
| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |
|
||||
|
||||
## Utility functions {#util source="spacy/util.py"}
|
||||
|
||||
|
@ -504,7 +504,8 @@ depends on any of spaCy's utilities.
|
|||
Import and load a `Language` class. Allows lazy-loading
|
||||
[language data](/usage/adding-languages) and importing languages using the
|
||||
two-letter language code. To add a language code for a custom language class,
|
||||
you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
|
||||
you can register it using the [`@registry.languages`](/api/top-level#registry)
|
||||
decorator.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -514,36 +515,14 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
|
|||
> lang = lang_class()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------- | -------------------------------------- |
|
||||
| `lang` | str | Two-letter language code, e.g. `'en'`. |
|
||||
| **RETURNS** | `Language` | Language class. |
|
||||
|
||||
### util.set_lang_class {#util.set_lang_class tag="function"}
|
||||
|
||||
Set a custom `Language` class name that can be loaded via
|
||||
[`get_lang_class`](/api/top-level#util.get_lang_class). If your model uses a
|
||||
custom language, this is required so that spaCy can load the correct class from
|
||||
the two-letter language code.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> from spacy.lang.xy import CustomLanguage
|
||||
>
|
||||
> util.set_lang_class('xy', CustomLanguage)
|
||||
> lang_class = util.get_lang_class('xy')
|
||||
> nlp = lang_class()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------- | -------------------------------------- |
|
||||
| `name` | str | Two-letter language code, e.g. `'en'`. |
|
||||
| `cls` | `Language` | The language class, e.g. `English`. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------- |
|
||||
| `lang` | Two-letter language code, e.g. `"en"`. ~~str~~ |
|
||||
| **RETURNS** | The respective subclass. ~~Language~~ |
|
||||
|
||||
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
|
||||
|
||||
Check whether a `Language` class is already loaded. `Language` classes are
|
||||
Check whether a `Language` subclass is already loaded. `Language` subclasses are
|
||||
loaded lazily, to avoid expensive setup code associated with the language data.
|
||||
|
||||
> #### Example
|
||||
|
@ -554,10 +533,10 @@ loaded lazily, to avoid expensive setup code associated with the language data.
|
|||
> assert util.lang_class_is_loaded("de") is False
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------- |
|
||||
| `name` | str | Two-letter language code, e.g. `'en'`. |
|
||||
| **RETURNS** | bool | Whether the class has been loaded. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------- |
|
||||
| `name` | Two-letter language code, e.g. `"en"`. ~~str~~ |
|
||||
| **RETURNS** | Whether the class has been loaded. ~~bool~~ |
|
||||
|
||||
### util.load_model {#util.load_model tag="function" new="2"}
|
||||
|
||||
|
@ -566,7 +545,7 @@ will assume the model is a Python package and import and call its `load()`
|
|||
method. If called with a path, spaCy will assume it's a data directory, read the
|
||||
language and pipeline settings from the meta.json and initialize a `Language`
|
||||
class. The model data will then be loaded in via
|
||||
[`Language.from_disk()`](/api/language#from_disk).
|
||||
[`Language.from_disk`](/api/language#from_disk).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -576,31 +555,13 @@ class. The model data will then be loaded in via
|
|||
> nlp = util.load_model("/path/to/data")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------- | -------------------------------------------------------- |
|
||||
| `name` | str | Package name or model path. |
|
||||
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
|
||||
| **RETURNS** | `Language` | `Language` class with the loaded model. |
|
||||
|
||||
### util.load_model_from_path {#util.load_model_from_path tag="function" new="2"}
|
||||
|
||||
Load a model from a data directory path. Creates the [`Language`](/api/language)
|
||||
class and pipeline based on the directory's meta.json and then calls
|
||||
[`from_disk()`](/api/language#from_disk) with the path. This function also makes
|
||||
it easy to test a new model that you haven't packaged yet.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> nlp = load_model_from_path("/path/to/data")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `model_path` | str | Path to model data directory. |
|
||||
| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
|
||||
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
|
||||
| **RETURNS** | `Language` | `Language` class with the loaded model. |
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | Package name or model path. ~~str~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
|
||||
### util.load_model_from_init_py {#util.load_model_from_init_py tag="function" new="2"}
|
||||
|
||||
|
@ -616,11 +577,13 @@ A helper function to use in the `load()` method of a model package's
|
|||
> return load_model_from_init_py(__file__, **overrides)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------- | -------------------------------------------------------- |
|
||||
| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. |
|
||||
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
|
||||
| **RETURNS** | `Language` | `Language` class with the loaded model. |
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `init_file` | Path to model's `__init__.py`, i.e. `__file__`. ~~Union[str, Path]~~ |
|
||||
| `vocab` <Tag variant="new">3</Tag> | Optional shared vocab to pass in on initialization. If `True` (default), a new `Vocab` object will be created. ~~Union[Vocab, bool]~~ |
|
||||
| `disable` | Names of pipeline components to disable. ~~Iterable[str]~~ |
|
||||
| `config` <Tag variant="new">3</Tag> | Config overrides as nested dict or flat dict keyed by section values in dot notation, e.g. `"nlp.pipeline"`. ~~Union[Dict[str, Any], Config]~~ |
|
||||
| **RETURNS** | `Language` class with the loaded model. ~~Language~~ |
|
||||
|
||||
### util.get_model_meta {#util.get_model_meta tag="function" new="2"}
|
||||
|
||||
|
@ -632,10 +595,10 @@ Get a model's meta.json from a directory path and validate its contents.
|
|||
> meta = util.get_model_meta("/path/to/model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------ |
|
||||
| `path` | str / `Path` | Path to model directory. |
|
||||
| **RETURNS** | dict | The model's meta data. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------- |
|
||||
| `path` | Path to model directory. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The model's meta data. ~~Dict[str, Any]~~ |
|
||||
|
||||
### util.is_package {#util.is_package tag="function"}
|
||||
|
||||
|
@ -649,10 +612,10 @@ Check if string maps to a package installed via pip. Mainly used to validate
|
|||
> util.is_package("xyz") # False
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | -------------------------------------------- |
|
||||
| `name` | str | Name of package. |
|
||||
| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------- |
|
||||
| `name` | Name of package. ~~str~~ |
|
||||
| **RETURNS** | `True` if installed package, `False` if not. ~~bool~~ |
|
||||
|
||||
### util.get_package_path {#util.get_package_path tag="function" new="2"}
|
||||
|
||||
|
@ -666,10 +629,10 @@ Get path to an installed package. Mainly used to resolve the location of
|
|||
> # /usr/lib/python3.6/site-packages/en_core_web_sm
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------ | -------------------------------- |
|
||||
| `package_name` | str | Name of installed package. |
|
||||
| **RETURNS** | `Path` | Path to model package directory. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------- |
|
||||
| `package_name` | Name of installed package. ~~str~~ |
|
||||
| **RETURNS** | Path to model package directory. ~~Path~~ |
|
||||
|
||||
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
|
||||
|
||||
|
@ -686,9 +649,9 @@ detecting the IPython kernel. Mainly used for the
|
|||
> display(HTML(html))
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------- |
|
||||
| **RETURNS** | bool | `True` if in Jupyter, `False` if not. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------- |
|
||||
| **RETURNS** | `True` if in Jupyter, `False` if not. ~~bool~~ |
|
||||
|
||||
### util.compile_prefix_regex {#util.compile_prefix_regex tag="function"}
|
||||
|
||||
|
@ -702,10 +665,10 @@ Compile a sequence of prefix rules into a regex object.
|
|||
> nlp.tokenizer.prefix_search = prefix_regex.search
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | tuple | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
|
||||
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
|
||||
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
|
||||
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
|
||||
|
||||
|
@ -719,10 +682,10 @@ Compile a sequence of suffix rules into a regex object.
|
|||
> nlp.tokenizer.suffix_search = suffix_regex.search
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | tuple | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
|
||||
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
|
||||
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
|
||||
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
|
||||
|
||||
|
@ -736,10 +699,10 @@ Compile a sequence of infix rules into a regex object.
|
|||
> nlp.tokenizer.infix_finditer = infix_regex.finditer
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | tuple | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). |
|
||||
| **RETURNS** | [regex](https://docs.python.org/3/library/re.html#re-objects) | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](https://github.com/explosion/spaCy/tree/master/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
|
||||
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
|
||||
### util.minibatch {#util.minibatch tag="function" new="2"}
|
||||
|
||||
|
@ -754,11 +717,11 @@ vary on each step.
|
|||
> nlp.update(batch)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | -------------- | ---------------------- |
|
||||
| `items` | iterable | The items to batch up. |
|
||||
| `size` | int / iterable | The batch size(s). |
|
||||
| **YIELDS** | list | The batches. |
|
||||
| Name | Description |
|
||||
| ---------- | ---------------------------------------- |
|
||||
| `items` | The items to batch up. ~~Iterable[Any]~~ |
|
||||
| `size` | The batch size(s). ~~Union[int, Sequence[int]]~~ |
|
||||
| **YIELDS** | The batches. |
|
||||
|
||||
### util.filter_spans {#util.filter_spans tag="function" new="2.1.4"}
|
||||
|
||||
|
@ -776,17 +739,30 @@ of one entity) or when merging spans with
|
|||
> filtered = filter_spans(spans)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | -------------------- |
|
||||
| `spans` | iterable | The spans to filter. |
|
||||
| **RETURNS** | list | The filtered spans. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------------- |
|
||||
| `spans` | The spans to filter. ~~Iterable[Span]~~ |
|
||||
| **RETURNS** | The filtered spans. ~~List[Span]~~ |
|
||||
|
||||
### util.get_words_and_spaces {#get_words_and_spaces tag="function" new="3"}
|
||||
|
||||
<!-- TODO: document -->
|
||||
Given a list of words and a text, reconstruct the original tokens and return a
|
||||
list of words and spaces that can be used to create a [`Doc`](/api/doc#init).
|
||||
This can help recover destructive tokenization that didn't preserve any
|
||||
whitespace information.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------- |
|
||||
| `words` | list | |
|
||||
| `text` | str | |
|
||||
| **RETURNS** | tuple | |
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> orig_words = ["Hey", ",", "what", "'s", "up", "?"]
|
||||
> orig_text = "Hey, what's up?"
|
||||
> words, spaces = get_words_and_spaces(orig_words, orig_text)
|
||||
> # ['Hey', ',', 'what', "'s", 'up', '?']
|
||||
> # [False, True, False, True, False, False]
|
||||
> ```
|
||||
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `words` | The list of words. ~~Iterable[str]~~ |
|
||||
| `text` | The original text. ~~str~~ |
|
||||
| **RETURNS** | A list of words and a list of boolean values indicating whether the word at this position is followed by a space. ~~Tuple[List[str], List[bool]]~~ |
|
||||
|
|
|
@ -60,11 +60,11 @@ architectures and their arguments and hyperparameters.
|
|||
> nlp.add_pipe("transformer", config=DEFAULT_CONFIG)
|
||||
> ```
|
||||
|
||||
| Setting | Type | Description | Default |
|
||||
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------- |
|
||||
| `max_batch_items` | int | Maximum size of a padded batch. | `4096` |
|
||||
| `annotation_setter` | Callable | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. | `null_annotation_setter` |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. | [TransformerModel](/api/architectures#TransformerModel) |
|
||||
| Setting | Description |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `max_batch_items` | Maximum size of a padded batch. Defaults to `4096`. ~~int~~ |
|
||||
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. Defaults to `null_annotation_setter` (no additional annotations). ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Defaults to [TransformerModel](/api/architectures#TransformerModel). ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spacy-transformers/blob/master/spacy_transformers/pipeline_component.py
|
||||
|
@ -101,14 +101,14 @@ attribute. You can also provide a callback to set additional annotations. In
|
|||
your application, you would normally use a shortcut for this and instantiate the
|
||||
component using its string name and [`nlp.add_pipe`](/api/language#add_pipe).
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------- | ------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | [`Model`](https://thinc.ai/docs/api-model) | **Input:** `List[Doc]`. **Output:** [`FullTransformerBatch`](/api/transformer#fulltransformerbatch). The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. |
|
||||
| `annotation_setter` | `Callable` | Function that takes a batch of `Doc` objects and a [`FullTransformerBatch`](/api/transformer#fulltransformerbatch) and can set additional annotations on the `Doc`. The `Doc._.transformer_data` attribute is set prior to calling the callback. By default, no additional annotations are set. |
|
||||
| _keyword-only_ | | |
|
||||
| `name` | str | String name of the component instance. Used to add entries to the `losses` during training. |
|
||||
| `max_batch_items` | int | Maximum size of a padded batch. Defaults to `128*32`. |
|
||||
| Name | Description |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||
| `model` | The Thinc [`Model`](https://thinc.ai/docs/api-model) wrapping the transformer. Usually you will want to use the [TransformerModel](/api/architectures#TransformerModel) layer for this. ~~Model[List[Doc], FullTransformerBatch]~~ |
|
||||
| `annotation_setter` | Function that takes a batch of `Doc` objects and transformer outputs and can set additional annotations on the `Doc`. The `Doc._.trf_data` attribute is set prior to calling the callback. By default, no annotations are set. ~~Callable[[List[Doc], FullTransformerBatch], None]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||
| `max_batch_items` | Maximum size of a padded batch. Defaults to `128*32`. ~~int~~ |
|
||||
|
||||
## Transformer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -128,10 +128,10 @@ to the [`predict`](/api/transformer#predict) and
|
|||
> processed = transformer(doc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------ |
|
||||
| `doc` | `Doc` | The document to process. |
|
||||
| **RETURNS** | `Doc` | The processed document. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------- |
|
||||
| `doc` | The document to process. ~~Doc~~ |
|
||||
| **RETURNS** | The processed document. ~~Doc~~ |
|
||||
|
||||
## Transformer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -150,12 +150,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/transformer#call) and
|
|||
> pass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ----------------------------------------------------- |
|
||||
| `stream` | `Iterable[Doc]` | A stream of documents. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The number of documents to buffer. Defaults to `128`. |
|
||||
| **YIELDS** | `Doc` | The processed documents in order. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------- |
|
||||
| `stream` | A stream of documents. ~~Iterable[Doc]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The number of documents to buffer. Defaults to `128`. ~~int~~ |
|
||||
| **YIELDS** | The processed documents in order. ~~Doc~~ |
|
||||
|
||||
## Transformer.begin_training {#begin_training tag="method"}
|
||||
|
||||
|
@ -175,13 +175,13 @@ setting up the label scheme based on the data.
|
|||
> optimizer = trf.begin_training(lambda: [], pipeline=nlp.pipeline)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------------------------------- | -------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | `Callable[[], Iterable[Example]]` | Optional function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `pipeline` | `List[Tuple[str, Callable]]` | Optional list of pipeline components that this component is part of. |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | An optional optimizer. Will be created via [`create_optimizer`](/api/transformer#create_optimizer) if not set. |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `get_examples` | Function that returns gold-standard annotations in the form of [`Example`](/api/example) objects. ~~Callable[[], Iterable[Example]]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `pipeline` | Optional list of pipeline components that this component is part of. ~~Optional[List[Tuple[str, Callable[[Doc], Doc]]]]~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Transformer.predict {#predict tag="method"}
|
||||
|
||||
|
@ -195,10 +195,10 @@ modifying them.
|
|||
> scores = trf.predict([doc1, doc2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ----------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to predict. |
|
||||
| **RETURNS** | - | The model's prediction for each document. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| `docs` | The documents to predict. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The model's prediction for each document. |
|
||||
|
||||
## Transformer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
|
@ -215,10 +215,10 @@ callback is then called, if provided.
|
|||
> trf.set_annotations(docs, scores)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | --------------- | ----------------------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | The documents to modify. |
|
||||
| `scores` | - | The scores to set, produced by `Transformer.predict`. |
|
||||
| Name | Description |
|
||||
| -------- | ----------------------------------------------------- |
|
||||
| `docs` | The documents to modify. ~~Iterable[Doc]~~ |
|
||||
| `scores` | The scores to set, produced by `Transformer.predict`. |
|
||||
|
||||
## Transformer.update {#update tag="method"}
|
||||
|
||||
|
@ -244,15 +244,15 @@ and call the optimizer, while the others simply increment the gradients.
|
|||
> losses = trf.update(examples, sgd=optimizer)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------- | --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | `Iterable[Example]` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. |
|
||||
| _keyword-only_ | | |
|
||||
| `drop` | float | The dropout rate. |
|
||||
| `set_annotations` | bool | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](/api/transformer#set_annotations). |
|
||||
| `sgd` | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| `losses` | `Dict[str, float]` | Optional record of the loss during training. Updated using the component name as the key. |
|
||||
| **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. |
|
||||
| Name | Description |
|
||||
| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `drop` | The dropout rate. ~~float~~ |
|
||||
| `set_annotations` | Whether or not to update the `Example` objects with the predictions, delegating to [`set_annotations`](#set_annotations). ~~bool~~ |
|
||||
| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ |
|
||||
| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ |
|
||||
| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ |
|
||||
|
||||
## Transformer.create_optimizer {#create_optimizer tag="method"}
|
||||
|
||||
|
@ -265,9 +265,9 @@ Create an optimizer for the pipeline component.
|
|||
> optimizer = trf.create_optimizer()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------------------------------- | -------------- |
|
||||
| **RETURNS** | [`Optimizer`](https://thinc.ai/docs/api-optimizers) | The optimizer. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------- |
|
||||
| **RETURNS** | The optimizer. ~~Optimizer~~ |
|
||||
|
||||
## Transformer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
|
@ -282,9 +282,9 @@ context, the original parameters are restored.
|
|||
> trf.to_disk("/best_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | ----------------------------------------- |
|
||||
| `params` | dict | The parameter values to use in the model. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------- |
|
||||
| `params` | The parameter values to use in the model. ~~dict~~ |
|
||||
|
||||
## Transformer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -297,11 +297,11 @@ Serialize the pipe to disk.
|
|||
> trf.to_disk("/path/to/transformer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Transformer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -314,12 +314,12 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> trf.from_disk("/path/to/transformer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tok2Vec` | The modified `Tok2Vec` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Transformer` object. ~~Transformer~~ |
|
||||
|
||||
## Transformer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -332,11 +332,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
Serialize the pipe to a bytestring.
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Tok2Vec` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Transformer` object. ~~bytes~~ |
|
||||
|
||||
## Transformer.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -350,12 +350,12 @@ Load the pipe from a bytestring. Modifies the object in place and returns it.
|
|||
> trf.from_bytes(trf_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tok2Vec` | The `Tok2Vec` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Transformer` object. ~~Transformer~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
@ -386,20 +386,20 @@ by this class. Instances of this class
|
|||
are typically assigned to the [`Doc._.trf_data`](/api/transformer#custom-attributes)
|
||||
extension attribute.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `tokens` | `Dict` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. |
|
||||
| `tensors` | `List[FloatsXd]` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. |
|
||||
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. |
|
||||
| `width` | int | The width of the last hidden layer. |
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
|
||||
| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
|
||||
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||
| `width` | The width of the last hidden layer. ~~int~~ |
|
||||
|
||||
### TransformerData.empty {#transformerdata-empty tag="classmethod"}
|
||||
|
||||
Create an empty `TransformerData` container.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------- | -------------- |
|
||||
| **RETURNS** | `TransformerData` | The container. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------- |
|
||||
| **RETURNS** | The container. ~~TransformerData~~ |
|
||||
|
||||
## FullTransformerBatch {#fulltransformerbatch tag="dataclass"}
|
||||
|
||||
|
@ -407,13 +407,13 @@ Holds a batch of input and output objects for a transformer model. The data can
|
|||
then be split to a list of [`TransformerData`](/api/transformer#transformerdata)
|
||||
objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | -------------------------------------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `spans` | `List[List[Span]]` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. |
|
||||
| `tokens` | [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) | The output of the tokenizer. |
|
||||
| `tensors` | `List[torch.Tensor]` | The output of the transformer model. |
|
||||
| `align` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. |
|
||||
| `doc_data` | `List[TransformerData]` | The outputs, split per `Doc` object. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
|
||||
| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
|
||||
| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
|
||||
| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||
| `doc_data` | The outputs, split per `Doc` object. ~~List[TransformerData]~~ |
|
||||
|
||||
### FullTransformerBatch.unsplit_by_doc {#fulltransformerbatch-unsplit_by_doc tag="method"}
|
||||
|
||||
|
@ -422,19 +422,19 @@ current object's spans, tokens and alignment. This is used during the backward
|
|||
pass, in order to construct the gradients to pass back into the transformer
|
||||
model.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------- | ------------------------------- |
|
||||
| `arrays` | `List[List[Floats3d]]` | The split batch of activations. |
|
||||
| **RETURNS** | `FullTransformerBatch` | The transformer batch. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------- |
|
||||
| `arrays` | The split batch of activations. ~~List[List[Floats3d]]~~ |
|
||||
| **RETURNS** | The transformer batch. ~~FullTransformerBatch~~ |
|
||||
|
||||
### FullTransformerBatch.split_by_doc {#fulltransformerbatch-split_by_doc tag="method"}
|
||||
|
||||
Split a `TransformerData` object that represents a batch into a list with one
|
||||
`TransformerData` per `Doc`.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------- | ---------------- |
|
||||
| **RETURNS** | `List[TransformerData]` | The split batch. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------ |
|
||||
| **RETURNS** | The split batch. ~~List[TransformerData]~~ |
|
||||
|
||||
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
||||
|
||||
|
@ -460,10 +460,10 @@ decorator.
|
|||
> return get_sent_spans
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | ---------------------------------------- |
|
||||
| `docs` | `Iterable[Doc]` | A batch of `Doc` objects. |
|
||||
| **RETURNS** | `List[List[Span]]` | The spans to process by the transformer. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------- |
|
||||
| `docs` | A batch of `Doc` objects. ~~Iterable[Doc]~~ |
|
||||
| **RETURNS** | The spans to process by the transformer. ~~List[List[Span]]~~ |
|
||||
|
||||
### doc_spans.v1 {#doc_spans tag="registered function"}
|
||||
|
||||
|
@ -510,10 +510,10 @@ than `window` will allow for an overlap, so that some tokens are counted twice.
|
|||
This can be desirable, because it allows all tokens to have both a left and
|
||||
right context.
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---- | ---------------- |
|
||||
| `window` | int | The window size. |
|
||||
| `stride` | int | The stride size. |
|
||||
| Name | Description |
|
||||
| -------- | ------------------------ |
|
||||
| `window` | The window size. ~~int~~ |
|
||||
| `stride` | The stride size. ~~int~~ |
|
||||
|
||||
## Annotation setters {#annotation_setters tag="registered functions" source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/annotation_setters.py"}
|
||||
|
||||
|
@ -534,10 +534,10 @@ You can register custom annotation setters using the
|
|||
> return setter
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---------------------- | ------------------------------------ |
|
||||
| `docs` | `List[Doc]` | A batch of `Doc` objects. |
|
||||
| `trf_data` | `FullTransformerBatch` | The transformers data for the batch. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------- |
|
||||
| `docs` | A batch of `Doc` objects. ~~List[Doc]~~ |
|
||||
| `trf_data` | The transformers data for the batch. ~~FullTransformerBatch~~ |
|
||||
|
||||
The following built-in functions are available:
|
||||
|
||||
|
@ -550,6 +550,6 @@ The following built-in functions are available:
|
|||
The component sets the following
|
||||
[custom extension attributes](/usage/processing-pipeline#custom-components-attributes):
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ----------------------------------------------------- | ---------------------------------------------------- |
|
||||
| `Doc.trf_data` | [`TransformerData`](/api/transformer#transformerdata) | Transformer tokens and outputs for the `Doc` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------ |
|
||||
| `Doc.trf_data` | Transformer tokens and outputs for the `Doc` object. ~~TransformerData~~ |
|
||||
|
|
|
@ -30,13 +30,13 @@ you can add vectors to later.
|
|||
> vectors = Vectors(data=data, keys=keys)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | | |
|
||||
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
|
||||
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
|
||||
| `keys` | iterable | A sequence of keys aligned with the data. |
|
||||
| `name` | str | A name to identify the vectors table. |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
|
||||
| `data` | The vector data. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
|
||||
| `name` | A name to identify the vectors table. ~~str~~ |
|
||||
|
||||
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -51,10 +51,10 @@ raised.
|
|||
> assert cat_vector == nlp.vocab["cat"].vector
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ---------------------------------- | ------------------------------ |
|
||||
| `key` | int | The key to get the vector for. |
|
||||
| returns | `ndarray[ndim=1, dtype='float32']` | The vector for the key. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------- |
|
||||
| `key` | The key to get the vector for. ~~int~~ |
|
||||
| **RETURNS** | The vector for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Vectors.\_\_setitem\_\_ {#setitem tag="method"}
|
||||
|
||||
|
@ -68,10 +68,10 @@ Set a vector for the given key.
|
|||
> nlp.vocab.vectors[cat_id] = vector
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---------------------------------- | ------------------------------ |
|
||||
| `key` | int | The key to set the vector for. |
|
||||
| `vector` | `ndarray[ndim=1, dtype='float32']` | The vector to set. |
|
||||
| Name | Description |
|
||||
| -------- | ----------------------------------------------------------- |
|
||||
| `key` | The key to set the vector for. ~~int~~ |
|
||||
| `vector` | The vector to set. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Vectors.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
|
@ -84,9 +84,9 @@ Iterate over the keys in the table.
|
|||
> print(key, nlp.vocab.strings[key])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | ------------------- |
|
||||
| **YIELDS** | int | A key in the table. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------- |
|
||||
| **YIELDS** | A key in the table. ~~int~~ |
|
||||
|
||||
## Vectors.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -99,9 +99,9 @@ Return the number of vectors in the table.
|
|||
> assert len(vectors) == 3
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------- |
|
||||
| **RETURNS** | int | The number of vectors in the table. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| **RETURNS** | The number of vectors in the table. ~~int~~ |
|
||||
|
||||
## Vectors.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -115,10 +115,10 @@ Check whether a key has been mapped to a vector entry in the table.
|
|||
> assert cat_id in vectors
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------- |
|
||||
| `key` | int | The key to check. |
|
||||
| **RETURNS** | bool | Whether the key has a vector entry. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------- |
|
||||
| `key` | The key to check. ~~int~~ |
|
||||
| **RETURNS** | Whether the key has a vector entry. ~~bool~~ |
|
||||
|
||||
## Vectors.add {#add tag="method"}
|
||||
|
||||
|
@ -138,13 +138,13 @@ mapping separately. If you need to manage the strings, you should use the
|
|||
> nlp.vocab.vectors.add("dog", row=0)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---------------------------------- | ----------------------------------------------------- |
|
||||
| `key` | str / int | The key to add. |
|
||||
| _keyword-only_ | | |
|
||||
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
|
||||
| `row` | int | An optional row number of a vector to map the key to. |
|
||||
| **RETURNS** | int | The row the vector was added to. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------- |
|
||||
| `key` | The key to add. ~~Union[str, int]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `vector` | An optional vector to add for the key. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
| `row` | An optional row number of a vector to map the key to. ~~int~~ |
|
||||
| **RETURNS** | The row the vector was added to. ~~int~~ |
|
||||
|
||||
## Vectors.resize {#resize tag="method"}
|
||||
|
||||
|
@ -160,11 +160,11 @@ These removed items are returned as a list of `(key, row)` tuples.
|
|||
> removed = nlp.vocab.vectors.resize((10000, 300))
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------------- |
|
||||
| `shape` | tuple | A `(rows, dims)` tuple describing the number of rows and dimensions. |
|
||||
| `inplace` | bool | Reallocate the memory. |
|
||||
| **RETURNS** | list | The removed items as a list of `(key, row)` tuples. |
|
||||
| Name | Description |
|
||||
| ----------- | ---------------------------------------------------------------------------------------- |
|
||||
| `shape` | A `(rows, dims)` tuple describing the number of rows and dimensions. ~~Tuple[int, int]~~ |
|
||||
| `inplace` | Reallocate the memory. ~~bool~~ |
|
||||
| **RETURNS** | The removed items as a list of `(key, row)` tuples. ~~List[Tuple[int, int]]~~ |
|
||||
|
||||
## Vectors.keys {#keys tag="method"}
|
||||
|
||||
|
@ -177,9 +177,9 @@ A sequence of the keys in the table.
|
|||
> print(key, nlp.vocab.strings[key])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ----------- |
|
||||
| **RETURNS** | iterable | The keys. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------- |
|
||||
| **RETURNS** | The keys. ~~Iterable[int]~~ |
|
||||
|
||||
## Vectors.values {#values tag="method"}
|
||||
|
||||
|
@ -194,9 +194,9 @@ the length of the vectors table.
|
|||
> print(vector)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---------------------------------- | ---------------------- |
|
||||
| **YIELDS** | `ndarray[ndim=1, dtype='float32']` | A vector in the table. |
|
||||
| Name | Description |
|
||||
| ---------- | --------------------------------------------------------------- |
|
||||
| **YIELDS** | A vector in the table. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Vectors.items {#items tag="method"}
|
||||
|
||||
|
@ -209,9 +209,9 @@ Iterate over `(key, vector)` pairs, in order.
|
|||
> print(key, nlp.vocab.strings[key], vector)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ----- | -------------------------------- |
|
||||
| **YIELDS** | tuple | `(key, vector)` pairs, in order. |
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------------------------------- |
|
||||
| **YIELDS** | `(key, vector)` pairs, in order. ~~Tuple[int, numpy.ndarray[ndim=1, dtype=float32]]~~ |
|
||||
|
||||
## Vectors.find {#find tag="method"}
|
||||
|
||||
|
@ -226,14 +226,14 @@ Look up one or more keys by row, or vice versa.
|
|||
> keys = nlp.vocab.vectors.find(rows=[18, 256, 985])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------------------------------- | ------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | | |
|
||||
| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
|
||||
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
|
||||
| `row` | int | Find the first key that points to the row. Returns int. |
|
||||
| `rows` | iterable | Find the keys that point to the rows. Returns ndarray. |
|
||||
| **RETURNS** | The requested key, keys, row or rows. |
|
||||
| Name | Description |
|
||||
| -------------- | -------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `key` | Find the row that the given key points to. Returns int, `-1` if missing. ~~Union[str, int]~~ |
|
||||
| `keys` | Find rows that the keys point to. Returns `numpy.ndarray`. ~~Iterable[Union[str, int]]~~ |
|
||||
| `row` | Find the first key that points to the row. Returns integer. ~~int~~ |
|
||||
| `rows` | Find the keys that point to the rows. Returns `numpy.ndarray`. ~~Iterable[int]~~ |
|
||||
| **RETURNS** | The requested key, keys, row or rows. ~~Union[int, numpy.ndarray[ndim=1, dtype=float32]]~~ |
|
||||
|
||||
## Vectors.shape {#shape tag="property"}
|
||||
|
||||
|
@ -250,9 +250,9 @@ vector table.
|
|||
> assert dims == 300
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ---------------------- |
|
||||
| **RETURNS** | tuple | A `(rows, dims)` pair. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------ |
|
||||
| **RETURNS** | A `(rows, dims)` pair. ~~Tuple[int, int]~~ |
|
||||
|
||||
## Vectors.size {#size tag="property"}
|
||||
|
||||
|
@ -265,9 +265,9 @@ The vector size, i.e. `rows * dims`.
|
|||
> assert vectors.size == 150000
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------- |
|
||||
| **RETURNS** | int | The vector size. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------ |
|
||||
| **RETURNS** | The vector size. ~~int~~ |
|
||||
|
||||
## Vectors.is_full {#is_full tag="property"}
|
||||
|
||||
|
@ -283,9 +283,9 @@ If a table is full, it can be resized using
|
|||
> assert vectors.is_full
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------- |
|
||||
| **RETURNS** | bool | Whether the vectors table is full. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------- |
|
||||
| **RETURNS** | Whether the vectors table is full. ~~bool~~ |
|
||||
|
||||
## Vectors.n_keys {#n_keys tag="property"}
|
||||
|
||||
|
@ -301,9 +301,9 @@ vectors, they will be counted individually.
|
|||
> assert vectors.n_keys == 0
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------ |
|
||||
| **RETURNS** | int | The number of all keys in the table. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------- |
|
||||
| **RETURNS** | The number of all keys in the table. ~~int~~ |
|
||||
|
||||
## Vectors.most_similar {#most_similar tag="method"}
|
||||
|
||||
|
@ -320,14 +320,14 @@ performed in chunks, to avoid consuming too much memory. You can set the
|
|||
> most_similar = nlp.vocab.vectors.most_similar(queries, n=10)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------- | ------------------------------------------------------------------ |
|
||||
| `queries` | `ndarray` | An array with one or more vectors. |
|
||||
| _keyword-only_ | | |
|
||||
| `batch_size` | int | The batch size to use. Default to `1024`. |
|
||||
| `n` | int | The number of entries to return for each query. Defaults to `1`. |
|
||||
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. |
|
||||
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. |
|
||||
| Name | Description |
|
||||
| -------------- | --------------------------------------------------------------------------- |
|
||||
| `queries` | An array with one or more vectors. ~~numpy.ndarray~~ |
|
||||
| _keyword-only_ | |
|
||||
| `batch_size` | The batch size to use. Defaults to `1024`. ~~int~~ |
|
||||
| `n` | The number of entries to return for each query. Defaults to `1`. ~~int~~ |
|
||||
| `sort` | Whether to sort the entries returned by score. Defaults to `True`. ~~bool~~ |
|
||||
| **RETURNS** | The most similar entries as a `(keys, best_rows, scores)` tuple. ~~Tuple[numpy.ndarray, numpy.ndarray, numpy.ndarray]~~ |
|
||||
|
||||
## Vectors.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -340,9 +340,9 @@ Save the current state to a directory.
|
|||
>
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Description |
|
||||
| ------ | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
|
||||
## Vectors.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -355,10 +355,10 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> vectors.from_disk("/path/to/vectors")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Vectors` | The modified `Vectors` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| **RETURNS** | The modified `Vectors` object. ~~Vectors~~ |
|
||||
|
||||
## Vectors.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -370,9 +370,9 @@ Serialize the current state to a binary string.
|
|||
> vectors_bytes = vectors.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------- |
|
||||
| **RETURNS** | bytes | The serialized form of the `Vectors` object. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------ |
|
||||
| **RETURNS** | The serialized form of the `Vectors` object. ~~bytes~~ |
|
||||
|
||||
## Vectors.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -387,15 +387,15 @@ Load state from a binary string.
|
|||
> new_vectors.from_bytes(vectors_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------- | ---------------------- |
|
||||
| `data` | bytes | The data to load from. |
|
||||
| **RETURNS** | `Vectors` | The `Vectors` object. |
|
||||
| Name | Description |
|
||||
| ----------- | --------------------------------- |
|
||||
| `data` | The data to load from. ~~bytes~~ |
|
||||
| **RETURNS** | The `Vectors` object. ~~Vectors~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------------------------- | ------------------------------------------------------------------------------- |
|
||||
| `data` | `ndarray[ndim=1, dtype='float32']` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. |
|
||||
| `key2row` | dict | Dictionary mapping word hashes to rows in the `Vectors.data` table. |
|
||||
| `keys` | `ndarray[ndim=1, dtype='float32']` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. |
|
||||
| Name | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `data` | Stored vectors data. `numpy` is used for CPU vectors, `cupy` for GPU vectors. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
|
||||
| `key2row` | Dictionary mapping word hashes to rows in the `Vectors.data` table. ~~Dict[int, int]~~ |
|
||||
| `keys` | Array keeping the keys in order, such that `keys[vectors.key2row[key]] == key`. ~~Union[numpy.ndarray[ndim=1, dtype=float32], cupy.ndarray[ndim=1, dtype=float32]]~~ |
|
||||
|
|
|
@ -21,14 +21,15 @@ Create the vocabulary.
|
|||
> vocab = Vocab(strings=["hello", "world"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------------- | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lex_attr_getters` | dict | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. |
|
||||
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
|
||||
| `lookups` | `Lookups` | A [`Lookups`](/api/lookups) that stores the `lemma_\*`, `lexeme_norm` and other large lookup tables. Defaults to `None`. |
|
||||
| `lookups_extra` <Tag variant="new">2.3</Tag> | `Lookups` | A [`Lookups`](/api/lookups) that stores the optional `lexeme_cluster`/`lexeme_prob`/`lexeme_sentiment`/`lexeme_settings` lookup tables. Defaults to `None`. |
|
||||
| `oov_prob` | float | The default OOV probability. Defaults to `-20.0`. |
|
||||
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
|
||||
| Name | Description |
|
||||
| ------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lex_attr_getters` | A dictionary mapping attribute IDs to functions to compute them. Defaults to `None`. ~~Optional[Dict[str, Callable[[str], Any]]]~~ |
|
||||
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
|
||||
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
|
||||
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
|
||||
| `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ |
|
||||
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
|
||||
| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span]], Iterator[Span]]]~~ |
|
||||
|
||||
## Vocab.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -41,9 +42,9 @@ Get the current number of lexemes in the vocabulary.
|
|||
> assert len(nlp.vocab) > 0
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------------- |
|
||||
| **RETURNS** | int | The number of lexemes in the vocabulary. |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------ |
|
||||
| **RETURNS** | The number of lexemes in the vocabulary. ~~int~~ |
|
||||
|
||||
## Vocab.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
||||
|
@ -57,10 +58,10 @@ given, a new lexeme is created and stored.
|
|||
> assert nlp.vocab[apple] == nlp.vocab["apple"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------- | ---------------------------------------- |
|
||||
| `id_or_string` | int / str | The hash value of a word, or its string. |
|
||||
| **RETURNS** | `Lexeme` | The lexeme indicated by the given ID. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------ |
|
||||
| `id_or_string` | The hash value of a word, or its string. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | The lexeme indicated by the given ID. ~~Lexeme~~ |
|
||||
|
||||
## Vocab.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
|
@ -72,9 +73,9 @@ Iterate over the lexemes in the vocabulary.
|
|||
> stop_words = (lex for lex in nlp.vocab if lex.is_stop)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | -------- | --------------------------- |
|
||||
| **YIELDS** | `Lexeme` | An entry in the vocabulary. |
|
||||
| Name | Description |
|
||||
| ---------- | -------------------------------------- |
|
||||
| **YIELDS** | An entry in the vocabulary. ~~Lexeme~~ |
|
||||
|
||||
## Vocab.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -91,10 +92,10 @@ given string, you need to look it up in
|
|||
> assert oov not in nlp.vocab
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------- |
|
||||
| `string` | str | The ID string. |
|
||||
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------- |
|
||||
| `string` | The ID string. ~~str~~ |
|
||||
| **RETURNS** | Whether the string has an entry in the vocabulary. ~~bool~~ |
|
||||
|
||||
## Vocab.add_flag {#add_flag tag="method"}
|
||||
|
||||
|
@ -115,11 +116,11 @@ using `token.check_flag(flag_id)`.
|
|||
> assert doc[2].check_flag(MY_PRODUCT) == True
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value. |
|
||||
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
|
||||
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
|
||||
| Name | Description |
|
||||
| ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `flag_getter` | A function that takes the lexeme text and returns the boolean flag value. ~~Callable[[str], bool]~~ |
|
||||
| `flag_id` | An integer between `1` and `63` (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. ~~int~~ |
|
||||
| **RETURNS** | The integer ID by which the flag value can be checked. ~~int~~ |
|
||||
|
||||
## Vocab.reset_vectors {#reset_vectors tag="method" new="2"}
|
||||
|
||||
|
@ -133,11 +134,11 @@ have to call this to change the size of the vectors. Only one of the `width` and
|
|||
> nlp.vocab.reset_vectors(width=300)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ---- | -------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `width` | int | The new width (keyword argument only). |
|
||||
| `shape` | int | The new shape (keyword argument only). |
|
||||
| Name | Description |
|
||||
| -------------- | ---------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `width` | The new width. ~~int~~ |
|
||||
| `shape` | The new shape. ~~int~~ |
|
||||
|
||||
## Vocab.prune_vectors {#prune_vectors tag="method" new="2"}
|
||||
|
||||
|
@ -158,11 +159,11 @@ cosines are calculated in minibatches, to reduce memory usage.
|
|||
> assert len(nlp.vocab.vectors) <= 1000
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ---- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nr_row` | int | The number of rows to keep in the vector table. |
|
||||
| `batch_size` | int | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. |
|
||||
| **RETURNS** | dict | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. |
|
||||
| Name | Description |
|
||||
| ------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nr_row` | The number of rows to keep in the vector table. ~~int~~ |
|
||||
| `batch_size` | Batch of vectors for calculating the similarities. Larger batch sizes might be faster, while temporarily requiring more memory. ~~int~~ |
|
||||
| **RETURNS** | A dictionary keyed by removed words mapped to `(string, score)` tuples, where `string` is the entry the removed word was mapped to, and `score` the similarity score between the two words. ~~Dict[str, Tuple[str, float]]~~ |
|
||||
|
||||
## Vocab.get_vector {#get_vector tag="method" new="2"}
|
||||
|
||||
|
@ -178,12 +179,12 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
|
|||
> nlp.vocab.get_vector("apple", minn=1, maxn=5)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------------------- | ---------------------------------------- | ---------------------------------------------------------------------------------------------- |
|
||||
| `orth` | int / str | The hash value of a word, or its unicode string. |
|
||||
| `minn` <Tag variant="new">2.1</Tag> | int | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
|
||||
| `maxn` <Tag variant="new">2.1</Tag> | int | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. |
|
||||
| **RETURNS** | `numpy.ndarray[ndim=1, dtype='float32']` | A word vector. Size and shape are determined by the `Vocab.vectors` instance. |
|
||||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
|
||||
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
|
||||
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
|
||||
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
|
||||
| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Vocab.set_vector {#set_vector tag="method" new="2"}
|
||||
|
||||
|
@ -196,10 +197,10 @@ or hash value.
|
|||
> nlp.vocab.set_vector("apple", array([...]))
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ---------------------------------------- | ------------------------------------------------ |
|
||||
| `orth` | int / str | The hash value of a word, or its unicode string. |
|
||||
| `vector` | `numpy.ndarray[ndim=1, dtype='float32']` | The vector to set. |
|
||||
| Name | Description |
|
||||
| -------- | -------------------------------------------------------------------- |
|
||||
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
|
||||
| `vector` | The vector to set. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Vocab.has_vector {#has_vector tag="method" new="2"}
|
||||
|
||||
|
@ -213,10 +214,10 @@ Words can be looked up by string or hash value.
|
|||
> vector = nlp.vocab.get_vector("apple")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------- | ------------------------------------------------ |
|
||||
| `orth` | int / str | The hash value of a word, or its unicode string. |
|
||||
| **RETURNS** | bool | Whether the word has a vector. |
|
||||
| Name | Description |
|
||||
| ----------- | -------------------------------------------------------------------- |
|
||||
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
|
||||
| **RETURNS** | Whether the word has a vector. ~~bool~~ |
|
||||
|
||||
## Vocab.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
|
@ -228,11 +229,11 @@ Save the current state to a directory.
|
|||
> nlp.vocab.to_disk("/path/to/vocab")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
|
||||
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -245,12 +246,12 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `path` | A path to a directory. Paths may be either strings or `Path`-like objects. ~~Union[str, Path]~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The modified `Vocab` object. ~~Vocab~~ |
|
||||
|
||||
## Vocab.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -262,11 +263,11 @@ Serialize the current state to a binary string.
|
|||
> vocab_bytes = nlp.vocab.to_bytes()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | bytes | The serialized form of the `Vocab` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The serialized form of the `Vocab` object. ~~bytes~~ |
|
||||
|
||||
## Vocab.from_bytes {#from_bytes tag="method"}
|
||||
|
||||
|
@ -281,12 +282,12 @@ Load state from a binary string.
|
|||
> vocab.from_bytes(vocab_bytes)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------- | ------------------------------------------------------------------------- |
|
||||
| `bytes_data` | bytes | The data to load from. |
|
||||
| _keyword-only_ | | |
|
||||
| `exclude` | `Iterable[str]` | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Vocab` | The `Vocab` object. |
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `bytes_data` | The data to load from. ~~bytes~~ |
|
||||
| _keyword-only_ | |
|
||||
| `exclude` | String names of [serialization fields](#serialization-fields) to exclude. ~~Iterable[str]~~ |
|
||||
| **RETURNS** | The `Vocab` object. ~~Vocab~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
|
@ -299,13 +300,13 @@ Load state from a binary string.
|
|||
> assert type(PERSON) == int
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------------- | ------------- | ------------------------------------------------------------ |
|
||||
| `strings` | `StringStore` | A table managing the string-to-int mapping. |
|
||||
| `vectors` <Tag variant="new">2</Tag> | `Vectors` | A table associating word IDs to word vectors. |
|
||||
| `vectors_length` | int | Number of dimensions for each word vector. |
|
||||
| `lookups` | `Lookups` | The available lookup tables in this vocab. |
|
||||
| `writing_system` <Tag variant="new">2.1</Tag> | dict | A dict with information about the language's writing system. |
|
||||
| Name | Description |
|
||||
| --------------------------------------------- | ------------------------------------------------------------------------------- |
|
||||
| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
|
||||
| `vectors` <Tag variant="new">2</Tag> | A table associating word IDs to word vectors. ~~Vectors~~ |
|
||||
| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
|
||||
| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
|
||||
| `writing_system` <Tag variant="new">2.1</Tag> | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -74,13 +74,13 @@ operates on a `Doc` and gives you access to the matched tokens **in context**.
|
|||
### Other classes {#architecture-other}
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| ------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| [`Vocab`](/api/vocab) | The shared vocabulary that stores strings and gives you access to [`Lexeme`](/api/lexeme) objects. |
|
||||
| [`StringStore`](/api/stringstore) | Map strings to and from hash values. |
|
||||
| [`Vectors`](/api/vectors) | Container class for vector data keyed by string. |
|
||||
| [`Lookups`](/api/lookups) | Container for convenient access to large lookup tables and dictionaries. |
|
||||
| [`Morphology`](/api/morphology) | Assign linguistic features like lemmas, noun case, verb tense etc. based on the word and its part-of-speech tag. |
|
||||
| [`MorphAnalysis`](/api/morphanalysis) | A morphological analysis. |
|
||||
| [`MorphAnalysis`](/api/morphology#morphanalysis) | A morphological analysis. |
|
||||
| [`KnowledgeBase`](/api/kb) | Storage for entities and aliases of a knowledge base for entity linking. |
|
||||
| [`Scorer`](/api/scorer) | Compute evaluation scores. |
|
||||
| [`Corpus`](/api/corpus) | Class for managing annotated corpora for training and evaluation data. |
|
||||
|
|
|
@ -980,7 +980,7 @@ nlp.tokenizer = my_tokenizer
|
|||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ----------------- | ------------------------- |
|
||||
| `text` | str | The raw text to tokenize. |
|
||||
| `text` | `str` | The raw text to tokenize. |
|
||||
| **RETURNS** | [`Doc`](/api/doc) | The tokenized document. |
|
||||
|
||||
#### Example 1: Basic whitespace tokenizer {#custom-tokenizer-example}
|
||||
|
|
|
@ -139,25 +139,25 @@ $ pip install https://github.com/honnibal/pkuseg-python/archive/master.zip
|
|||
The `meta` argument of the `Chinese` language class supports the following
|
||||
tokenizer config settings:
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------------ | ---- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `segmenter` | str | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. |
|
||||
| `pkuseg_model` | str | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. |
|
||||
| `pkuseg_user_dict` | str | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. |
|
||||
| Name | Description |
|
||||
| ------------------ | --------------------------------------------------------------------------------------------------------------- |
|
||||
| `segmenter` | Word segmenter: `char`, `jieba` or `pkuseg`. Defaults to `char`. ~~str~~ |
|
||||
| `pkuseg_model` | **Required for `pkuseg`:** Name of a model provided by `pkuseg` or the path to a local model directory. ~~str~~ |
|
||||
| `pkuseg_user_dict` | Optional path to a file with one word per line which overrides the default `pkuseg` user dictionary. ~~str~~ |
|
||||
|
||||
```python
|
||||
### Examples
|
||||
# Load "default" model
|
||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default"}
|
||||
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
nlp = Chinese(config={"tokenizer": {"config": cfg}})
|
||||
|
||||
# Load local model
|
||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "/path/to/pkuseg_model"}
|
||||
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
nlp = Chinese(config={"tokenizer": {"config": cfg}})
|
||||
|
||||
# Override the user directory
|
||||
cfg = {"segmenter": "pkuseg", "pkuseg_model": "default", "pkuseg_user_dict": "/path"}
|
||||
nlp = Chinese(meta={"tokenizer": {"config": cfg}})
|
||||
nlp = Chinese(config={"tokenizer": {"config": cfg}})
|
||||
```
|
||||
|
||||
You can also modify the user dictionary on-the-fly:
|
||||
|
|
|
@ -478,9 +478,9 @@ only being able to modify it afterwards.
|
|||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object processed by the previous component. |
|
||||
| **RETURNS** | `Doc` | The `Doc` object processed by this pipeline component. |
|
||||
| ----------- | ----------------- | ------------------------------------------------------ |
|
||||
| `doc` | [`Doc`](/api/doc) | The `Doc` object processed by the previous component. |
|
||||
| **RETURNS** | [`Doc`](/api/doc) | The `Doc` object processed by this pipeline component. |
|
||||
|
||||
The [`@Language.component`](/api/language#component) decorator lets you turn a
|
||||
simple function into a pipeline component. It takes at least one argument, the
|
||||
|
@ -502,12 +502,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
|
|||
> nlp.add_pipe("my_component", before="parser")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | --------- | ------------------------------------------------------------------------ |
|
||||
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
|
||||
| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
|
||||
| `before` | str / int | String name or index to add the new component **before**. |
|
||||
| `after` | str / int | String name or index to add the new component **after**. |
|
||||
| Argument | Description |
|
||||
| -------- | --------------------------------------------------------------------------------- |
|
||||
| `last` | If set to `True`, component is added **last** in the pipeline (default). ~~bool~~ |
|
||||
| `first` | If set to `True`, component is added **first** in the pipeline. ~~bool~~ |
|
||||
| `before` | String name or index to add the new component **before**. ~~Union[str, int]~~ |
|
||||
| `after` | String name or index to add the new component **after**. ~~Union[str, int]~~ |
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
|
@ -626,10 +626,10 @@ added to the pipeline:
|
|||
> return MyComponent()
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | --------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | [`Language`](/api/language) | The current `nlp` object. Can be used to access the |
|
||||
| `name` | str | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. |
|
||||
| Argument | Description |
|
||||
| -------- | --------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nlp` | The current `nlp` object. Can be used to access the shared vocab. ~~Language~~ |
|
||||
| `name` | The **instance name** of the component in the pipeline. This lets you identify different instances of the same component. ~~str~~ |
|
||||
|
||||
All other settings can be passed in by the user via the `config` argument on
|
||||
[`nlp.add_pipe`](/api/language). The
|
||||
|
@ -1332,12 +1332,11 @@ function that takes a `Doc`, modifies it and returns it.
|
|||
- If you're looking to publish a model that depends on a custom pipeline
|
||||
component, you can either **require it** in the model package's dependencies,
|
||||
or – if the component is specific and lightweight – choose to **ship it with
|
||||
your model package** and add it to the `Language` instance returned by the
|
||||
model's `load()` method. For examples of this, check out the implementations
|
||||
of spaCy's
|
||||
[`load_model_from_init_py`](/api/top-level#util.load_model_from_init_py)
|
||||
[`load_model_from_path`](/api/top-level#util.load_model_from_path) utility
|
||||
functions.
|
||||
your model package**. Just make sure the
|
||||
[`@Language.component`](/api/language#component) or
|
||||
[`@Language.factory`](/api/language#factory) decorator that registers the
|
||||
custom component runs in your model's `__init__.py` or is exposed via an
|
||||
[entry point](/usage/saving-loading#entry-points).
|
||||
|
||||
- Once you're ready to share your extension with others, make sure to **add docs
|
||||
and installation instructions** (you can always link to this page for more
|
||||
|
|
|
@ -157,19 +157,20 @@ The available token pattern keys correspond to a number of
|
|||
[`Token` attributes](/api/token#attributes). The supported attributes for
|
||||
rule-based matching are:
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `ORTH` | str | The exact verbatim text of a token. |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
|
||||
| `LOWER` | str | The lowercase form of the token text. |
|
||||
| `LENGTH` | int | The length of the token text. |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
||||
| `ENT_TYPE` | str | The token's entity label. |
|
||||
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Attribute | Description |
|
||||
| -------------------------------------- | ------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `ORTH` | The exact verbatim text of a token. ~~str~~ |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | The exact verbatim text of a token. ~~str~~ |
|
||||
| `LOWER` | The lowercase form of the token text. ~~str~~ |
|
||||
| `LENGTH` | The length of the token text. ~~int~~ |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | Token text consists of alphabetic characters, ASCII characters, digits. ~~bool~~ |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | Token text is in lowercase, uppercase, titlecase. ~~bool~~ |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | Token is punctuation, whitespace, stop word. ~~bool~~ |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | Token text resembles a number, URL, email. ~~bool~~ |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. ~~str~~ |
|
||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||
| `_` <Tag variant="new">2.1</Tag> | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). ~~Dict[str, Any]~~ |
|
||||
| `OP` | [Operator or quantifier](#quantifiers) to determine how often to match a token pattern. ~~str~~ |
|
||||
|
||||
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
|
||||
|
||||
|
@ -231,11 +232,11 @@ following rich comparison attributes are available:
|
|||
> pattern2 = [{"LENGTH": {">=": 10}}]
|
||||
> ```
|
||||
|
||||
| Attribute | Value Type | Description |
|
||||
| -------------------------- | ---------- | --------------------------------------------------------------------------------- |
|
||||
| `IN` | any | Attribute value is member of a list. |
|
||||
| `NOT_IN` | any | Attribute value is _not_ member of a list. |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | int, float | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. |
|
||||
| Attribute | Description |
|
||||
| -------------------------- | ------------------------------------------------------------------------------------------------------- |
|
||||
| `IN` | Attribute value is member of a list. ~~Any~~ |
|
||||
| `NOT_IN` | Attribute value is _not_ member of a list. ~~Any~~ |
|
||||
| `==`, `>=`, `<=`, `>`, `<` | Attribute value is equal, greater or equal, smaller or equal, greater or smaller. ~~Union[int, float]~~ |
|
||||
|
||||
#### Regular expressions {#regex new="2.1"}
|
||||
|
||||
|
@ -485,12 +486,12 @@ This allows you to write callbacks that consider the entire set of matched
|
|||
phrases, so that you can resolve overlaps and other conflicts in whatever way
|
||||
you prefer.
|
||||
|
||||
| Argument | Type | Description |
|
||||
| --------- | --------- | -------------------------------------------------------------------------------------------------------------------- |
|
||||
| `matcher` | `Matcher` | The matcher instance. |
|
||||
| `doc` | `Doc` | The document the matcher was used on. |
|
||||
| `i` | int | Index of the current match (`matches[i`]). |
|
||||
| `matches` | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. |
|
||||
| Argument | Description |
|
||||
| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `matcher` | The matcher instance. ~~Matcher~~ |
|
||||
| `doc` | The document the matcher was used on. ~~Doc~~ |
|
||||
| `i` | Index of the current match (`matches[i]`). ~~int~~ |
|
||||
| `matches` | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end]`. ~~List[Tuple[int, int, int]]~~ |
|
||||
|
||||
### Using custom pipeline components {#matcher-pipeline}
|
||||
|
||||
|
|
|
@ -10,45 +10,48 @@ next: /usage/training
|
|||
|
||||
## Installation {#install hidden="true"}
|
||||
|
||||
Transformers are a family of neural network architectures that compute dense,
|
||||
context-sensitive representations for the tokens in your documents. Downstream
|
||||
Transformers are a family of neural network architectures that compute **dense,
|
||||
context-sensitive representations** for the tokens in your documents. Downstream
|
||||
models in your pipeline can then use these representations as input features to
|
||||
improve their predictions. You can connect multiple components to a single
|
||||
**improve their predictions**. You can connect multiple components to a single
|
||||
transformer model, with any or all of those components giving feedback to the
|
||||
transformer to fine-tune it to your tasks. spaCy's transformer support
|
||||
interoperates with PyTorch and the [Huggingface transformers](https://huggingface.co/transformers/)
|
||||
library, giving you access to thousands of pretrained models for your pipelines.
|
||||
There are many [great guides](http://jalammar.github.io/illustrated-transformer/)
|
||||
to transformer models, but for practical purposes, you can simply think of them
|
||||
as a drop-in replacement that let you achieve higher accuracy in exchange for
|
||||
higher training and runtime costs.
|
||||
interoperates with [PyTorch](https://pytorch.org) and the
|
||||
[HuggingFace `transformers`](https://huggingface.co/transformers/) library,
|
||||
giving you access to thousands of pretrained models for your pipelines. There
|
||||
are many [great guides](http://jalammar.github.io/illustrated-transformer/) to
|
||||
transformer models, but for practical purposes, you can simply think of them as
|
||||
a drop-in replacement that lets you achieve **higher accuracy** in exchange for
|
||||
**higher training and runtime costs**.
|
||||
|
||||
## System requirements
|
||||
### System requirements
|
||||
|
||||
We recommend an NVIDIA GPU with at least 10GB of memory in order to work with
|
||||
transformer models. The exact requirements will depend on the transformer
|
||||
model you choose and whether you're training the pipeline or simply running it.
|
||||
Training a transformer-based model without a GPU will be too slow for most
|
||||
practical purposes. You'll also need to make sure your GPU drivers are up-to-date
|
||||
and v9+ of the CUDA runtime is installed.
|
||||
practical purposes. You'll also need to make sure your GPU drivers are
|
||||
up-to-date and v9+ of the CUDA runtime is installed.
|
||||
|
||||
Once you have CUDA installed, you'll need to install two pip packages, `cupy`
|
||||
and `spacy-transformers`. [CuPy](https://docs.cupy.dev/en/stable/install.html)
|
||||
Once you have CUDA installed, you'll need to install two pip packages,
|
||||
[`cupy`](https://docs.cupy.dev/en/stable/install.html) and
|
||||
[`spacy-transformers`](https://github.com/explosion/spacy-transformers). `cupy`
|
||||
is just like `numpy`, but for GPU. The best way to install it is to choose a
|
||||
wheel that matches the version of CUDA you're using. You may also need to set the
|
||||
`CUDA_PATH` environment variable if your CUDA runtime is installed in
|
||||
a non-standard location. Putting it all together, if you had installed CUDA 10.2
|
||||
wheel that matches the version of CUDA you're using. You may also need to set
|
||||
the `CUDA_PATH` environment variable if your CUDA runtime is installed in a
|
||||
non-standard location. Putting it all together, if you had installed CUDA 10.2
|
||||
in `/opt/nvidia/cuda`, you would run:
|
||||
|
||||
```
|
||||
```bash
|
||||
### Installation with CUDA
|
||||
export CUDA_PATH="/opt/nvidia/cuda"
|
||||
pip install cupy-cuda102
|
||||
pip install spacy-transformers
|
||||
```
|
||||
|
||||
Provisioning a new machine will require about 5GB of data to be downloaded in total:
|
||||
3GB for the CUDA runtime, 800MB for PyTorch, 400MB for CuPy, 500MB for the transformer
|
||||
weights, and about 200MB for spaCy and its various requirements.
|
||||
Provisioning a new machine will require about 5GB of data to be downloaded in
|
||||
total: 3GB for the CUDA runtime, 800MB for PyTorch, 400MB for CuPy, 500MB for
|
||||
the transformer weights, and about 200MB for spaCy and its various requirements.
|
||||
|
||||
## Runtime usage {#runtime}
|
||||
|
||||
|
@ -237,23 +240,22 @@ The [`Transformer`](/api/transformer) component expects a Thinc
|
|||
[`Model`](https://thinc.ai/docs/api-model) object to be passed in as its `model`
|
||||
argument. You're not limited to the implementation provided by
|
||||
`spacy-transformers` – the only requirement is that your registered function
|
||||
must return an object of type `Model[List[Doc], FullTransformerBatch]`: that is,
|
||||
a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
|
||||
must return an object of type ~~Model[List[Doc], FullTransformerBatch]~~: that
|
||||
is, a Thinc model that takes a list of [`Doc`](/api/doc) objects, and returns a
|
||||
[`FullTransformerBatch`](/api/transformer#fulltransformerbatch) object with the
|
||||
transformer data.
|
||||
|
||||
> #### Model type annotations
|
||||
>
|
||||
> In the documentation and code base, you may come across type annotations and
|
||||
> descriptions of [Thinc](https://thinc.ai) model types, like
|
||||
> `Model[List[Doc], List[Floats2d]]`. This so-called generic type describes the
|
||||
> layer and its input and output type – in this case, it takes a list of `Doc`
|
||||
> objects as the input and list of 2-dimensional arrays of floats as the output.
|
||||
> You can read more about defining Thinc
|
||||
> models [here](https://thinc.ai/docs/usage-models). Also see the
|
||||
> [type checking](https://thinc.ai/docs/usage-type-checking) for how to enable
|
||||
> linting in your editor to see live feedback if your inputs and outputs don't
|
||||
> match.
|
||||
> descriptions of [Thinc](https://thinc.ai) model types, like ~~Model[List[Doc],
|
||||
> List[Floats2d]]~~. This so-called generic type describes the layer and its
|
||||
> input and output type – in this case, it takes a list of `Doc` objects as the
|
||||
> input and a list of 2-dimensional arrays of floats as the output. You can read
|
||||
> more about defining Thinc models [here](https://thinc.ai/docs/usage-models).
|
||||
> Also see the [type checking](https://thinc.ai/docs/usage-type-checking) guide for
|
||||
> how to enable linting in your editor to see live feedback if your inputs and
|
||||
> outputs don't match.
|
||||
|
||||
The same idea applies to task models that power the **downstream components**.
|
||||
Most of spaCy's built-in model creation functions support a `tok2vec` argument,
|
||||
|
@ -288,7 +290,7 @@ The [Tok2VecListener](/api/architectures#Tok2VecListener) layer expects a
|
|||
determines how the vector for each spaCy token will be computed from the zero or
|
||||
more source rows the token is aligned against. Here we use the
|
||||
[`reduce_mean`](https://thinc.ai/docs/api-layers#reduce_mean) layer, which
|
||||
averages the wordpiece rows. We could instead use `reduce_last`,
|
||||
averages the wordpiece rows. We could instead use
|
||||
[`reduce_max`](https://thinc.ai/docs/api-layers#reduce_max), or a custom
|
||||
function you write yourself.
|
||||
|
||||
|
|
|
@ -231,6 +231,7 @@ on them.
|
|||
| `Language.tagger`, `Language.parser`, `Language.entity` | [`Language.get_pipe`](/api/language#get_pipe) |
|
||||
| keyword-arguments like `vocab=False` on `to_disk`, `from_disk`, `to_bytes`, `from_bytes` | `exclude=["vocab"]` |
|
||||
| `n_threads` argument on [`Tokenizer`](/api/tokenizer), [`Matcher`](/api/matcher), [`PhraseMatcher`](/api/phrasematcher) | `n_process` |
|
||||
| `verbose` argument on [`Language.evaluate`](/api/language#evaluate) | logging |
|
||||
| `SentenceSegmenter` hook, `SimilarityHook` | [user hooks](/usage/processing-pipelines#custom-components-user-hooks), [`Sentencizer`](/api/sentencizer), [`SentenceRecognizer`](/api/sentencerecognizer) |
|
||||
|
||||
## Migrating from v2.x {#migrating}
|
||||
|
|
|
@ -58,12 +58,12 @@ arcs.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Argument | Type | Description | Default |
|
||||
| --------- | ---- | ----------------------------------------------------------- | ----------- |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` |
|
||||
| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` |
|
||||
| `font` | str | Font name or font family for all text. | `"Arial"` |
|
||||
| Argument | Description |
|
||||
| --------- | ----------------------------------------------------------------------------------------- |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ |
|
||||
| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ |
|
||||
| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ |
|
||||
|
||||
For a list of all available options, see the
|
||||
[`displacy` API documentation](/api/top-level#displacy_options).
|
||||
|
@ -121,10 +121,10 @@ import DisplacyEntHtml from 'images/displacy-ent2.html'
|
|||
|
||||
The entity visualizer lets you customize the following `options`:
|
||||
|
||||
| Argument | Type | Description | Default |
|
||||
| -------- | ---- | ------------------------------------------------------------------------------------- | ------- |
|
||||
| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
|
||||
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
|
||||
| Argument | Description |
|
||||
| -------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `ents` | Entity types to highlight (`None` for all types). Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `colors` | Color overrides. Entity types in uppercase should be mapped to color names or values. Defaults to `{}`. ~~Dict[str, str]~~ |
|
||||
|
||||
If you specify a list of `ents`, only those entity types will be rendered – for
|
||||
example, you can choose to display `PERSON` entities. Internally, the visualizer
|
||||
|
|
|
@ -113,7 +113,6 @@
|
|||
{ "text": "Vectors", "url": "/api/vectors" },
|
||||
{ "text": "Lookups", "url": "/api/lookups" },
|
||||
{ "text": "Morphology", "url": "/api/morphology" },
|
||||
{ "text": "MorphAnalysis", "url": "/api/morphanalysis" },
|
||||
{ "text": "KnowledgeBase", "url": "/api/kb" },
|
||||
{ "text": "Scorer", "url": "/api/scorer" },
|
||||
{ "text": "Corpus", "url": "/api/corpus" }
|
||||
|
|
43
website/meta/type-annotations.json
Normal file
43
website/meta/type-annotations.json
Normal file
|
@ -0,0 +1,43 @@
|
|||
{
|
||||
"Doc": "/api/doc",
|
||||
"Token": "/api/token",
|
||||
"Span": "/api/span",
|
||||
"Lexeme": "/api/lexeme",
|
||||
"Example": "/api/example",
|
||||
"Alignment": "/api/example#alignment-object",
|
||||
"Vocab": "/api/vocab",
|
||||
"StringStore": "/api/stringstore",
|
||||
"Lookups": "/api/lookups",
|
||||
"Table": "/api/lookups#table",
|
||||
"Vectors": "/api/vectors",
|
||||
"Language": "/api/language",
|
||||
"Defaults": "/api/language#defaults",
|
||||
"Scorer": "/api/scorer",
|
||||
"DocBin": "/api/docbin",
|
||||
"FactoryMeta": "/api/language#factorymeta",
|
||||
"Tokenizer": "/api/tokenizer",
|
||||
"MorphAnalysis": "/api/morphology#morphanalysis",
|
||||
"KnowledgeBase": "/api/kb",
|
||||
"Candidate": "/api/kb#candidate",
|
||||
"Matcher": "/api/matcher",
|
||||
"PhraseMatcher": "/api/phrasematcher",
|
||||
"TransformerData": "/api/transformer#transformerdata",
|
||||
"FullTransformerBatch": "/api/transformer#fulltransformerbatch",
|
||||
"LexemeC": "/api/cython-structs#lexemec",
|
||||
"TokenC": "/api/cython-structs#tokenc",
|
||||
"Config": "https://thinc.ai/docs/api-config#config",
|
||||
"Optimizer": "https://thinc.ai/docs/api-optimizers",
|
||||
"Model": "https://thinc.ai/docs/api-model",
|
||||
"Ragged": "https://thinc.ai/docs/api-types#ragged",
|
||||
"Floats2d": "https://thinc.ai/docs/api-types#types",
|
||||
"Floats3d": "https://thinc.ai/docs/api-types#types",
|
||||
"FloatsXd": "https://thinc.ai/docs/api-types#types",
|
||||
"cymem.Pool": "https://github.com/explosion/cymem",
|
||||
"preshed.BloomFilter": "https://github.com/explosion/preshed",
|
||||
"transformers.BatchEncoding": "https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding",
|
||||
"torch.Tensor": "https://pytorch.org/docs/stable/tensors.html",
|
||||
"numpy.ndarray": "https://numpy.org/doc/stable/reference/generated/numpy.ndarray.html",
|
||||
"Match": "https://docs.python.org/3/library/re.html#match-objects",
|
||||
"Pattern": "https://docs.python.org/3/library/re.html#regular-expression-objects",
|
||||
"Path": "https://docs.python.org/3/library/pathlib.html"
|
||||
}
|
|
@ -1,4 +1,4 @@
|
|||
import React from 'react'
|
||||
import React, { Fragment } from 'react'
|
||||
import PropTypes from 'prop-types'
|
||||
import classNames from 'classnames'
|
||||
import highlightCode from 'gatsby-remark-prismjs/highlight-code.js'
|
||||
|
@ -6,12 +6,13 @@ import rangeParser from 'parse-numeric-range'
|
|||
import { StaticQuery, graphql } from 'gatsby'
|
||||
import { window } from 'browser-monads'
|
||||
|
||||
import CUSTOM_TYPES from '../../meta/type-annotations.json'
|
||||
import { isString, htmlToReact } from './util'
|
||||
import Link from './link'
|
||||
import GitHubCode from './github'
|
||||
import classes from '../styles/code.module.sass'
|
||||
|
||||
const WRAP_THRESHOLD = 16
|
||||
const WRAP_THRESHOLD = 30
|
||||
|
||||
export default props => (
|
||||
<Pre>
|
||||
|
@ -40,6 +41,52 @@ InlineCode.propTypes = {
|
|||
children: PropTypes.node,
|
||||
}
|
||||
|
||||
function linkType(el, showLink = true) {
|
||||
if (!isString(el) || !el.length) return el
|
||||
const elStr = el.trim()
|
||||
if (!elStr) return el
|
||||
const typeUrl = CUSTOM_TYPES[elStr]
|
||||
const url = typeUrl == true ? DEFAULT_TYPE_URL : typeUrl
|
||||
const ws = el[0] == ' '
|
||||
return url && showLink ? (
|
||||
<Fragment>
|
||||
{ws && ' '}
|
||||
<Link to={url} hideIcon>
|
||||
{elStr}
|
||||
</Link>
|
||||
</Fragment>
|
||||
) : (
|
||||
el
|
||||
)
|
||||
}
|
||||
|
||||
export const TypeAnnotation = ({ lang = 'python', link = true, children }) => {
|
||||
// Hacky, but we're temporarily replacing a dot to prevent it from being split during highlighting
|
||||
const TMP_DOT = '•'
|
||||
const code = Array.isArray(children) ? children.join('') : children || ''
|
||||
const rawStr = code.replace('.', TMP_DOT)
|
||||
const rawHtml = lang === 'none' || !code ? code : highlightCode(lang, rawStr)
|
||||
const html = rawHtml.replace(TMP_DOT, '.').replace(/\n/g, ' ')
|
||||
const result = htmlToReact(html)
|
||||
const elements = Array.isArray(result) ? result : [result]
|
||||
const annotClassNames = classNames(
|
||||
'type-annotation',
|
||||
`language-${lang}`,
|
||||
classes.inlineCode,
|
||||
classes.typeAnnotation,
|
||||
{
|
||||
[classes.wrap]: code.length >= WRAP_THRESHOLD,
|
||||
}
|
||||
)
|
||||
return (
|
||||
<code className={annotClassNames} aria-label="Type annotation">
|
||||
{elements.map((el, i) => (
|
||||
<Fragment key={i}>{linkType(el, !!link)}</Fragment>
|
||||
))}
|
||||
</code>
|
||||
)
|
||||
}
|
||||
|
||||
export class Code extends React.Component {
|
||||
state = { Juniper: null }
|
||||
|
||||
|
|
|
@ -56,6 +56,38 @@
|
|||
--color-inline-code-text: var(--color-back)
|
||||
--color-inline-code-bg: var(--color-dark-secondary)
|
||||
|
||||
.type-annotation,
|
||||
white-space: pre-wrap
|
||||
font-family: var(--font-code)
|
||||
|
||||
&.wrap
|
||||
word-wrap: break-word
|
||||
|
||||
a
|
||||
border: 0
|
||||
|
||||
// Special style for types in API tables
|
||||
td > &:last-child
|
||||
display: block
|
||||
border-top: 1px dotted var(--color-subtle)
|
||||
border-radius: 0
|
||||
background: none
|
||||
width: calc(100% + 2rem)
|
||||
margin-left: -1rem
|
||||
padding-left: 1rem
|
||||
padding-top: 5px
|
||||
margin-top: 5px
|
||||
margin-bottom: -5px
|
||||
|
||||
&:before
|
||||
content: "Type: "
|
||||
opacity: 0.75
|
||||
font-family: var(--font-primary)
|
||||
color: var(--color-dark-secondary)
|
||||
font-weight: bold
|
||||
text-transform: uppercase
|
||||
margin-right: 5px
|
||||
|
||||
.wrap
|
||||
white-space: pre-wrap
|
||||
word-wrap: anywhere
|
||||
|
|
|
@ -358,6 +358,15 @@ body [id]:target
|
|||
&.italic
|
||||
font-style: italic
|
||||
|
||||
|
||||
[class*="language-"].type-annotation .token
|
||||
&.builtin, &.boolean, &.number
|
||||
color: var(--color-inline-code-text)
|
||||
|
||||
&.operator
|
||||
color: var(--syntax-comment)
|
||||
|
||||
|
||||
// Settings for ini syntax (config files)
|
||||
[class*="language-ini"]
|
||||
color: var(--syntax-comment)
|
||||
|
|
|
@ -29,7 +29,8 @@
|
|||
border: 0
|
||||
|
||||
.td
|
||||
padding: 1rem
|
||||
padding: 0.9rem 1rem
|
||||
font-size: 95%
|
||||
|
||||
&:not(:last-child)
|
||||
border-right: 1px solid var(--color-subtle)
|
||||
|
|
|
@ -20,7 +20,7 @@ import SEO from '../components/seo'
|
|||
import Link from '../components/link'
|
||||
import Section, { Hr } from '../components/section'
|
||||
import { Table, Tr, Th, Td } from '../components/table'
|
||||
import { Pre, Code, InlineCode } from '../components/code'
|
||||
import { Pre, Code, InlineCode, TypeAnnotation } from '../components/code'
|
||||
import { Ol, Ul, Li } from '../components/list'
|
||||
import { H2, H3, H4, H5, P, Abbr, Help } from '../components/typography'
|
||||
import Accordion from '../components/accordion'
|
||||
|
@ -41,6 +41,7 @@ const mdxComponents = {
|
|||
pre: Pre,
|
||||
code: Code,
|
||||
inlineCode: InlineCode,
|
||||
del: TypeAnnotation,
|
||||
table: Table,
|
||||
img: Image,
|
||||
tr: Tr,
|
||||
|
|
Loading…
Reference in New Issue
Block a user