mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-24 16:24:16 +03:00
Merge branch 'develop' into feature/prepare
This commit is contained in:
commit
d3c63b7965
|
@ -14,7 +14,7 @@ pathy
|
|||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
pydantic>=1.5.0,<2.0.0
|
||||
pytokenizations
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
|
|
|
@ -51,7 +51,7 @@ install_requires =
|
|||
tqdm>=4.38.0,<5.0.0
|
||||
numpy>=1.15.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
pydantic>=1.5.0,<2.0.0
|
||||
pytokenizations
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
|
|
|
@ -114,6 +114,6 @@ def project_document(
|
|||
content = f"{before}{content}{after}"
|
||||
else:
|
||||
msg.warn("Replacing existing file")
|
||||
with output_file.open("w") as f:
|
||||
with output_file.open("w", encoding="utf8") as f:
|
||||
f.write(content)
|
||||
msg.good("Saved project documentation", output_file)
|
||||
|
|
|
@ -270,6 +270,7 @@ factory = "{{ pipe }}"
|
|||
@readers = "spacy.Corpus.v1"
|
||||
path = ${paths.train}
|
||||
max_length = {{ 500 if hardware == "gpu" else 2000 }}
|
||||
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
|
|
|
@ -36,6 +36,11 @@ gold_preproc = false
|
|||
max_length = 0
|
||||
# Limitation on number of training examples
|
||||
limit = 0
|
||||
# Apply some simple data augmentation, where we replace tokens with variations.
|
||||
# This is especially useful for punctuation and case replacement, to help
|
||||
# generalize beyond corpora that don't have smart-quotes, or only have smart
|
||||
# quotes, etc.
|
||||
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
|
||||
|
||||
[corpora.dev]
|
||||
@readers = "spacy.Corpus.v1"
|
||||
|
|
|
@ -4,7 +4,7 @@ from spacy.training import biluo_tags_to_spans, iob_to_biluo
|
|||
from spacy.training import Corpus, docs_to_json
|
||||
from spacy.training.example import Example
|
||||
from spacy.training.converters import json_to_docs
|
||||
from spacy.training.augment import make_orth_variants_example
|
||||
from spacy.training.augment import create_orth_variants_augmenter
|
||||
from spacy.lang.en import English
|
||||
from spacy.tokens import Doc, DocBin
|
||||
from spacy.util import get_words_and_spaces, minibatch
|
||||
|
@ -496,9 +496,8 @@ def test_make_orth_variants(doc):
|
|||
output_file = tmpdir / "roundtrip.spacy"
|
||||
DocBin(docs=[doc]).to_disk(output_file)
|
||||
# due to randomness, test only that this runs with no errors for now
|
||||
reader = Corpus(output_file)
|
||||
train_example = next(reader(nlp))
|
||||
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
|
||||
reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
|
||||
train_examples = list(reader(nlp))
|
||||
|
||||
|
||||
@pytest.mark.skip("Outdated")
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
from .corpus import Corpus # noqa: F401
|
||||
from .example import Example, validate_examples # noqa: F401
|
||||
from .align import Alignment # noqa: F401
|
||||
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
|
||||
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
|
||||
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
|
||||
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401
|
||||
|
|
|
@ -1,30 +1,50 @@
|
|||
from typing import Callable
|
||||
import random
|
||||
import itertools
|
||||
import copy
|
||||
from functools import partial
|
||||
from ..util import registry
|
||||
|
||||
|
||||
def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming
|
||||
raw_text = example.text
|
||||
orig_dict = example.to_dict()
|
||||
variant_text, variant_token_annot = make_orth_variants(
|
||||
nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
|
||||
)
|
||||
doc = nlp.make_doc(variant_text)
|
||||
orig_dict["token_annotation"] = variant_token_annot
|
||||
return example.from_dict(doc, orig_dict)
|
||||
@registry.augmenters("spacy.dont_augment.v1")
|
||||
def create_null_augmenter():
|
||||
return dont_augment
|
||||
|
||||
|
||||
def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
||||
if random.random() >= orth_variant_level:
|
||||
return raw_text, orig_token_dict
|
||||
if not orig_token_dict:
|
||||
return raw_text, orig_token_dict
|
||||
raw = raw_text
|
||||
token_dict = orig_token_dict
|
||||
lower = False
|
||||
if random.random() >= 0.5:
|
||||
lower = True
|
||||
if raw is not None:
|
||||
raw = raw.lower()
|
||||
@registry.augmenters("spacy.orth_variants.v1")
|
||||
def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
|
||||
"""Create a data augmentation callback that uses orth-variant replacement.
|
||||
The callback can be added to a corpus or other data iterator during training.
|
||||
"""
|
||||
return partial(orth_variants_augmenter, level=level, lower=lower)
|
||||
|
||||
|
||||
def dont_augment(nlp, example):
|
||||
yield example
|
||||
|
||||
|
||||
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
|
||||
if random.random() >= level:
|
||||
yield example
|
||||
else:
|
||||
raw_text = example.text
|
||||
orig_dict = example.to_dict()
|
||||
if not orig_dict["token_annotation"]:
|
||||
yield example
|
||||
else:
|
||||
variant_text, variant_token_annot = make_orth_variants(
|
||||
nlp,
|
||||
raw_text,
|
||||
orig_dict["token_annotation"],
|
||||
lower=raw_text is not None and random.random() < lower
|
||||
)
|
||||
doc = nlp.make_doc(variant_text)
|
||||
orig_dict["token_annotation"] = variant_token_annot
|
||||
yield example.from_dict(doc, orig_dict)
|
||||
|
||||
|
||||
def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
|
||||
orig_token_dict = copy.deepcopy(token_dict)
|
||||
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
|
||||
ndsv = orth_variants.get("single", [])
|
||||
ndpv = orth_variants.get("paired", [])
|
||||
|
@ -103,7 +123,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
|
|||
# something went wrong, abort
|
||||
# (add a warning message?)
|
||||
if not match_found:
|
||||
return raw_text, orig_token_dict
|
||||
return raw, orig_token_dict
|
||||
# add following whitespace
|
||||
while raw_idx < len(raw) and raw[raw_idx].isspace():
|
||||
variant_raw += raw[raw_idx]
|
||||
|
|
|
@ -1,9 +1,11 @@
|
|||
import warnings
|
||||
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
|
||||
from typing import Optional
|
||||
from pathlib import Path
|
||||
import srsly
|
||||
|
||||
from .. import util
|
||||
from .augment import dont_augment
|
||||
from .example import Example
|
||||
from ..errors import Warnings
|
||||
from ..tokens import DocBin, Doc
|
||||
|
@ -18,9 +20,19 @@ FILE_TYPE = ".spacy"
|
|||
|
||||
@util.registry.readers("spacy.Corpus.v1")
|
||||
def create_docbin_reader(
|
||||
path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0
|
||||
path: Path,
|
||||
gold_preproc: bool,
|
||||
max_length: int = 0,
|
||||
limit: int = 0,
|
||||
augmenter: Optional[Callable] = None,
|
||||
) -> Callable[["Language"], Iterable[Example]]:
|
||||
return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
|
||||
return Corpus(
|
||||
path,
|
||||
gold_preproc=gold_preproc,
|
||||
max_length=max_length,
|
||||
limit=limit,
|
||||
augmenter=augmenter,
|
||||
)
|
||||
|
||||
|
||||
@util.registry.readers("spacy.JsonlReader.v1")
|
||||
|
@ -70,6 +82,8 @@ class Corpus:
|
|||
0, which indicates no limit.
|
||||
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
|
||||
Defaults to 0, which indicates no limit.
|
||||
augmenter (Callable[[Language, Example], Iterable[Example]]): Optional data augmentation
|
||||
function, to extrapolate additional examples from your annotations.
|
||||
|
||||
DOCS: https://nightly.spacy.io/api/corpus
|
||||
"""
|
||||
|
@ -81,11 +95,13 @@ class Corpus:
|
|||
limit: int = 0,
|
||||
gold_preproc: bool = False,
|
||||
max_length: int = 0,
|
||||
augmenter: Optional[Callable] = None,
|
||||
) -> None:
|
||||
self.path = util.ensure_path(path)
|
||||
self.gold_preproc = gold_preproc
|
||||
self.max_length = max_length
|
||||
self.limit = limit
|
||||
self.augmenter = augmenter if augmenter is not None else dont_augment
|
||||
|
||||
def __call__(self, nlp: "Language") -> Iterator[Example]:
|
||||
"""Yield examples from the data.
|
||||
|
@ -100,7 +116,9 @@ class Corpus:
|
|||
examples = self.make_examples_gold_preproc(nlp, ref_docs)
|
||||
else:
|
||||
examples = self.make_examples(nlp, ref_docs)
|
||||
yield from examples
|
||||
for real_eg in examples:
|
||||
for augmented_eg in self.augmenter(nlp, real_eg):
|
||||
yield augmented_eg
|
||||
|
||||
def _make_example(
|
||||
self, nlp: "Language", reference: Doc, gold_preproc: bool
|
||||
|
|
|
@ -83,6 +83,7 @@ class registry(thinc.registry):
|
|||
callbacks = catalogue.create("spacy", "callbacks")
|
||||
batchers = catalogue.create("spacy", "batchers", entry_points=True)
|
||||
readers = catalogue.create("spacy", "readers", entry_points=True)
|
||||
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
|
||||
loggers = catalogue.create("spacy", "loggers", entry_points=True)
|
||||
# These are factories registered via third-party packages and the
|
||||
# spacy_factories entry point. This registry only exists so we can easily
|
||||
|
|
|
@ -143,11 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
|
|||
|
||||
Construct an embedding layer that separately embeds a number of lexical
|
||||
attributes using hash embedding, concatenates the results, and passes it through
|
||||
a feed-forward subnetwork to build a mixed representations. The features used
|
||||
are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
|
||||
definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
|
||||
pretrained static vectors can also be incorporated into the concatenated
|
||||
representation.
|
||||
a feed-forward subnetwork to build mixed representations. The features used are
|
||||
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
|
||||
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
|
||||
static vectors can also be incorporated into the concatenated representation.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
|
@ -170,7 +169,7 @@ representation.
|
|||
> nC = 8
|
||||
> ```
|
||||
|
||||
Construct an embedded representations based on character embeddings, using a
|
||||
Construct an embedded representation based on character embeddings, using a
|
||||
feed-forward network. A fixed number of UTF-8 byte characters are used for each
|
||||
word, taken from the beginning and end of the word equally. Padding is used in
|
||||
the center for words that are too short.
|
||||
|
@ -392,7 +391,7 @@ a single token vector given zero or more wordpiece vectors.
|
|||
> ```
|
||||
|
||||
Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
|
||||
**not** allow multiple components to share the transformer weights, and does
|
||||
**not** allow multiple components to share the transformer weights and does
|
||||
**not** allow the transformer to set annotations into the [`Doc`](/api/doc)
|
||||
object, but it's a **simpler solution** if you only need the transformer within
|
||||
one component.
|
||||
|
@ -437,7 +436,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
|
|||
helpful for background information. The neural network state prediction model
|
||||
consists of either two or three subnetworks:
|
||||
|
||||
- **tok2vec**: Map each token into a vector representations. This subnetwork is
|
||||
- **tok2vec**: Map each token into a vector representation. This subnetwork is
|
||||
run once for each batch.
|
||||
- **lower**: Construct a feature-specific vector for each `(token, feature)`
|
||||
pair. This is also run once for each batch. Constructing the state
|
||||
|
@ -575,14 +574,14 @@ architecture is usually less accurate than the ensemble, but runs faster.
|
|||
> nO = null
|
||||
> ```
|
||||
|
||||
An ngram "bag-of-words" model. This architecture should run much faster than the
|
||||
others, but may not be as accurate, especially if texts are short.
|
||||
An n-gram "bag-of-words" model. This architecture should run much faster than
|
||||
the others, but may not be as accurate, especially if texts are short.
|
||||
|
||||
| Name | Description |
|
||||
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
|
||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3`would give unigram, trigram and bigram features. ~~int~~ |
|
||||
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
|
||||
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
|
||||
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
|
||||
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
|
||||
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
|
||||
|
||||
|
@ -596,7 +595,7 @@ into the "real world". This requires 3 main components:
|
|||
synonyms and prior probabilities.
|
||||
- A candidate generation step to produce a set of likely identifiers, given a
|
||||
certain textual mention.
|
||||
- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
|
||||
- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
|
||||
most plausible ID from the set of candidates.
|
||||
|
||||
### spacy.EntityLinker.v1 {#EntityLinker}
|
||||
|
|
|
@ -71,7 +71,7 @@ pattern_dicts = [
|
|||
|
||||
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the attribute ruler to a Doc, setting token attributes for tokens matched
|
||||
Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
|
||||
by the provided patterns.
|
||||
|
||||
| Name | Description |
|
||||
|
@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument.
|
|||
| Name | Description |
|
||||
| ---------- | -------------------------------------------------------------- |
|
||||
| `vocab` | The shared [`Vocab`](/api/vocab). |
|
||||
| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
|
||||
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
|
||||
| `attrs` | The attributes to set. You usually don't want to exclude this. |
|
||||
| `indices` | The token indices. You usually don't want to exclude this. |
|
||||
|
|
|
@ -81,7 +81,7 @@ $ python -m spacy info [model] [--markdown] [--silent]
|
|||
Find all trained pipeline packages installed in the current environment and
|
||||
check whether they are compatible with the currently installed version of spaCy.
|
||||
Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
|
||||
all installed packages are can be used with the new version. It will show a list
|
||||
all installed packages can be used with the new version. It will show a list
|
||||
of packages and their installed versions. If any package is out of date, the
|
||||
latest compatible versions and command for updating are shown.
|
||||
|
||||
|
@ -408,7 +408,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
|
|||
|
||||
### debug data {#debug-data tag="command"}
|
||||
|
||||
Analyze, debug, and validate your training and development data. Get useful
|
||||
Analyze, debug and validate your training and development data. Get useful
|
||||
stats, and find problems like invalid entity annotations, cyclic dependencies,
|
||||
low data labels and more.
|
||||
|
||||
|
|
|
@ -74,6 +74,7 @@ train/test skew.
|
|||
| `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
|
||||
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
|
||||
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
|
||||
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~ |
|
||||
|
||||
## Corpus.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
|
|
@ -274,7 +274,7 @@ Typically, the extension for these binary files is `.spacy`, and they are used
|
|||
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
|
||||
CLI [`train`](/api/cli#train) command. The built-in
|
||||
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
|
||||
[JSON format](#json-input) to the new binary format format. It also supports
|
||||
[JSON format](#json-input) to the new binary format. It also supports
|
||||
conversion of the `.conllu` format used by the
|
||||
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
|
||||
|
||||
|
@ -338,7 +338,7 @@ $ python -m spacy convert ./data.json ./output.spacy
|
|||
|
||||
<Accordion title="Sample JSON data" spaced>
|
||||
|
||||
Here's an example of dependencies, part-of-speech tags and names entities, taken
|
||||
Here's an example of dependencies, part-of-speech tags and named entities, taken
|
||||
from the English Wall Street Journal portion of the Penn Treebank:
|
||||
|
||||
```json
|
||||
|
|
|
@ -21,8 +21,7 @@ non-projective parses.
|
|||
The parser is trained using an **imitation learning objective**. It follows the
|
||||
actions predicted by the current weights, and at each state, determines which
|
||||
actions are compatible with the optimal parse that could be reached from the
|
||||
current state. The weights such that the scores assigned to the set of optimal
|
||||
actions is increased, while scores assigned to other actions are decreased. Note
|
||||
current state. The weights are updated such that the scores assigned to the set of optimal actions are increased, while scores assigned to other actions are decreased. Note
|
||||
that more than one action may be optimal for a given state.
|
||||
|
||||
## Config and implementation {#config}
|
||||
|
|
|
@ -503,8 +503,7 @@ invalidated, although they may accidentally continue to work.
|
|||
Mark a span for merging. The `attrs` will be applied to the resulting token (if
|
||||
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
|
||||
underlying lexeme (if they're context-independent lexical attributes like
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
|
||||
dictionary mapping attribute names to values as the `"_"` key.
|
||||
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
|
@ -94,7 +94,7 @@ providing custom registered functions.
|
|||
|
||||
## EntityLinker.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
Apply the pipe to one document. The document is modified in place and returned.
|
||||
This usually happens under the hood when the `nlp` object is called on a text
|
||||
and all pipeline components are applied to the `Doc` in order. Both
|
||||
[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)
|
||||
|
|
|
@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
|
|||
|
||||
| Setting | Description |
|
||||
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]] |
|
||||
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
|
||||
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
|
||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
|
||||
|
||||
|
@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and
|
|||
|
||||
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
Apply the pipe to one document. The document is modified in place, and returned.
|
||||
Apply the pipe to one document. The document is modified in place and returned.
|
||||
This usually happens under the hood when the `nlp` object is called on a text
|
||||
and all pipeline components are applied to the `Doc` in order. Both
|
||||
[`__call__`](/api/entityrecognizer#call) and
|
||||
|
|
|
@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler.
|
|||
| Name | Description |
|
||||
| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
|
||||
| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ |
|
||||
| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
|
||||
| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]]~~ |
|
||||
| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |
|
||||
|
|
|
@ -33,8 +33,8 @@ both documents.
|
|||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
|
||||
| `reference` | The document containing gold-standard annotations. Can not be `None`. ~~Doc~~ |
|
||||
| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
|
||||
| `reference` | The document containing gold-standard annotations. Cannot be `None`. ~~Doc~~ |
|
||||
| _keyword-only_ | |
|
||||
| `alignment` | An object holding the alignment between the tokens of the `predicted` and `reference` documents. ~~Optional[Alignment]~~ |
|
||||
|
||||
|
@ -58,8 +58,8 @@ see the [training format documentation](/api/data-formats#dict-input).
|
|||
|
||||
| Name | Description |
|
||||
| -------------- | ------------------------------------------------------------------------- |
|
||||
| `predicted` | The document containing (partial) predictions. Can not be `None`. ~~Doc~~ |
|
||||
| `example_dict` | `Dict[str, obj]` | The gold-standard annotations as a dictionary. Can not be `None`. ~~Dict[str, Any]~~ |
|
||||
| `predicted` | The document containing (partial) predictions. Cannot be `None`. ~~Doc~~ |
|
||||
| `example_dict` | The gold-standard annotations as a dictionary. Cannot be `None`. ~~Dict[str, Any]~~ |
|
||||
| **RETURNS** | The newly constructed object. ~~Example~~ |
|
||||
|
||||
## Example.text {#text tag="property"}
|
||||
|
|
|
@ -46,10 +46,11 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
|
|||
## Language.from_config {#from_config tag="classmethod" new="3"}
|
||||
|
||||
Create a `Language` object from a loaded config. Will set up the tokenizer and
|
||||
language data, add pipeline components based on the pipeline and components
|
||||
define in the config and validate the results. If no config is provided, the
|
||||
default config of the given language is used. This is also how spaCy loads a
|
||||
model under the hood based on its [`config.cfg`](/api/data-formats#config).
|
||||
language data, and add pipeline components based on the pipeline and the
|
||||
component definitions specified in the config. If no config is
|
||||
provided, the default config of the given language is used. This is also how
|
||||
spaCy loads a model under the hood based on its
|
||||
[`config.cfg`](/api/data-formats#config).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -107,7 +108,7 @@ decorator. For more details and examples, see the
|
|||
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
|
||||
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
|
||||
|
||||
## Language.factory {#factory tag="classmethod"}
|
||||
|
||||
|
@ -154,7 +155,7 @@ examples, see the
|
|||
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
|
||||
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
|
||||
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
|
||||
| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
|
||||
|
||||
## Language.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -609,7 +610,7 @@ does nothing.
|
|||
|
||||
## Language.enable_pipe {#enable_pipe tag="method" new="3"}
|
||||
|
||||
Enable a previously disable component (e.g. via
|
||||
Enable a previously disabled component (e.g. via
|
||||
[`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of
|
||||
the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
|
||||
already enabled, this method does nothing.
|
||||
|
@ -636,7 +637,7 @@ pipeline will be restored to the initial state at the end of the block.
|
|||
Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
|
||||
you can use to undo your changes. You can specify either `disable` (as a list or
|
||||
string), or `enable`. In the latter case, all components not in the `enable`
|
||||
list, will be disabled. Under the hood, this method calls into
|
||||
list will be disabled. Under the hood, this method calls into
|
||||
[`disable_pipe`](/api/language#disable_pipe) and
|
||||
[`enable_pipe`](/api/language#enable_pipe).
|
||||
|
||||
|
@ -669,7 +670,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
|
|||
| -------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| _keyword-only_ | |
|
||||
| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| `enable` | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
|
||||
| **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
|
||||
|
||||
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
|
||||
|
@ -881,10 +882,10 @@ Loads state from a directory, including all data that was saved with the
|
|||
|
||||
<Infobox variant="warning" title="Important note">
|
||||
|
||||
Keep in mind that this method **only loads serialized state** and doesn't set up
|
||||
the `nlp` object. This means that it requires the correct language class to be
|
||||
initialized and all pipeline components to be added to the pipeline. If you want
|
||||
to load a serialized pipeline from a directory, you should use
|
||||
Keep in mind that this method **only loads the serialized state** and doesn't
|
||||
set up the `nlp` object. This means that it requires the correct language class
|
||||
to be initialized and all pipeline components to be added to the pipeline. If
|
||||
you want to load a serialized pipeline from a directory, you should use
|
||||
[`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
|
||||
|
||||
</Infobox>
|
||||
|
|
|
@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes
|
|||
how the component should be configured. You can override its settings via the
|
||||
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
|
||||
[`config.cfg` for training](/usage/training#config). For examples of the lookups
|
||||
data formats used by the lookup and rule-based lemmatizers, see
|
||||
data format used by the lookup and rule-based lemmatizers, see
|
||||
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
|
||||
|
||||
> #### Example
|
||||
|
|
|
@ -61,7 +61,7 @@ matched:
|
|||
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
|
||||
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
|
||||
| `+` | Require the pattern to match 1 or more times. |
|
||||
| `*` | Allow the pattern to match zero or more times. |
|
||||
| `*` | Allow the pattern to match 0 or more times. |
|
||||
|
||||
Token patterns can also map to a **dictionary of properties** instead of a
|
||||
single value to indicate whether the expected value is a member of a list or how
|
||||
|
|
|
@ -12,7 +12,7 @@ container storing a single morphological analysis.
|
|||
|
||||
## Morphology.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a Morphology object.
|
||||
Create a `Morphology` object.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -101,7 +101,7 @@ representation.
|
|||
| Name | Description |
|
||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
|
||||
| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
|
|
|
@ -200,7 +200,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
|
|||
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||
current model to make predictions similar to an initial model, to try to address
|
||||
current model to make predictions similar to an initial model to try to address
|
||||
the "catastrophic forgetting" problem. This feature is experimental.
|
||||
|
||||
> #### Example
|
||||
|
|
|
@ -8,7 +8,7 @@ api_string_name: sentencizer
|
|||
api_trainable: false
|
||||
---
|
||||
|
||||
A simple pipeline component, to allow custom sentence boundary detection logic
|
||||
A simple pipeline component to allow custom sentence boundary detection logic
|
||||
that doesn't require the dependency parse. By default, sentence segmentation is
|
||||
performed by the [`DependencyParser`](/api/dependencyparser), so the
|
||||
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
|
||||
|
@ -130,7 +130,7 @@ Score a batch of examples.
|
|||
|
||||
## Sentencizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
Save the sentencizer settings (punctuation characters) a directory. Will create
|
||||
Save the sentencizer settings (punctuation characters) to a directory. Will create
|
||||
a file `sentencizer.json`. This also happens automatically when you save an
|
||||
`nlp` object with a sentencizer added to its pipeline.
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object.
|
|||
|
||||
## Span.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a Span object from the slice `doc[start : end]`.
|
||||
Create a `Span` object from the slice `doc[start : end]`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -187,7 +187,7 @@ the character indices don't map to a valid span.
|
|||
| Name | Description |
|
||||
| ------------------------------------ | ----------------------------------------------------------------------------------------- |
|
||||
| `start` | The index of the first character of the span. ~~int~~ |
|
||||
| `end` | The index of the last character after the span. ~int~~ |
|
||||
| `end` | The index of the first character after the span. ~~int~~ |
|
||||
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
|
||||
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
|
||||
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
|
|
@ -157,7 +157,7 @@ This method was previously called `begin_training`.
|
|||
|
||||
## TextCategorizer.predict {#predict tag="method"}
|
||||
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
|
||||
modifying them.
|
||||
|
||||
> #### Example
|
||||
|
@ -174,7 +174,7 @@ modifying them.
|
|||
|
||||
## TextCategorizer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
|
||||
Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -217,7 +217,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
|
|||
## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
|
||||
|
||||
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
|
||||
current model to make predictions similar to an initial model, to try to address
|
||||
current model to make predictions similar to an initial model to try to address
|
||||
the "catastrophic forgetting" problem. This feature is experimental.
|
||||
|
||||
> #### Example
|
||||
|
@ -290,7 +290,7 @@ Create an optimizer for the pipeline component.
|
|||
|
||||
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
Modify the pipe's model, to use the given parameter values.
|
||||
Modify the pipe's model to use the given parameter values.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
|
|
@ -150,7 +150,7 @@ by [`Language.initialize`](/api/language#initialize).
|
|||
|
||||
## Tok2Vec.predict {#predict tag="method"}
|
||||
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
|
||||
modifying them.
|
||||
|
||||
> #### Example
|
||||
|
@ -223,7 +223,7 @@ Create an optimizer for the pipeline component.
|
|||
|
||||
## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
Modify the pipe's model, to use the given parameter values. At the end of the
|
||||
Modify the pipe's model to use the given parameter values. At the end of the
|
||||
context, the original parameters are restored.
|
||||
|
||||
> #### Example
|
||||
|
|
|
@ -243,7 +243,7 @@ A sequence of the token's immediate syntactic children.
|
|||
|
||||
## Token.lefts {#lefts tag="property" model="parser"}
|
||||
|
||||
The leftward immediate children of the word, in the syntactic dependency parse.
|
||||
The leftward immediate children of the word in the syntactic dependency parse.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -259,7 +259,7 @@ The leftward immediate children of the word, in the syntactic dependency parse.
|
|||
|
||||
## Token.rights {#rights tag="property" model="parser"}
|
||||
|
||||
The rightward immediate children of the word, in the syntactic dependency parse.
|
||||
The rightward immediate children of the word in the syntactic dependency parse.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -275,7 +275,7 @@ The rightward immediate children of the word, in the syntactic dependency parse.
|
|||
|
||||
## Token.n_lefts {#n_lefts tag="property" model="parser"}
|
||||
|
||||
The number of leftward immediate children of the word, in the syntactic
|
||||
The number of leftward immediate children of the word in the syntactic
|
||||
dependency parse.
|
||||
|
||||
> #### Example
|
||||
|
@ -291,7 +291,7 @@ dependency parse.
|
|||
|
||||
## Token.n_rights {#n_rights tag="property" model="parser"}
|
||||
|
||||
The number of rightward immediate children of the word, in the syntactic
|
||||
The number of rightward immediate children of the word in the syntactic
|
||||
dependency parse.
|
||||
|
||||
> #### Example
|
||||
|
@ -422,8 +422,8 @@ The L2 norm of the token's vector representation.
|
|||
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
|
||||
| `lower` | Lowercase form of the token. ~~int~~ |
|
||||
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
|
||||
| `shape` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
|
||||
| `shape_` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
|
||||
| `shape` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~int~~ |
|
||||
| `shape_` | Transform of the token's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. ~~str~~ |
|
||||
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
|
||||
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
|
||||
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
|
||||
|
@ -451,7 +451,7 @@ The L2 norm of the token's vector representation.
|
|||
| `tag` | Fine-grained part-of-speech. ~~int~~ |
|
||||
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
|
||||
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
|
||||
| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ |
|
||||
| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
|
||||
| `dep` | Syntactic dependency relation. ~~int~~ |
|
||||
| `dep_` | Syntactic dependency relation. ~~str~~ |
|
||||
| `lang` | Language of the parent document's vocabulary. ~~int~~ |
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
---
|
||||
title: Tokenizer
|
||||
teaser: Segment text into words, punctuations marks etc.
|
||||
teaser: Segment text into words, punctuation marks, etc.
|
||||
tag: class
|
||||
source: spacy/tokenizer.pyx
|
||||
---
|
||||
|
@ -15,14 +15,14 @@ source: spacy/tokenizer.pyx
|
|||
Segment text, and create `Doc` objects with the discovered segment boundaries.
|
||||
For a deeper understanding, see the docs on
|
||||
[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
|
||||
The tokenizer is typically created automatically when the a
|
||||
The tokenizer is typically created automatically when a
|
||||
[`Language`](/api/language) subclass is initialized and it reads its settings
|
||||
like punctuation and special case rules from the
|
||||
[`Language.Defaults`](/api/language#defaults) provided by the language subclass.
|
||||
|
||||
## Tokenizer.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples
|
||||
Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
|
||||
of how to construct a custom tokenizer with different tokenization rules, see
|
||||
the
|
||||
[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
|
||||
|
@ -87,7 +87,7 @@ Tokenize a stream of texts.
|
|||
| ------------ | ------------------------------------------------------------------------------------ |
|
||||
| `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
|
||||
| `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
|
||||
| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ |
|
||||
| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ |
|
||||
|
||||
## Tokenizer.find_infix {#find_infix tag="method"}
|
||||
|
||||
|
|
|
@ -198,7 +198,7 @@ browser. Will run a simple web server.
|
|||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
|
||||
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
|
||||
|
||||
|
@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
|
|||
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
|
||||
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
|
||||
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
|
||||
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
|
||||
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
|
||||
| **RETURNS** | The rendered HTML markup. ~~str~~ |
|
||||
|
||||
|
@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
|
|||
| Name | Description |
|
||||
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
|
||||
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
|
||||
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
|
||||
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
|
||||
|
@ -623,7 +623,7 @@ sequences in the batch.
|
|||
|
||||
Encode labelled spans into per-token tags, using the
|
||||
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
|
||||
Out). Returns a list of strings, describing the tags. Each tag string will be of
|
||||
Out). Returns a list of strings, describing the tags. Each tag string will be in
|
||||
the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of
|
||||
`"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets
|
||||
don't align with the tokenization in the `Doc` object. The training algorithm
|
||||
|
@ -747,7 +747,7 @@ decorator.
|
|||
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
|
||||
|
||||
Check whether a `Language` subclass is already loaded. `Language` subclasses are
|
||||
loaded lazily, to avoid expensive setup code associated with the language data.
|
||||
loaded lazily to avoid expensive setup code associated with the language data.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -935,7 +935,7 @@ Compile a sequence of prefix rules into a regex object.
|
|||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
|
||||
| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
|
||||
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
|
||||
|
||||
|
@ -952,7 +952,7 @@ Compile a sequence of suffix rules into a regex object.
|
|||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
|
||||
| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
|
||||
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
|
||||
|
||||
|
@ -969,7 +969,7 @@ Compile a sequence of infix rules into a regex object.
|
|||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
|
||||
| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
|
||||
|
||||
### util.minibatch {#util.minibatch tag="function" new="2"}
|
||||
|
||||
|
|
|
@ -185,7 +185,7 @@ by [`Language.initialize`](/api/language#initialize).
|
|||
|
||||
## Transformer.predict {#predict tag="method"}
|
||||
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
|
||||
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
|
||||
modifying them.
|
||||
|
||||
> #### Example
|
||||
|
@ -202,7 +202,7 @@ modifying them.
|
|||
|
||||
## Transformer.set_annotations {#set_annotations tag="method"}
|
||||
|
||||
Assign the extracted features to the Doc objects. By default, the
|
||||
Assign the extracted features to the `Doc` objects. By default, the
|
||||
[`TransformerData`](/api/transformer#transformerdata) object is written to the
|
||||
[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
|
||||
callback is then called, if provided.
|
||||
|
@ -271,7 +271,7 @@ Create an optimizer for the pipeline component.
|
|||
|
||||
## Transformer.use_params {#use_params tag="method, contextmanager"}
|
||||
|
||||
Modify the pipe's model, to use the given parameter values. At the end of the
|
||||
Modify the pipe's model to use the given parameter values. At the end of the
|
||||
context, the original parameters are restored.
|
||||
|
||||
> #### Example
|
||||
|
@ -387,8 +387,8 @@ by this class. Instances of this class are typically assigned to the
|
|||
|
||||
| Name | Description |
|
||||
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
|
||||
| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
|
||||
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
|
||||
| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
|
||||
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||
| `width` | The width of the last hidden layer. ~~int~~ |
|
||||
|
||||
|
@ -408,7 +408,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
|
|||
|
||||
| Name | Description |
|
||||
| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
|
||||
| `spans` | The batch of input spans. The outer list refers to the `Doc` objects in the batch, and the inner list contains the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
|
||||
| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
|
||||
| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
|
||||
| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||
|
@ -438,10 +438,10 @@ Split a `TransformerData` object that represents a batch into a list with one
|
|||
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
|
||||
|
||||
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
|
||||
return a lists of [`Span`](/api/span) objects for each doc, to be processed by
|
||||
the transformer. This is used to manage long documents, by cutting them into
|
||||
return a list of [`Span`](/api/span) objects for each doc to be processed by
|
||||
the transformer. This is used to manage long documents by cutting them into
|
||||
smaller sequences before running the transformer. The spans are allowed to
|
||||
overlap, and you can also omit sections of the Doc if they are not relevant.
|
||||
overlap, and you can also omit sections of the `Doc` if they are not relevant.
|
||||
|
||||
Span getters can be referenced in the `[components.transformer.model.get_spans]`
|
||||
block of the config to customize the sequences processed by the transformer. You
|
||||
|
|
|
@ -290,7 +290,7 @@ If a table is full, it can be resized using
|
|||
## Vectors.n_keys {#n_keys tag="property"}
|
||||
|
||||
Get the number of keys in the table. Note that this is the number of _all_ keys,
|
||||
not just unique vectors. If several keys are mapped are mapped to the same
|
||||
not just unique vectors. If several keys are mapped to the same
|
||||
vectors, they will be counted individually.
|
||||
|
||||
> #### Example
|
||||
|
@ -307,10 +307,10 @@ vectors, they will be counted individually.
|
|||
|
||||
## Vectors.most_similar {#most_similar tag="method"}
|
||||
|
||||
For each of the given vectors, find the `n` most similar entries to it, by
|
||||
For each of the given vectors, find the `n` most similar entries to it by
|
||||
cosine. Queries are by vector. Results are returned as a
|
||||
`(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are
|
||||
performed in chunks, to avoid consuming too much memory. You can set the
|
||||
performed in chunks to avoid consuming too much memory. You can set the
|
||||
`batch_size` to control the size/space trade-off during the calculations.
|
||||
|
||||
> #### Example
|
||||
|
|
|
@ -29,7 +29,7 @@ Create the vocabulary.
|
|||
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
|
||||
| `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ |
|
||||
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
|
||||
| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
|
||||
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
|
||||
|
||||
## Vocab.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words
|
|||
would then be remapped to the closest remaining vector – so "feline" would have
|
||||
the same vector as "cat", and "reclined" would have the same vector as "sat".
|
||||
The similarities are judged by cosine. The original vectors may be large, so the
|
||||
cosines are calculated in minibatches, to reduce memory usage.
|
||||
cosines are calculated in minibatches to reduce memory usage.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage.
|
|||
Retrieve a vector for a word in the vocabulary. Words can be looked up by string
|
||||
or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
|
||||
is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
|
||||
subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
|
||||
subword features by averaging over n-grams of `orth` (introduced in spaCy `v2.1`).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
|
|||
| Name | Description |
|
||||
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
|
||||
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
|
||||
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
|
||||
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
|
||||
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
|
||||
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
|
||||
| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
|
||||
|
||||
## Vocab.set_vector {#set_vector tag="method" new="2"}
|
||||
|
||||
Set a vector for a word in the vocabulary. Words can be referenced by by string
|
||||
Set a vector for a word in the vocabulary. Words can be referenced by string
|
||||
or hash value.
|
||||
|
||||
> #### Example
|
||||
|
|
|
@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_.
|
|||
If you have a list of terms with no context around them, a transformer model
|
||||
like BERT can't really help you. BERT is designed to understand language **in
|
||||
context**, which isn't what you have. A word vectors table will be a much better
|
||||
fit for your task. However, if you do have words in context — whole sentences or
|
||||
paragraphs of running text — word vectors will only provide a very rough
|
||||
fit for your task. However, if you do have words in context – whole sentences or
|
||||
paragraphs of running text – word vectors will only provide a very rough
|
||||
approximation of what the text is about.
|
||||
|
||||
Word vectors are also very computationally efficient, as they map a word to a
|
||||
|
@ -256,7 +256,7 @@ for doc in nlp.pipe(["some text", "some other text"]):
|
|||
```
|
||||
|
||||
You can also customize how the [`Transformer`](/api/transformer) component sets
|
||||
annotations onto the [`Doc`](/api/doc), by specifying a custom
|
||||
annotations onto the [`Doc`](/api/doc) by specifying a custom
|
||||
`set_extra_annotations` function. This callback will be called with the raw
|
||||
input and output data for the whole batch, along with the batch of `Doc`
|
||||
objects, allowing you to implement whatever you need. The annotation setter is
|
||||
|
@ -675,7 +675,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another
|
|||
|
||||
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
|
||||
subnetwork** within one of your components, and add additional layers to build a
|
||||
network for a temporary task, that forces the model to learn something about
|
||||
network for a temporary task that forces the model to learn something about
|
||||
sentence structure and word cooccurrence statistics. Pretraining produces a
|
||||
**binary weights file** that can be loaded back in at the start of training. The
|
||||
weights file specifies an initial set of weights. Training then proceeds as
|
||||
|
|
|
@ -6,6 +6,7 @@ menu:
|
|||
- ['Introduction', 'basics']
|
||||
- ['Quickstart', 'quickstart']
|
||||
- ['Config System', 'config']
|
||||
<!-- - ['Data Utilities', 'data'] -->
|
||||
- ['Custom Functions', 'custom-functions']
|
||||
- ['Parallel Training', 'parallel-training']
|
||||
- ['Internal API', 'api']
|
||||
|
@ -505,6 +506,16 @@ still look good.
|
|||
|
||||
</Accordion>
|
||||
|
||||
<!--
|
||||
## Data Utilities {#data-utilities}
|
||||
|
||||
* spacy convert
|
||||
* The [corpora] block
|
||||
* Custom corpus class
|
||||
* Minibatching
|
||||
* Data augmentation
|
||||
-->
|
||||
|
||||
## Custom Functions {#custom-functions}
|
||||
|
||||
Registered functions in the training config files can refer to built-in
|
||||
|
@ -689,7 +700,7 @@ from pathlib import Path
|
|||
@spacy.registry.loggers("my_custom_logger.v1")
|
||||
def custom_logger(log_path):
|
||||
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
|
||||
with Path(log_path).open("w") as file_:
|
||||
with Path(log_path).open("w", encoding="utf8") as file_:
|
||||
file_.write("step\\t")
|
||||
file_.write("score\\t")
|
||||
for pipe in nlp.pipe_names:
|
||||
|
|
|
@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None):
|
|||
data_str = f"export const DATA = {data}"
|
||||
result = compiler.get_output()
|
||||
if output is not None:
|
||||
with output.open("w") as f:
|
||||
with output.open("w", encoding="utf8") as f:
|
||||
f.write(f"{header}\n{result}\n{data_str}")
|
||||
print(f"Updated {output.parts[-1]}")
|
||||
else:
|
||||
|
|
Loading…
Reference in New Issue
Block a user