Merge branch 'develop' into feature/prepare

This commit is contained in:
Ines Montani 2020-09-29 20:53:05 +02:00
commit d3c63b7965
39 changed files with 189 additions and 134 deletions

View File

@ -14,7 +14,7 @@ pathy
numpy>=1.15.0
requests>=2.13.0,<3.0.0
tqdm>=4.38.0,<5.0.0
pydantic>=1.3.0,<2.0.0
pydantic>=1.5.0,<2.0.0
pytokenizations
# Official Python utilities
setuptools

View File

@ -51,7 +51,7 @@ install_requires =
tqdm>=4.38.0,<5.0.0
numpy>=1.15.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
pydantic>=1.5.0,<2.0.0
pytokenizations
# Official Python utilities
setuptools

View File

@ -114,6 +114,6 @@ def project_document(
content = f"{before}{content}{after}"
else:
msg.warn("Replacing existing file")
with output_file.open("w") as f:
with output_file.open("w", encoding="utf8") as f:
f.write(content)
msg.good("Saved project documentation", output_file)

View File

@ -270,6 +270,7 @@ factory = "{{ pipe }}"
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = {{ 500 if hardware == "gpu" else 2000 }}
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
[corpora.dev]
@readers = "spacy.Corpus.v1"

View File

@ -36,6 +36,11 @@ gold_preproc = false
max_length = 0
# Limitation on number of training examples
limit = 0
# Apply some simply data augmentation, where we replace tokens with variations.
# This is especially useful for punctuation and case replacement, to help
# generalize beyond corpora that don't have smart-quotes, or only have smart
# quotes, etc.
augmenter = {"@augmenters": "spacy.orth_variants.v1", "level": 0.1, "lower": 0.5}
[corpora.dev]
@readers = "spacy.Corpus.v1"

View File

@ -4,7 +4,7 @@ from spacy.training import biluo_tags_to_spans, iob_to_biluo
from spacy.training import Corpus, docs_to_json
from spacy.training.example import Example
from spacy.training.converters import json_to_docs
from spacy.training.augment import make_orth_variants_example
from spacy.training.augment import create_orth_variants_augmenter
from spacy.lang.en import English
from spacy.tokens import Doc, DocBin
from spacy.util import get_words_and_spaces, minibatch
@ -496,9 +496,8 @@ def test_make_orth_variants(doc):
output_file = tmpdir / "roundtrip.spacy"
DocBin(docs=[doc]).to_disk(output_file)
# due to randomness, test only that this runs with no errors for now
reader = Corpus(output_file)
train_example = next(reader(nlp))
make_orth_variants_example(nlp, train_example, orth_variant_level=0.2)
reader = Corpus(output_file, augmenter=create_orth_variants_augmenter(level=0.2, lower=0.5))
train_examples = list(reader(nlp))
@pytest.mark.skip("Outdated")

View File

@ -1,6 +1,7 @@
from .corpus import Corpus # noqa: F401
from .example import Example, validate_examples # noqa: F401
from .align import Alignment # noqa: F401
from .augment import dont_augment, orth_variants_augmenter # noqa: F401
from .iob_utils import iob_to_biluo, biluo_to_iob # noqa: F401
from .iob_utils import offsets_to_biluo_tags, biluo_tags_to_offsets # noqa: F401
from .iob_utils import biluo_tags_to_spans, tags_to_entities # noqa: F401

View File

@ -1,30 +1,50 @@
from typing import Callable
import random
import itertools
import copy
from functools import partial
from ..util import registry
def make_orth_variants_example(nlp, example, orth_variant_level=0.0): # TODO: naming
@registry.augmenters("spacy.dont_augment.v1")
def create_null_augmenter():
return dont_augment
@registry.augmenters("spacy.orth_variants.v1")
def create_orth_variants_augmenter(level: float, lower: float) -> Callable:
"""Create a data augmentation callback that uses orth-variant replacement.
The callback can be added to a corpus or other data iterator during training.
"""
return partial(orth_variants_augmenter, level=level, lower=lower)
def dont_augment(nlp, example):
yield example
def orth_variants_augmenter(nlp, example, *, level: float = 0.0, lower: float=0.0):
if random.random() >= level:
yield example
else:
raw_text = example.text
orig_dict = example.to_dict()
if not orig_dict["token_annotation"]:
yield example
else:
variant_text, variant_token_annot = make_orth_variants(
nlp, raw_text, orig_dict["token_annotation"], orth_variant_level
nlp,
raw_text,
orig_dict["token_annotation"],
lower=raw_text is not None and random.random() < lower
)
doc = nlp.make_doc(variant_text)
orig_dict["token_annotation"] = variant_token_annot
return example.from_dict(doc, orig_dict)
yield example.from_dict(doc, orig_dict)
def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
if random.random() >= orth_variant_level:
return raw_text, orig_token_dict
if not orig_token_dict:
return raw_text, orig_token_dict
raw = raw_text
token_dict = orig_token_dict
lower = False
if random.random() >= 0.5:
lower = True
if raw is not None:
raw = raw.lower()
def make_orth_variants(nlp, raw, token_dict, *, lower: bool=False):
orig_token_dict = copy.deepcopy(token_dict)
orth_variants = nlp.vocab.lookups.get_table("orth_variants", {})
ndsv = orth_variants.get("single", [])
ndpv = orth_variants.get("paired", [])
@ -103,7 +123,7 @@ def make_orth_variants(nlp, raw_text, orig_token_dict, orth_variant_level=0.0):
# something went wrong, abort
# (add a warning message?)
if not match_found:
return raw_text, orig_token_dict
return raw, orig_token_dict
# add following whitespace
while raw_idx < len(raw) and raw[raw_idx].isspace():
variant_raw += raw[raw_idx]

View File

@ -1,9 +1,11 @@
import warnings
from typing import Union, List, Iterable, Iterator, TYPE_CHECKING, Callable
from typing import Optional
from pathlib import Path
import srsly
from .. import util
from .augment import dont_augment
from .example import Example
from ..errors import Warnings
from ..tokens import DocBin, Doc
@ -18,9 +20,19 @@ FILE_TYPE = ".spacy"
@util.registry.readers("spacy.Corpus.v1")
def create_docbin_reader(
path: Path, gold_preproc: bool, max_length: int = 0, limit: int = 0
path: Path,
gold_preproc: bool,
max_length: int = 0,
limit: int = 0,
augmenter: Optional[Callable] = None,
) -> Callable[["Language"], Iterable[Example]]:
return Corpus(path, gold_preproc=gold_preproc, max_length=max_length, limit=limit)
return Corpus(
path,
gold_preproc=gold_preproc,
max_length=max_length,
limit=limit,
augmenter=augmenter,
)
@util.registry.readers("spacy.JsonlReader.v1")
@ -70,6 +82,8 @@ class Corpus:
0, which indicates no limit.
limit (int): Limit corpus to a subset of examples, e.g. for debugging.
Defaults to 0, which indicates no limit.
augment (Callable[Example, Iterable[Example]]): Optional data augmentation
function, to extrapolate additional examples from your annotations.
DOCS: https://nightly.spacy.io/api/corpus
"""
@ -81,11 +95,13 @@ class Corpus:
limit: int = 0,
gold_preproc: bool = False,
max_length: int = 0,
augmenter: Optional[Callable] = None,
) -> None:
self.path = util.ensure_path(path)
self.gold_preproc = gold_preproc
self.max_length = max_length
self.limit = limit
self.augmenter = augmenter if augmenter is not None else dont_augment
def __call__(self, nlp: "Language") -> Iterator[Example]:
"""Yield examples from the data.
@ -100,7 +116,9 @@ class Corpus:
examples = self.make_examples_gold_preproc(nlp, ref_docs)
else:
examples = self.make_examples(nlp, ref_docs)
yield from examples
for real_eg in examples:
for augmented_eg in self.augmenter(nlp, real_eg):
yield augmented_eg
def _make_example(
self, nlp: "Language", reference: Doc, gold_preproc: bool

View File

@ -83,6 +83,7 @@ class registry(thinc.registry):
callbacks = catalogue.create("spacy", "callbacks")
batchers = catalogue.create("spacy", "batchers", entry_points=True)
readers = catalogue.create("spacy", "readers", entry_points=True)
augmenters = catalogue.create("spacy", "augmenters", entry_points=True)
loggers = catalogue.create("spacy", "loggers", entry_points=True)
# These are factories registered via third-party packages and the
# spacy_factories entry point. This registry only exists so we can easily

View File

@ -143,11 +143,10 @@ argument that connects to the shared `tok2vec` component in the pipeline.
Construct an embedding layer that separately embeds a number of lexical
attributes using hash embedding, concatenates the results, and passes it through
a feed-forward subnetwork to build a mixed representations. The features used
are the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying
definitions depending on the `Vocab` of the `Doc` object passed in. Vectors from
pretrained static vectors can also be incorporated into the concatenated
representation.
a feed-forward subnetwork to build mixed representations. The features used are
the `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`, which can have varying definitions
depending on the `Vocab` of the `Doc` object passed in. Vectors from pretrained
static vectors can also be incorporated into the concatenated representation.
| Name | Description |
| ------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
@ -170,7 +169,7 @@ representation.
> nC = 8
> ```
Construct an embedded representations based on character embeddings, using a
Construct an embedded representation based on character embeddings, using a
feed-forward network. A fixed number of UTF-8 byte characters are used for each
word, taken from the beginning and end of the word equally. Padding is used in
the center for words that are too short.
@ -392,7 +391,7 @@ a single token vector given zero or more wordpiece vectors.
> ```
Use a transformer as a [`Tok2Vec`](/api/tok2vec) layer directly. This does
**not** allow multiple components to share the transformer weights, and does
**not** allow multiple components to share the transformer weights and does
**not** allow the transformer to set annotations into the [`Doc`](/api/doc)
object, but it's a **simpler solution** if you only need the transformer within
one component.
@ -437,7 +436,7 @@ might find [this tutorial](https://explosion.ai/blog/parsing-english-in-python)
helpful for background information. The neural network state prediction model
consists of either two or three subnetworks:
- **tok2vec**: Map each token into a vector representations. This subnetwork is
- **tok2vec**: Map each token into a vector representation. This subnetwork is
run once for each batch.
- **lower**: Construct a feature-specific vector for each `(token, feature)`
pair. This is also run once for each batch. Constructing the state
@ -575,14 +574,14 @@ architecture is usually less accurate than the ensemble, but runs faster.
> nO = null
> ```
An ngram "bag-of-words" model. This architecture should run much faster than the
others, but may not be as accurate, especially if texts are short.
An n-gram "bag-of-words" model. This architecture should run much faster than
the others, but may not be as accurate, especially if texts are short.
| Name | Description |
| ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `exclusive_classes` | Whether or not categories are mutually exclusive. ~~bool~~ |
| `ngram_size` | Determines the maximum length of the n-grams in the BOW model. For instance, `ngram_size=3` would give unigram, trigram and bigram features. ~~int~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`. ~~bool~~ |
| `no_output_layer` | Whether or not to add an output layer to the model (`Softmax` activation if `exclusive_classes` is `True`, else `Logistic`). ~~bool~~ |
| `nO` | Output dimension, determined by the number of different labels. If not set, the [`TextCategorizer`](/api/textcategorizer) component will set it when `initialize` is called. ~~Optional[int]~~ |
| **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ |
@ -596,7 +595,7 @@ into the "real world". This requires 3 main components:
synonyms and prior probabilities.
- A candidate generation step to produce a set of likely identifiers, given a
certain textual mention.
- A Machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
- A machine learning [`Model`](https://thinc.ai/docs/api-model) that picks the
most plausible ID from the set of candidates.
### spacy.EntityLinker.v1 {#EntityLinker}

View File

@ -71,7 +71,7 @@ pattern_dicts = [
## AttributeRuler.\_\_call\_\_ {#call tag="method"}
Apply the attribute ruler to a Doc, setting token attributes for tokens matched
Apply the attribute ruler to a `Doc`, setting token attributes for tokens matched
by the provided patterns.
| Name | Description |
@ -256,6 +256,6 @@ serialization by passing in the string names via the `exclude` argument.
| Name | Description |
| ---------- | -------------------------------------------------------------- |
| `vocab` | The shared [`Vocab`](/api/vocab). |
| `patterns` | The Matcher patterns. You usually don't want to exclude this. |
| `patterns` | The `Matcher` patterns. You usually don't want to exclude this. |
| `attrs` | The attributes to set. You usually don't want to exclude this. |
| `indices` | The token indices. You usually don't want to exclude this. |

View File

@ -81,7 +81,7 @@ $ python -m spacy info [model] [--markdown] [--silent]
Find all trained pipeline packages installed in the current environment and
check whether they are compatible with the currently installed version of spaCy.
Should be run after upgrading spaCy via `pip install -U spacy` to ensure that
all installed packages are can be used with the new version. It will show a list
all installed packages can be used with the new version. It will show a list
of packages and their installed versions. If any package is out of date, the
latest compatible versions and command for updating are shown.
@ -408,7 +408,7 @@ File /path/to/thinc/thinc/schedules.py (line 91)
### debug data {#debug-data tag="command"}
Analyze, debug, and validate your training and development data. Get useful
Analyze, debug and validate your training and development data. Get useful
stats, and find problems like invalid entity annotations, cyclic dependencies,
low data labels and more.

View File

@ -74,6 +74,7 @@ train/test skew.
|  `gold_preproc` | Whether to set up the Example object with gold-standard sentences and tokens for the predictions. Defaults to `False`. ~~bool~~ |
| `max_length` | Maximum document length. Longer documents will be split into sentences, if sentence boundaries are available. Defaults to `0` for no limit. ~~int~~ |
| `limit` | Limit corpus to a subset of examples, e.g. for debugging. Defaults to `0` for no limit. ~~int~~ |
| `augmenter` | Optional data augmentation callback. ~~Callable[[Language, Example], Iterable[Example]]~~
## Corpus.\_\_call\_\_ {#call tag="method"}

View File

@ -274,7 +274,7 @@ Typically, the extension for these binary files is `.spacy`, and they are used
as input format for specifying a [training corpus](/api/corpus) and for spaCy's
CLI [`train`](/api/cli#train) command. The built-in
[`convert`](/api/cli#convert) command helps you convert spaCy's previous
[JSON format](#json-input) to the new binary format format. It also supports
[JSON format](#json-input) to the new binary format. It also supports
conversion of the `.conllu` format used by the
[Universal Dependencies corpora](https://github.com/UniversalDependencies).
@ -338,7 +338,7 @@ $ python -m spacy convert ./data.json ./output.spacy
<Accordion title="Sample JSON data" spaced>
Here's an example of dependencies, part-of-speech tags and names entities, taken
Here's an example of dependencies, part-of-speech tags and named entities, taken
from the English Wall Street Journal portion of the Penn Treebank:
```json

View File

@ -21,8 +21,7 @@ non-projective parses.
The parser is trained using an **imitation learning objective**. It follows the
actions predicted by the current weights, and at each state, determines which
actions are compatible with the optimal parse that could be reached from the
current state. The weights such that the scores assigned to the set of optimal
actions is increased, while scores assigned to other actions are decreased. Note
current state. The weights are updated such that the scores assigned to the set of optimal actions is increased, while scores assigned to other actions are decreased. Note
that more than one action may be optimal for a given state.
## Config and implementation {#config}

View File

@ -503,8 +503,7 @@ invalidated, although they may accidentally continue to work.
Mark a span for merging. The `attrs` will be applied to the resulting token (if
they're context-dependent token attributes like `LEMMA` or `DEP`) or to the
underlying lexeme (if they're context-independent lexical attributes like
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided as a
dictionary mapping attribute names to values as the `"_"` key.
`LOWER` or `IS_STOP`). Writable custom extension attributes can be provided using the `"_"` key and specifying a dictionary that maps attribute names to values.
> #### Example
>

View File

@ -94,7 +94,7 @@ providing custom registered functions.
## EntityLinker.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
Apply the pipe to one document. The document is modified in place and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/entitylinker#call) and [`pipe`](/api/entitylinker#pipe)

View File

@ -43,7 +43,7 @@ architectures and their arguments and hyperparameters.
| Setting | Description |
| ----------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]] |
| `moves` | A list of transition names. Inferred from the data if not provided. Defaults to `None`. ~~Optional[List[str]]~~ |
| `update_with_oracle_cut_size` | During training, cut long sequences into shorter segments by creating intermediate states based on the gold-standard history. The model is not very sensitive to this parameter, so you usually won't need to change it. Defaults to `100`. ~~int~~ |
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [TransitionBasedParser](/api/architectures#TransitionBasedParser). ~~Model[List[Doc], List[Floats2d]]~~ |
@ -83,7 +83,7 @@ shortcut for this and instantiate the component using its string name and
## EntityRecognizer.\_\_call\_\_ {#call tag="method"}
Apply the pipe to one document. The document is modified in place, and returned.
Apply the pipe to one document. The document is modified in place and returned.
This usually happens under the hood when the `nlp` object is called on a text
and all pipeline components are applied to the `Doc` in order. Both
[`__call__`](/api/entityrecognizer#call) and

View File

@ -256,6 +256,6 @@ Get all patterns that were added to the entity ruler.
| Name | Description |
| ----------------- | --------------------------------------------------------------------------------------------------------------------- |
| `matcher` | The underlying matcher used to process token patterns. ~~Matcher~~ |
| `phrase_matcher` | The underlying phrase matcher, used to process phrase patterns. ~~PhraseMatcher~~ |
| `phrase_matcher` | The underlying phrase matcher used to process phrase patterns. ~~PhraseMatcher~~ |
| `token_patterns` | The token patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Dict[str, Union[str, List[dict]]]]~~ |
| `phrase_patterns` | The phrase patterns present in the entity ruler, keyed by label. ~~Dict[str, List[Doc]]~~ |

View File

@ -46,10 +46,11 @@ information in [`Language.meta`](/api/language#meta) and not to configure the
## Language.from_config {#from_config tag="classmethod" new="3"}
Create a `Language` object from a loaded config. Will set up the tokenizer and
language data, add pipeline components based on the pipeline and components
define in the config and validate the results. If no config is provided, the
default config of the given language is used. This is also how spaCy loads a
model under the hood based on its [`config.cfg`](/api/data-formats#config).
language data, add pipeline components based on the pipeline and add pipeline
components based on the definitions specified in the config. If no config is
provided, the default config of the given language is used. This is also how
spaCy loads a model under the hood based on its
[`config.cfg`](/api/data-formats#config).
> #### Example
>
@ -107,7 +108,7 @@ decorator. For more details and examples, see the
| `assigns` | `Doc` or `Token` attributes assigned by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[Doc], Doc]]~~ |
## Language.factory {#factory tag="classmethod"}
@ -154,7 +155,7 @@ examples, see the
| `requires` | `Doc` or `Token` attributes required by this component, e.g. `["token.ent_id"]`. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~Iterable[str]~~ |
| `retokenizes` | Whether the component changes tokenization. Used for [pipe analysis](/usage/processing-pipelines#analysis). ~~bool~~ |
| `default_score_weights` | The scores to report during training, and their default weight towards the final score used to select the best model. Weights should sum to `1.0` per component and will be combined and normalized for the whole pipeline. If a weight is set to `None`, the score will not be logged or weighted. ~~Dict[str, Optional[float]]~~ |
| `func` | Optional function if not used a a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
| `func` | Optional function if not used as a decorator. ~~Optional[Callable[[...], Callable[[Doc], Doc]]]~~ |
## Language.\_\_call\_\_ {#call tag="method"}
@ -609,7 +610,7 @@ does nothing.
## Language.enable_pipe {#enable_pipe tag="method" new="3"}
Enable a previously disable component (e.g. via
Enable a previously disabled component (e.g. via
[`Language.disable_pipes`](/api/language#disable_pipes)) so it's run as part of
the pipeline, [`nlp.pipeline`](/api/language#pipeline). If the component is
already enabled, this method does nothing.
@ -636,7 +637,7 @@ pipeline will be restored to the initial state at the end of the block.
Otherwise, a `DisabledPipes` object is returned, that has a `.restore()` method
you can use to undo your changes. You can specify either `disable` (as a list or
string), or `enable`. In the latter case, all components not in the `enable`
list, will be disabled. Under the hood, this method calls into
list will be disabled. Under the hood, this method calls into
[`disable_pipe`](/api/language#disable_pipe) and
[`enable_pipe`](/api/language#enable_pipe).
@ -669,7 +670,7 @@ As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
| -------------- | ------------------------------------------------------------------------------------------------------ |
| _keyword-only_ | |
| `disable` | Name(s) of pipeline components to disable. ~~Optional[Union[str, Iterable[str]]]~~ |
| `enable` | Names(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
| `enable` | Name(s) of pipeline components that will not be disabled. ~~Optional[Union[str, Iterable[str]]]~~ |
| **RETURNS** | The disabled pipes that can be restored by calling the object's `.restore()` method. ~~DisabledPipes~~ |
## Language.get_factory_meta {#get_factory_meta tag="classmethod" new="3"}
@ -881,10 +882,10 @@ Loads state from a directory, including all data that was saved with the
<Infobox variant="warning" title="Important note">
Keep in mind that this method **only loads serialized state** and doesn't set up
the `nlp` object. This means that it requires the correct language class to be
initialized and all pipeline components to be added to the pipeline. If you want
to load a serialized pipeline from a directory, you should use
Keep in mind that this method **only loads the serialized state** and doesn't
set up the `nlp` object. This means that it requires the correct language class
to be initialized and all pipeline components to be added to the pipeline. If
you want to load a serialized pipeline from a directory, you should use
[`spacy.load`](/api/top-level#spacy.load), which will set everything up for you.
</Infobox>

View File

@ -38,7 +38,7 @@ The default config is defined by the pipeline component factory and describes
how the component should be configured. You can override its settings via the
`config` argument on [`nlp.add_pipe`](/api/language#add_pipe) or in your
[`config.cfg` for training](/usage/training#config). For examples of the lookups
data formats used by the lookup and rule-based lemmatizers, see
data format used by the lookup and rule-based lemmatizers, see
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data).
> #### Example

View File

@ -61,7 +61,7 @@ matched:
| `!` | Negate the pattern, by requiring it to match exactly 0 times. |
| `?` | Make the pattern optional, by allowing it to match 0 or 1 times. |
| `+` | Require the pattern to match 1 or more times. |
| `*` | Allow the pattern to match zero or more times. |
| `*` | Allow the pattern to match 0 or more times. |
Token patterns can also map to a **dictionary of properties** instead of a
single value to indicate whether the expected value is a member of a list or how

View File

@ -12,7 +12,7 @@ container storing a single morphological analysis.
## Morphology.\_\_init\_\_ {#init tag="method"}
Create a Morphology object.
Create a `Morphology` object.
> #### Example
>
@ -101,7 +101,7 @@ representation.
| Name | Description |
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `feats_dict` | The morphological features as a dictionary. ~~Dict[str, str]~~ |
| **RETURNS** | The morphological features as in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
| **RETURNS** | The morphological features in Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
## Attributes {#attributes}

View File

@ -200,7 +200,7 @@ Delegates to [`predict`](/api/sentencerecognizer#predict) and
## SentenceRecognizer.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
current model to make predictions similar to an initial model to try to address
the "catastrophic forgetting" problem. This feature is experimental.
> #### Example

View File

@ -8,7 +8,7 @@ api_string_name: sentencizer
api_trainable: false
---
A simple pipeline component, to allow custom sentence boundary detection logic
A simple pipeline component to allow custom sentence boundary detection logic
that doesn't require the dependency parse. By default, sentence segmentation is
performed by the [`DependencyParser`](/api/dependencyparser), so the
`Sentencizer` lets you implement a simpler, rule-based strategy that doesn't
@ -130,7 +130,7 @@ Score a batch of examples.
## Sentencizer.to_disk {#to_disk tag="method"}
Save the sentencizer settings (punctuation characters) a directory. Will create
Save the sentencizer settings (punctuation characters) to a directory. Will create
a file `sentencizer.json`. This also happens automatically when you save an
`nlp` object with a sentencizer added to its pipeline.

View File

@ -8,7 +8,7 @@ A slice from a [`Doc`](/api/doc) object.
## Span.\_\_init\_\_ {#init tag="method"}
Create a Span object from the slice `doc[start : end]`.
Create a `Span` object from the slice `doc[start : end]`.
> #### Example
>
@ -187,7 +187,7 @@ the character indices don't map to a valid span.
| Name | Description |
| ------------------------------------ | ----------------------------------------------------------------------------------------- |
| `start` | The index of the first character of the span. ~~int~~ |
| `end` | The index of the last character after the span. ~int~~ |
| `end` | The index of the last character after the span. ~~int~~ |
| `label` | A label to attach to the span, e.g. for named entities. ~~Union[int, str]~~ |
| `kb_id` <Tag variant="new">2.2</Tag> | An ID from a knowledge base to capture the meaning of a named entity. ~~Union[int, str]~~ |
| `vector` | A meaning representation of the span. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |

View File

@ -157,7 +157,7 @@ This method was previously called `begin_training`.
## TextCategorizer.predict {#predict tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them.
> #### Example
@ -174,7 +174,7 @@ modifying them.
## TextCategorizer.set_annotations {#set_annotations tag="method"}
Modify a batch of [`Doc`](/api/doc) objects, using pre-computed scores.
Modify a batch of [`Doc`](/api/doc) objects using pre-computed scores.
> #### Example
>
@ -217,7 +217,7 @@ Delegates to [`predict`](/api/textcategorizer#predict) and
## TextCategorizer.rehearse {#rehearse tag="method,experimental" new="3"}
Perform a "rehearsal" update from a batch of data. Rehearsal updates teach the
current model to make predictions similar to an initial model, to try to address
current model to make predictions similar to an initial model to try to address
the "catastrophic forgetting" problem. This feature is experimental.
> #### Example
@ -290,7 +290,7 @@ Create an optimizer for the pipeline component.
## TextCategorizer.use_params {#use_params tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values.
Modify the pipe's model to use the given parameter values.
> #### Example
>

View File

@ -150,7 +150,7 @@ by [`Language.initialize`](/api/language#initialize).
## Tok2Vec.predict {#predict tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them.
> #### Example
@ -223,7 +223,7 @@ Create an optimizer for the pipeline component.
## Tok2Vec.use_params {#use_params tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
Modify the pipe's model to use the given parameter values. At the end of the
context, the original parameters are restored.
> #### Example

View File

@ -243,7 +243,7 @@ A sequence of the token's immediate syntactic children.
## Token.lefts {#lefts tag="property" model="parser"}
The leftward immediate children of the word, in the syntactic dependency parse.
The leftward immediate children of the word in the syntactic dependency parse.
> #### Example
>
@ -259,7 +259,7 @@ The leftward immediate children of the word, in the syntactic dependency parse.
## Token.rights {#rights tag="property" model="parser"}
The rightward immediate children of the word, in the syntactic dependency parse.
The rightward immediate children of the word in the syntactic dependency parse.
> #### Example
>
@ -275,7 +275,7 @@ The rightward immediate children of the word, in the syntactic dependency parse.
## Token.n_lefts {#n_lefts tag="property" model="parser"}
The number of leftward immediate children of the word, in the syntactic
The number of leftward immediate children of the word in the syntactic
dependency parse.
> #### Example
@ -291,7 +291,7 @@ dependency parse.
## Token.n_rights {#n_rights tag="property" model="parser"}
The number of rightward immediate children of the word, in the syntactic
The number of rightward immediate children of the word in the syntactic
dependency parse.
> #### Example
@ -422,8 +422,8 @@ The L2 norm of the token's vector representation.
| `norm_` | The token's norm, i.e. a normalized form of the token text. Can be set in the language's [tokenizer exceptions](/usage/adding-languages#tokenizer-exceptions). ~~str~~ |
| `lower` | Lowercase form of the token. ~~int~~ |
| `lower_` | Lowercase form of the token text. Equivalent to `Token.text.lower()`. ~~str~~ |
| `shape` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
| `shape_` | Transform of the tokens's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
| `shape` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~int~~ |
| `shape_` | Transform of the tokens's string to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. ~~str~~ |
| `prefix` | Hash value of a length-N substring from the start of the token. Defaults to `N=1`. ~~int~~ |
| `prefix_` | A length-N substring from the start of the token. Defaults to `N=1`. ~~str~~ |
| `suffix` | Hash value of a length-N substring from the end of the token. Defaults to `N=3`. ~~int~~ |
@ -451,7 +451,7 @@ The L2 norm of the token's vector representation.
| `tag` | Fine-grained part-of-speech. ~~int~~ |
| `tag_` | Fine-grained part-of-speech. ~~str~~ |
| `morph` <Tag variant="new">3</Tag> | Morphological analysis. ~~MorphAnalysis~~ |
| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS]https://universaldependencies.org/format.html#morphological-annotation format. ~~str~~ |
| `morph_` <Tag variant="new">3</Tag> | Morphological analysis in the Universal Dependencies [FEATS](https://universaldependencies.org/format.html#morphological-annotation) format. ~~str~~ |
| `dep` | Syntactic dependency relation. ~~int~~ |
| `dep_` | Syntactic dependency relation. ~~str~~ |
| `lang` | Language of the parent document's vocabulary. ~~int~~ |

View File

@ -1,6 +1,6 @@
---
title: Tokenizer
teaser: Segment text into words, punctuations marks etc.
teaser: Segment text into words, punctuations marks, etc.
tag: class
source: spacy/tokenizer.pyx
---
@ -15,14 +15,14 @@ source: spacy/tokenizer.pyx
Segment text, and create `Doc` objects with the discovered segment boundaries.
For a deeper understanding, see the docs on
[how spaCy's tokenizer works](/usage/linguistic-features#how-tokenizer-works).
The tokenizer is typically created automatically when the a
The tokenizer is typically created automatically when a
[`Language`](/api/language) subclass is initialized and it reads its settings
like punctuation and special case rules from the
[`Language.Defaults`](/api/language#defaults) provided by the language subclass.
## Tokenizer.\_\_init\_\_ {#init tag="method"}
Create a `Tokenizer`, to create `Doc` objects given unicode text. For examples
Create a `Tokenizer` to create `Doc` objects given unicode text. For examples
of how to construct a custom tokenizer with different tokenization rules, see
the
[usage documentation](https://spacy.io/usage/linguistic-features#native-tokenizers).
@ -87,7 +87,7 @@ Tokenize a stream of texts.
| ------------ | ------------------------------------------------------------------------------------ |
| `texts` | A sequence of unicode texts. ~~Iterable[str]~~ |
| `batch_size` | The number of texts to accumulate in an internal buffer. Defaults to `1000`. ~~int~~ |
| **YIELDS** | The tokenized Doc objects, in order. ~~Doc~~ |
| **YIELDS** | The tokenized `Doc` objects, in order. ~~Doc~~ |
## Tokenizer.find_infix {#find_infix tag="method"}

View File

@ -198,7 +198,7 @@ browser. Will run a simple web server.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `port` | Port to serve visualization. Defaults to `5000`. ~~int~~ |
| `host` | Host to serve visualization. Defaults to `"0.0.0.0"`. ~~str~~ |
@ -223,7 +223,7 @@ Render a dependency parse tree or named entity visualization.
| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
| `jupyter` | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None` (default). ~~Optional[bool]~~ |
| **RETURNS** | The rendered HTML markup. ~~str~~ |
@ -244,7 +244,7 @@ If a setting is not present in the options, the default value will be used.
| Name | Description |
| ------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- |
| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemma's in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ |
| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ |
| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ |
| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ |
@ -623,7 +623,7 @@ sequences in the batch.
Encode labelled spans into per-token tags, using the
[BILUO scheme](/usage/linguistic-features#accessing-ner) (Begin, In, Last, Unit,
Out). Returns a list of strings, describing the tags. Each tag string will be of
Out). Returns a list of strings, describing the tags. Each tag string will be in
the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of
`"B"`, `"I"`, `"L"`, `"U"`. The string `"-"` is used where the entity offsets
don't align with the tokenization in the `Doc` object. The training algorithm
@ -747,7 +747,7 @@ decorator.
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
Check whether a `Language` subclass is already loaded. `Language` subclasses are
loaded lazily, to avoid expensive setup code associated with the language data.
loaded lazily to avoid expensive setup code associated with the language data.
> #### Example
>
@ -935,7 +935,7 @@ Compile a sequence of prefix rules into a regex object.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The prefix rules, e.g. [`lang.punctuation.TOKENIZER_PREFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.prefix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_suffix_regex {#util.compile_suffix_regex tag="function"}
@ -952,7 +952,7 @@ Compile a sequence of suffix rules into a regex object.
| Name | Description |
| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The suffix rules, e.g. [`lang.punctuation.TOKENIZER_SUFFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.suffix_search`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.compile_infix_regex {#util.compile_infix_regex tag="function"}
@ -969,7 +969,7 @@ Compile a sequence of infix rules into a regex object.
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
| `entries` | The infix rules, e.g. [`lang.punctuation.TOKENIZER_INFIXES`](%%GITHUB_SPACY/spacy/lang/punctuation.py). ~~Iterable[Union[str, Pattern]]~~ |
| **RETURNS** | The regex object. to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
| **RETURNS** | The regex object to be used for [`Tokenizer.infix_finditer`](/api/tokenizer#attributes). ~~Pattern~~ |
### util.minibatch {#util.minibatch tag="function" new="2"}

View File

@ -185,7 +185,7 @@ by [`Language.initialize`](/api/language#initialize).
## Transformer.predict {#predict tag="method"}
Apply the component's model to a batch of [`Doc`](/api/doc) objects, without
Apply the component's model to a batch of [`Doc`](/api/doc) objects without
modifying them.
> #### Example
@ -202,7 +202,7 @@ modifying them.
## Transformer.set_annotations {#set_annotations tag="method"}
Assign the extracted features to the Doc objects. By default, the
Assign the extracted features to the `Doc` objects. By default, the
[`TransformerData`](/api/transformer#transformerdata) object is written to the
[`Doc._.trf_data`](#custom-attributes) attribute. Your `set_extra_annotations`
callback is then called, if provided.
@ -271,7 +271,7 @@ Create an optimizer for the pipeline component.
## Transformer.use_params {#use_params tag="method, contextmanager"}
Modify the pipe's model, to use the given parameter values. At the end of the
Modify the pipe's model to use the given parameter values. At the end of the
context, the original parameters are restored.
> #### Example
@ -387,8 +387,8 @@ by this class. Instances of this class are typically assigned to the
| Name | Description |
| --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts, and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
| `tensors` | The activations for the Doc from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
| `tokens` | A slice of the tokens data produced by the tokenizer. This may have several fields, including the token IDs, the texts and the attention mask. See the [`transformers.BatchEncoding`](https://huggingface.co/transformers/main_classes/tokenizer.html#transformers.BatchEncoding) object for details. ~~dict~~ |
| `tensors` | The activations for the `Doc` from the transformer. Usually the last tensor that is 3-dimensional will be the most important, as that will provide the final hidden state. Generally activations that are 2-dimensional will be attention weights. Details of this variable will differ depending on the underlying transformer model. ~~List[FloatsXd]~~ |
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
| `width` | The width of the last hidden layer. ~~int~~ |
@ -408,7 +408,7 @@ objects to associate the outputs to each [`Doc`](/api/doc) in the batch.
| Name | Description |
| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each Span can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each Span may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
| `spans` | The batch of input spans. The outer list refers to the Doc objects in the batch, and the inner list are the spans for that `Doc`. Note that spans are allowed to overlap or exclude tokens, but each `Span` can only refer to one `Doc` (by definition). This means that within a `Doc`, the regions of the output tensors that correspond to each `Span` may overlap or have gaps, but for each `Doc`, there is a non-overlapping contiguous slice of the outputs. ~~List[List[Span]]~~ |
| `tokens` | The output of the tokenizer. ~~transformers.BatchEncoding~~ |
| `tensors` | The output of the transformer model. ~~List[torch.Tensor]~~ |
| `align` | Alignment from the spaCy tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
@ -438,10 +438,10 @@ Split a `TransformerData` object that represents a batch into a list with one
## Span getters {#span_getters source="github.com/explosion/spacy-transformers/blob/master/spacy_transformers/span_getters.py"}
Span getters are functions that take a batch of [`Doc`](/api/doc) objects and
return a lists of [`Span`](/api/span) objects for each doc, to be processed by
the transformer. This is used to manage long documents, by cutting them into
return a lists of [`Span`](/api/span) objects for each doc to be processed by
the transformer. This is used to manage long documents by cutting them into
smaller sequences before running the transformer. The spans are allowed to
overlap, and you can also omit sections of the Doc if they are not relevant.
overlap, and you can also omit sections of the `Doc` if they are not relevant.
Span getters can be referenced in the `[components.transformer.model.get_spans]`
block of the config to customize the sequences processed by the transformer. You

View File

@ -290,7 +290,7 @@ If a table is full, it can be resized using
## Vectors.n_keys {#n_keys tag="property"}
Get the number of keys in the table. Note that this is the number of _all_ keys,
not just unique vectors. If several keys are mapped are mapped to the same
not just unique vectors. If several keys are mapped to the same
vectors, they will be counted individually.
> #### Example
@ -307,10 +307,10 @@ vectors, they will be counted individually.
## Vectors.most_similar {#most_similar tag="method"}
For each of the given vectors, find the `n` most similar entries to it, by
For each of the given vectors, find the `n` most similar entries to it by
cosine. Queries are by vector. Results are returned as a
`(keys, best_rows, scores)` tuple. If `queries` is large, the calculations are
performed in chunks, to avoid consuming too much memory. You can set the
performed in chunks to avoid consuming too much memory. You can set the
`batch_size` to control the size/space trade-off during the calculations.
> #### Example

View File

@ -29,7 +29,7 @@ Create the vocabulary.
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` <Tag variant="new">2.2</Tag> | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases, used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Span]]]]~~ |
## Vocab.\_\_len\_\_ {#len tag="method"}
@ -150,7 +150,7 @@ rows, we would discard the vectors for "feline" and "reclined". These words
would then be remapped to the closest remaining vector so "feline" would have
the same vector as "cat", and "reclined" would have the same vector as "sat".
The similarities are judged by cosine. The original vectors may be large, so the
cosines are calculated in minibatches, to reduce memory usage.
cosines are calculated in minibatches to reduce memory usage.
> #### Example
>
@ -170,7 +170,7 @@ cosines are calculated in minibatches, to reduce memory usage.
Retrieve a vector for a word in the vocabulary. Words can be looked up by string
or hash value. If no vectors data is loaded, a `ValueError` is raised. If `minn`
is defined, then the resulting vector uses [FastText](https://fasttext.cc/)'s
subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
subword features by average over n-grams of `orth` (introduced in spaCy `v2.1`).
> #### Example
>
@ -182,13 +182,13 @@ subword features by average over ngrams of `orth` (introduced in spaCy `v2.1`).
| Name | Description |
| ----------------------------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `orth` | The hash value of a word, or its unicode string. ~~Union[int, str]~~ |
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's ngram computation. Defaults to the length of `orth`. ~~int~~ |
| `minn` <Tag variant="new">2.1</Tag> | Minimum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| `maxn` <Tag variant="new">2.1</Tag> | Maximum n-gram length used for FastText's n-gram computation. Defaults to the length of `orth`. ~~int~~ |
| **RETURNS** | A word vector. Size and shape are determined by the `Vocab.vectors` instance. ~~numpy.ndarray[ndim=1, dtype=float32]~~ |
## Vocab.set_vector {#set_vector tag="method" new="2"}
Set a vector for a word in the vocabulary. Words can be referenced by by string
Set a vector for a word in the vocabulary. Words can be referenced by string
or hash value.
> #### Example

View File

@ -41,8 +41,8 @@ transformers is that word vectors model **lexical types**, rather than _tokens_.
If you have a list of terms with no context around them, a transformer model
like BERT can't really help you. BERT is designed to understand language **in
context**, which isn't what you have. A word vectors table will be a much better
fit for your task. However, if you do have words in context whole sentences or
paragraphs of running text word vectors will only provide a very rough
fit for your task. However, if you do have words in context whole sentences or
paragraphs of running text word vectors will only provide a very rough
approximation of what the text is about.
Word vectors are also very computationally efficient, as they map a word to a
@ -256,7 +256,7 @@ for doc in nlp.pipe(["some text", "some other text"]):
```
You can also customize how the [`Transformer`](/api/transformer) component sets
annotations onto the [`Doc`](/api/doc), by specifying a custom
annotations onto the [`Doc`](/api/doc) by specifying a custom
`set_extra_annotations` function. This callback will be called with the raw
input and output data for the whole batch, along with the batch of `Doc`
objects, allowing you to implement whatever you need. The annotation setter is
@ -675,7 +675,7 @@ given you a 10% error reduction, pretraining with spaCy might give you another
The [`spacy pretrain`](/api/cli#pretrain) command will take a **specific
subnetwork** within one of your components, and add additional layers to build a
network for a temporary task, that forces the model to learn something about
network for a temporary task that forces the model to learn something about
sentence structure and word cooccurrence statistics. Pretraining produces a
**binary weights file** that can be loaded back in at the start of training. The
weights file specifies an initial set of weights. Training then proceeds as

View File

@ -6,6 +6,7 @@ menu:
- ['Introduction', 'basics']
- ['Quickstart', 'quickstart']
- ['Config System', 'config']
<!-- - ['Data Utilities', 'data'] -->
- ['Custom Functions', 'custom-functions']
- ['Parallel Training', 'parallel-training']
- ['Internal API', 'api']
@ -505,6 +506,16 @@ still look good.
</Accordion>
<!--
## Data Utilities {#data-utilities}
* spacy convert
* The [corpora] block
* Custom corpus class
* Minibatching
* Data augmentation
-->
## Custom Functions {#custom-functions}
Registered functions in the training config files can refer to built-in
@ -689,7 +700,7 @@ from pathlib import Path
@spacy.registry.loggers("my_custom_logger.v1")
def custom_logger(log_path):
def setup_logger(nlp: "Language") -> Tuple[Callable, Callable]:
with Path(log_path).open("w") as file_:
with Path(log_path).open("w", encoding="utf8") as file_:
file_.write("step\\t")
file_.write("score\\t")
for pipe in nlp.pipe_names:

View File

@ -1256,7 +1256,7 @@ def main(template_path, output=None, data_path=None):
data_str = f"export const DATA = {data}"
result = compiler.get_output()
if output is not None:
with output.open("w") as f:
with output.open("w", encoding="utf8") as f:
f.write(f"{header}\n{result}\n{data_str}")
print(f"Updated {output.parts[-1]}")
else: