mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-10 19:57:17 +03:00
Fix typos in docs (#13466)
* fix typos * prettier formatting --------- Co-authored-by: svlandeg <svlandeg@github.com>
This commit is contained in:
parent
74836524e3
commit
045cd43c3f
|
@ -39,7 +39,7 @@ def find_threshold_cli(
|
||||||
# fmt: on
|
# fmt: on
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
Runs prediction trials for a trained model with varying thresholds to maximize
|
||||||
the specified metric. The search space for the threshold is traversed linearly
|
the specified metric. The search space for the threshold is traversed linearly
|
||||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||||
|
@ -81,7 +81,7 @@ def find_threshold(
|
||||||
silent: bool = True,
|
silent: bool = True,
|
||||||
) -> Tuple[float, float, Dict[float, float]]:
|
) -> Tuple[float, float, Dict[float, float]]:
|
||||||
"""
|
"""
|
||||||
Runs prediction trials for models with varying tresholds to maximize the specified metric.
|
Runs prediction trials for models with varying thresholds to maximize the specified metric.
|
||||||
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
|
model (Union[str, Path]): Pipeline to evaluate. Can be a package or a path to a data directory.
|
||||||
data_path (Path): Path to file with DocBin with docs to use for threshold search.
|
data_path (Path): Path to file with DocBin with docs to use for threshold search.
|
||||||
pipe_name (str): Name of pipe to examine thresholds for.
|
pipe_name (str): Name of pipe to examine thresholds for.
|
||||||
|
|
|
@ -329,7 +329,7 @@ def test_language_pipe_error_handler(n_process):
|
||||||
nlp.set_error_handler(raise_error)
|
nlp.set_error_handler(raise_error)
|
||||||
with pytest.raises(ValueError):
|
with pytest.raises(ValueError):
|
||||||
list(nlp.pipe(texts, n_process=n_process))
|
list(nlp.pipe(texts, n_process=n_process))
|
||||||
# set explicitely to ignoring
|
# set explicitly to ignoring
|
||||||
nlp.set_error_handler(ignore_error)
|
nlp.set_error_handler(ignore_error)
|
||||||
docs = list(nlp.pipe(texts, n_process=n_process))
|
docs = list(nlp.pipe(texts, n_process=n_process))
|
||||||
assert len(docs) == 0
|
assert len(docs) == 0
|
||||||
|
|
|
@ -46,10 +46,10 @@ as `Token.attr`, e.g. `token.dep`, while the string value can be retrieved by
|
||||||
appending `_` as in `token.dep_`.
|
appending `_` as in `token.dep_`.
|
||||||
|
|
||||||
| Attribute | Description |
|
| Attribute | Description |
|
||||||
| ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `DEP` | The token's dependency label. ~~str~~ |
|
| `DEP` | The token's dependency label. ~~str~~ |
|
||||||
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
| `ENT_ID` | The token's entity ID (`ent_id`). ~~str~~ |
|
||||||
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer vaues rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
|
| `ENT_IOB` | The IOB part of the token's entity tag. Uses custom integer values rather than the string store: unset is `0`, `I` is `1`, `O` is `2`, and `B` is `3`. ~~str~~ |
|
||||||
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
|
| `ENT_KB_ID` | The token's entity knowledge base ID. ~~str~~ |
|
||||||
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
| `ENT_TYPE` | The token's entity label. ~~str~~ |
|
||||||
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
|
| `IS_ALPHA` | Token text consists of alphabetic characters. ~~bool~~ |
|
||||||
|
|
|
@ -567,7 +567,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
|
||||||
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
||||||
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
||||||
✔ Good amount of examples for all labels
|
✔ Good amount of examples for all labels
|
||||||
✔ Examples without occurences available for all labels
|
✔ Examples without occurrences available for all labels
|
||||||
✔ No entities consisting of or starting/ending with whitespace
|
✔ No entities consisting of or starting/ending with whitespace
|
||||||
|
|
||||||
=========================== Part-of-speech Tagging ===========================
|
=========================== Part-of-speech Tagging ===========================
|
||||||
|
@ -1320,7 +1320,7 @@ $ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key]
|
||||||
|
|
||||||
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
## find-threshold {id="find-threshold",version="3.5",tag="command"}
|
||||||
|
|
||||||
Runs prediction trials for a trained model with varying tresholds to maximize
|
Runs prediction trials for a trained model with varying thresholds to maximize
|
||||||
the specified metric. The search space for the threshold is traversed linearly
|
the specified metric. The search space for the threshold is traversed linearly
|
||||||
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
from 0 to 1 in `n_trials` steps. Results are displayed in a table on `stdout`
|
||||||
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
(the corresponding API call to `spacy.cli.find_threshold.find_threshold()`
|
||||||
|
|
|
@ -67,7 +67,7 @@ architectures and their arguments and hyperparameters.
|
||||||
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
| `generate_empty_kb` <Tag variant="new">3.5.1</Tag> | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). ~~Callable[[Vocab, int], KnowledgeBase]~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
```python
|
```python
|
||||||
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
%%GITHUB_SPACY/spacy/pipeline/entity_linker.py
|
||||||
|
@ -101,7 +101,7 @@ custom knowledge base, you should either call
|
||||||
[`initialize`](/api/entitylinker#initialize) call.
|
[`initialize`](/api/entitylinker#initialize) call.
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
| ---------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
| `vocab` | The shared vocabulary. ~~Vocab~~ |
|
||||||
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. ~~Model~~ |
|
||||||
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
|
||||||
|
@ -114,7 +114,7 @@ custom knowledge base, you should either call
|
||||||
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
| `incl_context` | Whether or not to include the local context in the model. ~~bool~~ |
|
||||||
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
| `overwrite` <Tag variant="new">3.2</Tag> | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ |
|
||||||
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
| `scorer` <Tag variant="new">3.2</Tag> | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ |
|
||||||
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
| `threshold` <Tag variant="new">3.4</Tag> | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the threshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ |
|
||||||
|
|
||||||
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
|
## EntityLinker.\_\_call\_\_ {id="call",tag="method"}
|
||||||
|
|
||||||
|
|
|
@ -173,7 +173,7 @@ happens automatically after the component has been added to the pipeline using
|
||||||
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
||||||
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
||||||
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
||||||
longer patterns over shorter, and if equal the match occuring first in the Doc
|
longer patterns over shorter, and if equal the match occurring first in the Doc
|
||||||
is chosen.
|
is chosen.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -288,7 +288,7 @@ it – so no NP-level coordination, no prepositional phrases, and no relative
|
||||||
clauses.
|
clauses.
|
||||||
|
|
||||||
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
If the `noun_chunk` [syntax iterator](/usage/linguistic-features#language-data)
|
||||||
has not been implemeted for the given language, a `NotImplementedError` is
|
has not been implemented for the given language, a `NotImplementedError` is
|
||||||
raised.
|
raised.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
|
|
@ -416,7 +416,7 @@ by this class. Instances of this class are typically assigned to the
|
||||||
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
| `align` | Alignment from the `Doc`'s tokenization to the wordpieces. This is a ragged array, where `align.lengths[i]` indicates the number of wordpiece tokens that token `i` aligns against. The actual indices are provided at `align[i].dataXd`. ~~Ragged~~ |
|
||||||
| `width` | The width of the last hidden layer. ~~int~~ |
|
| `width` | The width of the last hidden layer. ~~int~~ |
|
||||||
|
|
||||||
### TransformerData.empty {id="transformerdata-emoty",tag="classmethod"}
|
### TransformerData.empty {id="transformerdata-empty",tag="classmethod"}
|
||||||
|
|
||||||
Create an empty `TransformerData` container.
|
Create an empty `TransformerData` container.
|
||||||
|
|
||||||
|
|
|
@ -832,7 +832,7 @@ retrieve and add to them.
|
||||||
|
|
||||||
After creation, the component needs to be
|
After creation, the component needs to be
|
||||||
[initialized](/usage/training#initialization). This method can define the
|
[initialized](/usage/training#initialization). This method can define the
|
||||||
relevant labels in two ways: explicitely by setting the `labels` argument in the
|
relevant labels in two ways: explicitly by setting the `labels` argument in the
|
||||||
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
[`initialize` block](/api/data-formats#config-initialize) of the config, or
|
||||||
implicitly by deducing them from the `get_examples` callback that generates the
|
implicitly by deducing them from the `get_examples` callback that generates the
|
||||||
full **training data set**, or a representative sample.
|
full **training data set**, or a representative sample.
|
||||||
|
|
|
@ -1899,7 +1899,7 @@ the two words.
|
||||||
"Shore": ("coast", 0.732257),
|
"Shore": ("coast", 0.732257),
|
||||||
"Precautionary": ("caution", 0.490973),
|
"Precautionary": ("caution", 0.490973),
|
||||||
"hopelessness": ("sadness", 0.742366),
|
"hopelessness": ("sadness", 0.742366),
|
||||||
"Continous": ("continuous", 0.732549),
|
"Continuous": ("continuous", 0.732549),
|
||||||
"Disemboweled": ("corpse", 0.499432),
|
"Disemboweled": ("corpse", 0.499432),
|
||||||
"biostatistician": ("scientist", 0.339724),
|
"biostatistician": ("scientist", 0.339724),
|
||||||
"somewheres": ("somewheres", 0.402736),
|
"somewheres": ("somewheres", 0.402736),
|
||||||
|
|
|
@ -173,7 +173,7 @@ detected, a corresponding warning is displayed. If you'd like to disable the
|
||||||
dependency check, set `check_requirements: false` in your project's
|
dependency check, set `check_requirements: false` in your project's
|
||||||
`project.yml`.
|
`project.yml`.
|
||||||
|
|
||||||
### 4. Run a workflow {id="run-workfow"}
|
### 4. Run a workflow {id="run-workflow"}
|
||||||
|
|
||||||
> #### project.yml
|
> #### project.yml
|
||||||
>
|
>
|
||||||
|
@ -286,7 +286,7 @@ pipelines.
|
||||||
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
| --------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||||
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
| `title` | An optional project title used in `--help` message and [auto-generated docs](#custom-docs). |
|
||||||
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
| `description` | An optional project description used in [auto-generated docs](#custom-docs). |
|
||||||
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overriden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
| `vars` | A dictionary of variables that can be referenced in paths, URLs and scripts and overridden on the CLI, just like [`config.cfg` variables](/usage/training#config-interpolation). For example, `${vars.name}` will use the value of the variable `name`. Variables need to be defined in the section `vars`, but can be a nested dict, so you're able to reference `${vars.model.name}`. |
|
||||||
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
| `env` | A dictionary of variables, mapped to the names of environment variables that will be read in when running the project. For example, `${env.name}` will use the value of the environment variable defined as `name`. |
|
||||||
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
| `directories` | An optional list of [directories](#project-files) that should be created in the project for assets, training outputs, metrics etc. spaCy will make sure that these directories always exist. |
|
||||||
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
| `assets` | A list of assets that can be fetched with the [`project assets`](/api/cli#project-assets) command. `url` defines a URL or local path, `dest` is the destination file relative to the project directory, and an optional `checksum` ensures that an error is raised if the file's checksum doesn't match. Instead of `url`, you can also provide a `git` block with the keys `repo`, `branch` and `path`, to download from a Git repo. |
|
||||||
|
|
|
@ -306,7 +306,9 @@ installed in the same environment – that's it.
|
||||||
|
|
||||||
### Loading probability tables into existing models
|
### Loading probability tables into existing models
|
||||||
|
|
||||||
You can load a probability table from [spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an existing spaCy model like `en_core_web_sm`.
|
You can load a probability table from
|
||||||
|
[spacy-lookups-data](https://github.com/explosion/spacy-lookups-data) into an
|
||||||
|
existing spaCy model like `en_core_web_sm`.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Requirements: pip install spacy-lookups-data
|
# Requirements: pip install spacy-lookups-data
|
||||||
|
@ -317,7 +319,8 @@ lookups = load_lookups("en", ["lexeme_prob"])
|
||||||
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
|
nlp.vocab.lookups.add_table("lexeme_prob", lookups.get_table("lexeme_prob"))
|
||||||
```
|
```
|
||||||
|
|
||||||
When training a model from scratch you can also specify probability tables in the `config.cfg`.
|
When training a model from scratch you can also specify probability tables in
|
||||||
|
the `config.cfg`.
|
||||||
|
|
||||||
```ini {title="config.cfg (excerpt)"}
|
```ini {title="config.cfg (excerpt)"}
|
||||||
[initialize.lookups]
|
[initialize.lookups]
|
||||||
|
@ -346,8 +349,8 @@ them**!
|
||||||
To stick with the theme of
|
To stick with the theme of
|
||||||
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
[this entry points blog post](https://amir.rachum.com/blog/2017/07/28/python-entry-points/),
|
||||||
consider the following custom spaCy
|
consider the following custom spaCy
|
||||||
[pipeline component](/usage/processing-pipelines#custom-coponents) that prints a
|
[pipeline component](/usage/processing-pipelines#custom-components) that prints
|
||||||
snake when it's called:
|
a snake when it's called:
|
||||||
|
|
||||||
> #### Package directory structure
|
> #### Package directory structure
|
||||||
>
|
>
|
||||||
|
|
|
@ -185,7 +185,7 @@ New: 'ORG' (23860), 'PERSON' (21395), 'GPE' (21193), 'DATE' (18080), 'CARDINAL'
|
||||||
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
'LOC' (2113), 'TIME' (1616), 'WORK_OF_ART' (1229), 'QUANTITY' (1150), 'FAC'
|
||||||
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
(1134), 'EVENT' (974), 'PRODUCT' (935), 'LAW' (444), 'LANGUAGE' (338)
|
||||||
✔ Good amount of examples for all labels
|
✔ Good amount of examples for all labels
|
||||||
✔ Examples without occurences available for all labels
|
✔ Examples without occurrences available for all labels
|
||||||
✔ No entities consisting of or starting/ending with whitespace
|
✔ No entities consisting of or starting/ending with whitespace
|
||||||
|
|
||||||
=========================== Part-of-speech Tagging ===========================
|
=========================== Part-of-speech Tagging ===========================
|
||||||
|
|
|
@ -138,7 +138,7 @@ backwards compatibility, the tuple format remains available under
|
||||||
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
|
`TransformerData.tensors` and `FullTransformerBatch.tensors`. See more details
|
||||||
in the [transformer API docs](/api/architectures#TransformerModel).
|
in the [transformer API docs](/api/architectures#TransformerModel).
|
||||||
|
|
||||||
`spacy-transfomers` v1.1 also adds support for `transformer_config` settings
|
`spacy-transformers` v1.1 also adds support for `transformer_config` settings
|
||||||
such as `output_attentions`. Additional output is stored under
|
such as `output_attentions`. Additional output is stored under
|
||||||
`TransformerData.model_output`. More details are in the
|
`TransformerData.model_output`. More details are in the
|
||||||
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
|
[TransformerModel docs](/api/architectures#TransformerModel). The training speed
|
||||||
|
|
Loading…
Reference in New Issue
Block a user