Use consistent spelling

This commit is contained in:
Ines Montani 2019-10-02 10:37:39 +02:00
parent 208629615d
commit b6670bf0c2
27 changed files with 69 additions and 69 deletions

View File

@@ -3,9 +3,9 @@
 # spaCy: Industrial-strength NLP

 spaCy is a library for advanced Natural Language Processing in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products. spaCy comes with
-[pre-trained statistical models](https://spacy.io/models) and word vectors, and
+[pretrained statistical models](https://spacy.io/models) and word vectors, and
 currently supports tokenization for **50+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing and **named entity recognition** and easy **deep learning** integration.
@@ -73,7 +73,7 @@ it.
 - Non-destructive **tokenization**
 - **Named entity** recognition
 - Support for **50+ languages**
-- Pre-trained [statistical models](https://spacy.io/models) and word vectors
+- pretrained [statistical models](https://spacy.io/models) and word vectors
 - State-of-the-art speed
 - Easy **deep learning** integration
 - Part-of-speech tagging

View File

@@ -376,7 +376,7 @@ def initialize_pipeline(nlp, docs, golds, config, device):
 def _load_pretrained_tok2vec(nlp, loc):
-    """Load pre-trained weights for the 'token-to-vector' part of the component
+    """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
     with Path(loc).open("rb") as file_:
@@ -472,7 +472,7 @@ class TreebankPaths(object):
     gpu_device=("Use GPU", "option", "g", int),
     use_oracle_segments=("Use oracle segments", "flag", "G", int),
     vectors_dir=(
-        "Path to directory with pre-trained vectors, named e.g. en/",
+        "Path to directory with pretrained vectors, named e.g. en/",
         "option",
        "v",
        Path,

View File

@@ -38,10 +38,10 @@ def create_kb(
     # check the length of the nlp vectors
     if "vectors" in nlp.meta and nlp.vocab.vectors.size:
         input_dim = nlp.vocab.vectors_length
-        logger.info("Loaded pre-trained vectors of size %s" % input_dim)
+        logger.info("Loaded pretrained vectors of size %s" % input_dim)
     else:
         raise ValueError(
-            "The `nlp` object should have access to pre-trained word vectors, "
+            "The `nlp` object should have access to pretrained word vectors, "
             " cf. https://spacy.io/usage/models#languages."
         )

View File

@@ -83,7 +83,7 @@ def main(
     # check the length of the nlp vectors
     if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
         raise ValueError(
-            "The `nlp` object should have access to pre-trained word vectors, "
+            "The `nlp` object should have access to pretrained word vectors, "
             " cf. https://spacy.io/usage/models#languages."
         )

View File

@@ -65,7 +65,7 @@ def main(
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
-        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
+        raise ValueError("The `nlp` object should have a pretrained `ner` component.")
     # STEP 2: create a training dataset from WP
     logger.info("STEP 2: reading training dataset from {}".format(training_path))

View File

@@ -27,7 +27,7 @@ from bin.wiki_entity_linking.train_descriptions import EntityEncoder
 # Q7381115 (Russ Cochran): publisher
 ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
-INPUT_DIM = 300  # dimension of pre-trained input vectors
+INPUT_DIM = 300  # dimension of pretrained input vectors
 DESC_WIDTH = 64  # dimension of output entity vectors
@@ -39,7 +39,7 @@ DESC_WIDTH = 64  # dimension of output entity vectors
 )
 def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
     """Load the model, create the KB and pretrain the entity encodings.
-    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
+    Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
     if model is None and vocab_path is None:
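For orientation, the KB assembly this docstring describes looks roughly like this with the v2.2 `KnowledgeBase` API (a minimal sketch: the entity vectors would normally come from the trained `EntityEncoder`, so random values stand in here):

```python
import numpy
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.load("en_core_web_md")  # any model whose vocab the KB should share
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)  # DESC_WIDTH

# Random vectors stand in for the encoder output so the sketch runs on its own.
for qid, (desc, freq) in {"Q2146908": ("American golfer", 342),
                          "Q7381115": ("publisher", 17)}.items():
    kb.add_entity(entity=qid, freq=freq, entity_vector=numpy.random.rand(64))

kb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"],
             probabilities=[0.24, 0.7])
```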

View File

@@ -1,9 +1,9 @@
 """This script is experimental.

 Try pre-training the CNN component of the text categorizer using a cheap
-language modelling-like objective. Specifically, we load pre-trained vectors
+language modelling-like objective. Specifically, we load pretrained vectors
 (from something like word2vec, GloVe, FastText etc), and use the CNN to
-predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
+predict the tokens' pretrained vectors. This isn't as easy as it sounds:
 we're not merely doing compression here, because heavy dropout is applied,
 including over the input words. This means the model must often (50% of the time)
 use the context in order to predict the word.
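The objective described here is easy to state concretely: with heavy dropout on the inputs, the network must reconstruct each token's static vector from context. A toy illustration of the loss in plain numpy (not spaCy's actual implementation):

```python
import numpy

def vector_loss(predicted, target):
    """L2 loss between the model's outputs and the tokens' pretrained
    vectors, plus the gradient with respect to the predictions."""
    diff = predicted - target
    return (diff ** 2).sum(), 2 * diff

pred = numpy.random.rand(10, 300)  # model output for 10 tokens
gold = numpy.random.rand(10, 300)  # their pretrained vectors
loss, d_loss = vector_loss(pred, gold)
```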

View File

@@ -2,7 +2,7 @@
 # coding: utf8
 """Example of training an additional entity type

-This script shows how to add a new entity type to an existing pre-trained NER
+This script shows how to add a new entity type to an existing pretrained NER
 model. To keep the example short and simple, only four sentences are provided
 as examples. In practice, you'll need many more — a few hundred would be a
 good start. You will also likely need to mix in examples of other entity

View File

@@ -96,9 +96,9 @@ def pretrain(
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load
-    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
-    vectors which match the pre-trained ones. The weights are saved to a directory
-    after each epoch. You can then pass a path to one of these pre-trained weights
+    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
+    vectors which match the pretrained ones. The weights are saved to a directory
+    after each epoch. You can then pass a path to one of these pretrained weights
     files to the 'spacy train' command.

     This technique may be especially helpful if you have little labelled data.
@@ -156,7 +156,7 @@ def pretrain(
             subword_features=True,  # Set to False for Chinese etc
         ),
     )
-    # Load in pre-trained weights
+    # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))

View File

@@ -241,7 +241,7 @@ def train(
     nlp._optimizer = None

-    # Load in pre-trained weights
+    # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
@@ -529,7 +529,7 @@ def _load_vectors(nlp, vectors):
 def _load_pretrained_tok2vec(nlp, loc):
-    """Load pre-trained weights for the 'token-to-vector' part of the component
+    """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
     with loc.open("rb") as file_:
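The hunk cuts off at the `open()` call; for reference, the remainder of the function reads the bytes once and copies them into every component that exposes a tok2vec model, roughly like this (a sketch of the v2.x implementation, not a verbatim quote):

```python
def _load_pretrained_tok2vec(nlp, loc):
    """Load pretrained weights for the 'token-to-vector' part of the component
    models, which is typically a CNN. See 'spacy pretrain'. Experimental.
    """
    with loc.open("rb") as file_:
        weights_data = file_.read()
    loaded = []
    for name, component in nlp.pipeline:
        # Copy the weights into any component backed by a tok2vec model.
        if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
            component.tok2vec.from_bytes(weights_data)
            loaded.append(name)
    return loaded
```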

View File

@@ -356,7 +356,7 @@ class Errors(object):
     E113 = ("The newly split token can only have one root (head = 0).")
     E114 = ("The newly split token needs to have a root (head = 0).")
     E115 = ("All subtokens must have associated heads.")
-    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
+    E116 = ("Cannot currently add labels to pretrained text classifier. Add "
             "labels before training begins. This functionality was available "
             "in previous versions, but had significant bugs that led to poor "
             "performance.")
@@ -482,7 +482,7 @@ class Errors(object):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
             "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pre-trained model, make "
+            "morphological features. If you're using a pretrained model, make "
             "sure that your models are up to date:\npython -m spacy validate")
     E168 = ("Unknown field: {field}")
     E169 = ("Can't find module: {module}")
@@ -499,13 +499,13 @@ class Errors(object):
 @add_codes
 class TempErrors(object):
-    T003 = ("Resizing pre-trained Tagger models is not currently supported.")
+    T003 = ("Resizing pretrained Tagger models is not currently supported.")
     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
             "issue tracker: http://github.com/explosion/spaCy/issues")
     T008 = ("Bad configuration of Tagger. This is probably a bug within "
             "spaCy. We changed the name of an internal attribute for loading "
-            "pre-trained vectors, and the class has been passed the old name "
+            "pretrained vectors, and the class has been passed the old name "
             "(pretrained_dims) but not the new name (pretrained_vectors).")

View File

@@ -521,7 +521,7 @@ class Language(object):
         """Make a "rehearsal" update to the models in the pipeline, to prevent
         forgetting. Rehearsal updates run an initial copy of the model over some
         data, and update the model so its current predictions are more like the
-        initial ones. This is useful for keeping a pre-trained model on-track,
+        initial ones. This is useful for keeping a pretrained model on-track,
         even if you're updating it with a smaller set of examples.

         docs (iterable): A batch of `Doc` objects.
@@ -627,7 +627,7 @@ class Language(object):
         return self._optimizer

     def resume_training(self, sgd=None, **cfg):
-        """Continue training a pre-trained model.
+        """Continue training a pretrained model.

         Create and return an optimizer, and initialize "rehearsal" for any pipeline
         component that has a .rehearse() method. Rehearsal is used to prevent
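A sketch of how `resume_training` and rehearsal fit together when updating a pretrained model (assuming spaCy v2.2; `TRAIN_DATA` and `RAW_TEXTS` are hypothetical):

```python
import random
import spacy

TRAIN_DATA = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
RAW_TEXTS = ["Some unannotated text from the model's original domain."]

nlp = spacy.load("en_core_web_sm")
optimizer = nlp.resume_training()
for itn in range(10):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer)
    # Rehearse on raw text so predictions stay close to the initial model.
    raw_docs = [nlp.make_doc(t) for t in RAW_TEXTS]
    nlp.rehearse(raw_docs, sgd=optimizer)
```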

View File

@@ -125,7 +125,7 @@ class Pipe(object):
     def add_label(self, label):
         """Add an output label, to be predicted by the model.

-        It's possible to extend pre-trained models with new labels,
+        It's possible to extend pretrained models with new labels,
         but care should be taken to avoid the "catastrophic forgetting"
         problem.
         """

View File

@@ -439,10 +439,10 @@ $ token_vector_width=256 learn_rate=0.0001 spacy train [...]
 ## Pretrain {#pretrain new="2.1" tag="experimental"}

 Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using
-an approximate language-modeling objective. Specifically, we load pre-trained
+an approximate language-modeling objective. Specifically, we load pretrained
 vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
-match the pre-trained ones. The weights are saved to a directory after each
-epoch. You can then pass a path to one of these pre-trained weights files to the
+match the pretrained ones. The weights are saved to a directory after each
+epoch. You can then pass a path to one of these pretrained weights files to the
 `spacy train` command.

 This technique may be especially helpful if you have little labelled data.
@@ -476,7 +476,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--n-save-every`, `-se` | option | Save model every X batches. |
 | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
 | `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
-| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
+| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |

 ### JSONL format for raw text {#pretrain-jsonl}
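The JSONL format named by this heading takes one record per line, either raw text or pre-tokenized text. A sketch of writing such a file with `srsly` (the two keys shown are the supported shapes; the sentences are made up):

```python
import srsly

records = [
    {"text": "Rats are the best pets."},             # raw text
    {"tokens": ["Rats", "are", "the", "best", "."]}, # already tokenized
]
srsly.write_jsonl("texts.jsonl", records)
```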

View File

@@ -6,11 +6,11 @@ source: spacy/kb.pyx
 new: 2.2
 ---

 The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
 objects, which are plausible external identifiers given a certain textual mention.
 Each such `Candidate` holds information from the relevant KB entities,
 such as its frequency in text and possible aliases.
-Each entity in the knowledge base also has a pre-trained entity vector of a fixed size.
+Each entity in the knowledge base also has a pretrained entity vector of a fixed size.

 ## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
@@ -26,9 +26,9 @@ Create the knowledge base.
 | Name | Type | Description |
 | ----------------------- | ---------------- | ----------------------------------------- |
 | `vocab` | `Vocab` | A `Vocab` object. |
 | `entity_vector_length` | int | Length of the fixed-size entity vectors. |
 | **RETURNS** | `KnowledgeBase` | The newly constructed object. |

 ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
@@ -41,7 +41,7 @@ The length of the fixed-size entity vectors in the knowledge base.
 ## KnowledgeBase.add_entity {#add_entity tag="method"}

 Add an entity to the knowledge base, specifying its corpus frequency
 and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).

 > #### Example
@@ -55,11 +55,11 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
 | --------------- | ------------- | ------------------------------------------------- |
 | `entity` | unicode | The unique entity identifier |
 | `freq` | float | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pre-trained vector of the entity |
+| `entity_vector` | vector | The pretrained vector of the entity |

 ## KnowledgeBase.set_entities {#set_entities tag="method"}

 Define the full list of entities in the knowledge base, specifying the corpus frequency
 and entity vector for each entity.

 > #### Example
@@ -76,9 +76,9 @@ and entity vector for each entity.
 ## KnowledgeBase.add_alias {#add_alias tag="method"}

 Add an alias or mention to the knowledge base, specifying its potential KB identifiers
 and their prior probabilities. The entity identifiers should refer to entities previously
 added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
 The sum of the prior probabilities should not exceed 1.

 > #### Example
@@ -151,8 +151,8 @@ Get a list of all aliases in the knowledge base.
 ## KnowledgeBase.get_candidates {#get_candidates tag="method"}

 Given a certain textual mention as input, retrieve a list of candidate entities
 of type [`Candidate`](/api/kb/#candidate_init).

 > #### Example
 >
@@ -167,7 +167,7 @@ of type [`Candidate`](/api/kb/#candidate_init).
 ## KnowledgeBase.get_vector {#get_vector tag="method"}

-Given a certain entity ID, retrieve its pre-trained entity vector.
+Given a certain entity ID, retrieve its pretrained entity vector.

 > #### Example
 >
@@ -182,7 +182,7 @@ Given a certain entity ID, retrieve its pre-trained entity vector.
 ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}

 Given a certain entity ID and a certain textual mention, retrieve
 the prior probability of the fact that the mention links to the entity ID.

 > #### Example
@@ -213,7 +213,7 @@ Save the current state of the knowledge base to a directory.
 ## KnowledgeBase.load_bulk {#load_bulk tag="method"}

 Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
 should also be the same as the one used to create the KB.

 > #### Example
@@ -265,4 +265,4 @@ of a `KnowledgeBase`.
 | `alias_` | unicode | The alias or textual mention |
 | `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
 | `entity_freq` | long | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pre-trained vector of the entity |
+| `entity_vector` | vector | The pretrained vector of the entity |
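Putting the documented methods together, querying and persisting a populated KB looks roughly like this (a sketch continuing the `kb` built in the earlier example, so `nlp`, `KnowledgeBase` and the entity IDs are assumed from there):

```python
for c in kb.get_candidates("Russ Cochran"):
    print(c.entity_, c.prior_prob, c.entity_freq)

vector = kb.get_vector("Q2146908")                    # pretrained entity vector
prob = kb.get_prior_prob("Q2146908", "Russ Cochran")  # P(entity | mention)

kb.dump("my_kb")                                      # save to a directory ...
kb2 = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=64)
kb2.load_bulk("my_kb")                                # ... and restore it
```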

View File

@@ -440,7 +440,7 @@ package exposes the data files via language-specific
 constructing the `Vocab` and [`Lookups`](/api/lookups). This allows easier
 access to the data, serialization with the models and file compression on disk
 (so your spaCy installation is smaller). If you want to use the lookup tables
-without a pre-trained model, you have to explicitly install spaCy with lookups
+without a pretrained model, you have to explicitly install spaCy with lookups
 via `pip install spacy[lookups]` or by installing
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) in the
 same environment as spaCy.

View File

@@ -93,7 +93,7 @@ https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processin
 ### Training spaCy's Named Entity Recognizer {#training-ner}

 This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pre-trained model, or from scratch
+examples, starting off with an existing, pretrained model, or from scratch
 using a blank `Language` class.

 ```python
@@ -102,7 +102,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
 ### Training an additional entity type {#new-entity-type}

-This script shows how to add a new entity type to an existing pre-trained NER
+This script shows how to add a new entity type to an existing pretrained NER
 model. To keep the example short and simple, only four sentences are provided as
 examples. In practice, you'll need many more — a few hundred would be a good
 start.
@@ -114,7 +114,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entit
 ### Training spaCy's Dependency Parser {#parser}

 This example shows how to update spaCy's dependency parser, starting off with an
-existing, pre-trained model, or from scratch using a blank `Language` class.
+existing, pretrained model, or from scratch using a blank `Language` class.

 ```python
 https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py

View File

@@ -137,7 +137,7 @@ pre-processing.
 ### Model comparison {#spacy-models}

-In this section, we provide benchmark accuracies for the pre-trained model
+In this section, we provide benchmark accuracies for the pretrained model
 pipelines we distribute with spaCy. Evaluations are conducted end-to-end from
 raw text, with no "gold standard" pre-processing, over text from a mix of genres
 where possible.

View File

@@ -56,7 +56,7 @@ run `pip install spacy[lookups]` or install
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
 separately. The lookups package is needed to create blank models with
 lemmatization data, and to lemmatize in languages that don't yet come with
-pre-trained models and aren't powered by third-party libraries.
+pretrained models and aren't powered by third-party libraries.

 </Infobox>

View File

@@ -508,7 +508,7 @@ responsibility for ensuring that the data is left in a consistent state.
 <Infobox title="Annotation scheme">

-For details on the entity types available in spaCy's pre-trained models, see the
+For details on the entity types available in spaCy's pretrained models, see the
 [NER annotation scheme](/api/annotation#named-entities).

 </Infobox>
@@ -998,7 +998,7 @@ can sometimes tokenize things differently for example, `"I'm"` →
 In situations like that, you often want to align the tokenization so that you
 can merge annotations from different sources together, or take vectors predicted
 by a
-[pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
+[pretrained BERT model](https://github.com/huggingface/pytorch-transformers)
 and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
 helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
 number of misaligned tokens, the one-to-one mappings of token indices in both
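The returned tuple is easiest to see on a tiny example (this mirrors the alignment usage in the docs; spaCy v2.x):

```python
from spacy.gold import align

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
print(cost)  # number of misaligned tokens
print(a2b)   # a2b[i] is the spacy_tokens index for other_tokens[i], -1 if none
```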

View File

@@ -860,7 +860,7 @@ def custom_ner_wrapper(doc):
 The `custom_ner_wrapper` can then be added to the pipeline of a blank model
 using [`nlp.add_pipe`](/api/language#add_pipe). You can also replace the
-existing entity recognizer of a pre-trained model with
+existing entity recognizer of a pretrained model with
 [`nlp.replace_pipe`](/api/language#replace_pipe).

 Here's another example of a custom model, `your_custom_model`, that takes a list
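A condensed sketch of the swap described here (the wrapper body is elided; it would set `doc.ents` from the external model's predictions, as in the docs above):

```python
import spacy

def custom_ner_wrapper(doc):
    # ... set doc.ents from an external model's predictions ...
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.replace_pipe("ner", custom_ner_wrapper)  # swap out the pretrained NER
doc = nlp("Some text")
```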

View File

@@ -1078,7 +1078,7 @@ order to implement more abstract logic.
 ### Example: Expanding named entities {#models-rules-ner}

-When using the a pre-trained
+When using a pretrained
 [named entity recognition](/usage/linguistic-features/#named-entities) model to
 extract information from your texts, you may find that the predicted span only
 includes parts of the entity you're looking for. Sometimes, this happens if
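The fix this section goes on to develop is a small pipeline component that widens predicted spans. A hedged sketch of the idea, here pulling a preceding title like "Dr." into a PERSON span:

```python
from spacy.tokens import Span

def expand_person_entities(doc):
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in ("Dr", "Dr.", "Mr", "Mr."):
                new_ents.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
                continue
        new_ents.append(ent)
    doc.ents = new_ents
    return doc

# nlp.add_pipe(expand_person_entities, after="ner")
```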

View File

@@ -321,7 +321,7 @@ the `drop` keyword argument. See the [`Language`](/api/language) and
 ## Training the named entity recognizer {#ner}

 All [spaCy models](/models) support online learning, so you can update a
-pre-trained model with new examples. You'll usually need to provide many
+pretrained model with new examples. You'll usually need to provide many
 **examples** to meaningfully improve the system — a few hundred is a good start,
 although more is better.
@@ -347,7 +347,7 @@ your data** to find a solution that works best for you.
 ### Updating the Named Entity Recognizer {#example-train-ner}

 This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pre-trained model, or from scratch
+examples, starting off with an existing, pretrained model, or from scratch
 using a blank `Language` class. To do this, you'll need **example texts** and
 the **character offsets** and **labels** of each entity contained in the texts.
@@ -376,7 +376,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
 ### Training an additional entity type {#example-new-entity-type}

 This script shows how to add a new entity type `ANIMAL` to an existing
-pre-trained NER model, or an empty `Language` class. To keep the example short
+pretrained NER model, or an empty `Language` class. To keep the example short
 and simple, only a few sentences are provided as examples. In practice, you'll
 need many more — a few hundred would be a good start. You will also likely need
 to mix in examples of other entity types, which might be obtained by running the
@@ -440,7 +440,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
    training the parser.
 2. **Add the dependency labels** to the parser using the
    [`add_label`](/api/dependencyparser#add_label) method. If you're starting off
-   with a pre-trained spaCy model, this is usually not necessary but it
+   with a pretrained spaCy model, this is usually not necessary but it
    doesn't hurt either, just to be safe.
 3. **Shuffle and loop over** the examples. For each example, **update the
    model** by calling [`nlp.update`](/api/language#update), which steps through
@@ -624,7 +624,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
    a pre-defined [`vocab`](/api/vocab) object.
 2. **Pretrain the entity embeddings** by running the descriptions of the
    entities through a simple encoder-decoder network. The current implementation
-   requires the `nlp` model to have access to pre-trained word embeddings, but a
+   requires the `nlp` model to have access to pretrained word embeddings, but a
    custom implementation of this encoding step can also be used.
 3. **Construct the KB** by defining all entities with their pretrained vectors,
    and all aliases with their prior probabilities.
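The numbered parser steps above (add labels, then shuffle and update) compress into a short loop. A condensed sketch for that case (spaCy v2.x; `TRAIN_DATA` is a hypothetical list of `(text, {"heads": ..., "deps": ...})` pairs):

```python
import random
import spacy
from spacy.util import minibatch, compounding

TRAIN_DATA = [(
    "find a cafe with great wifi",
    {"heads": [0, 2, 0, 2, 5, 3],
     "deps": ["ROOT", "det", "dobj", "prep", "amod", "pobj"]},
)]

nlp = spacy.load("en_core_web_sm")
parser = nlp.get_pipe("parser")
for _, annotations in TRAIN_DATA:
    for dep in annotations["deps"]:
        parser.add_label(dep)  # step 2: harmless if the label already exists

optimizer = nlp.resume_training()
for itn in range(10):  # step 3: shuffle and loop over the examples
    random.shuffle(TRAIN_DATA)
    losses = {}
    for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
        texts, annotations = zip(*batch)
        nlp.update(texts, annotations, sgd=optimizer, losses=losses)
```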

View File

@@ -324,9 +324,9 @@ check if all of your models are up to date, you can run the
 - The lemmatization tables have been moved to their own package,
   [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), which
-  is not installed by default. If you're using pre-trained models, **nothing
+  is not installed by default. If you're using pretrained models, **nothing
   changes**, because the tables are now included in the model packages. If you
-  want to use the lemmatizer for other languages that don't yet have pre-trained
+  want to use the lemmatizer for other languages that don't yet have pretrained
   models (e.g. Turkish or Croatian) or start off with a blank model that
   contains lookup data (e.g. `spacy.blank("en")`), you'll need to **explicitly
   install spaCy plus data** via `pip install spacy[lookups]`.

View File

@@ -1677,7 +1677,7 @@
         {
             "id": "spacy-pytorch-transformers",
             "title": "spacy-pytorch-transformers",
-            "slogan": "spaCy pipelines for pre-trained BERT, XLNet and GPT-2",
+            "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
             "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
             "github": "explosion/spacy-pytorch-transformers",
             "url": "https://explosion.ai/blog/spacy-pytorch-transformers",
@@ -1855,7 +1855,7 @@
         {
             "id": "models",
             "title": "Models",
-            "description": "Third-party pre-trained models for different languages and domains"
+            "description": "Third-party pretrained models for different languages and domains"
         }
     ]
 },

View File

@@ -345,7 +345,7 @@ const Models = ({ pageContext, repo, children }) => {
     return (
         <>
-            <Title title={title} teaser={`Available pre-trained statistical models for ${title}`} />
+            <Title title={title} teaser={`Available pretrained statistical models for ${title}`} />
             <StaticQuery
                 query={query}
                 render={({ site }) =>

View File

@@ -126,7 +126,7 @@ const Landing = ({ data }) => {
                     {counts.modelLangs} languages
                 </Li>
                 <Li>
-                    Pre-trained <strong>word vectors</strong>
+                    pretrained <strong>word vectors</strong>
                 </Li>
                 <Li>State-of-the-art speed</Li>
                 <Li>