Use consistent spelling

This commit is contained in:
Ines Montani 2019-10-02 10:37:39 +02:00
parent 208629615d
commit b6670bf0c2
27 changed files with 69 additions and 69 deletions

View File

@ -3,9 +3,9 @@
# spaCy: Industrial-strength NLP
spaCy is a library for advanced Natural Language Processing in Python and
[pre-trained statistical models](https://spacy.io/models) and word vectors, and
Cython. It's built on the very latest research, and was designed from day one to
be used in real products. spaCy comes with
[pretrained statistical models](https://spacy.io/models) and word vectors, and
currently supports tokenization for **50+ languages**. It features
state-of-the-art speed, convolutional **neural network models** for tagging,
parsing and **named entity recognition** and easy **deep learning** integration.
@ -73,7 +73,7 @@ it.
- Non-destructive **tokenization**
- **Named entity** recognition
- Support for **50+ languages**
- Pre-trained [statistical models](https://spacy.io/models) and word vectors
- Pretrained [statistical models](https://spacy.io/models) and word vectors
- State-of-the-art speed
- Easy **deep learning** integration
- Part-of-speech tagging

View File

@ -376,7 +376,7 @@ def initialize_pipeline(nlp, docs, golds, config, device):
def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with Path(loc).open("rb") as file_:
@ -472,7 +472,7 @@ class TreebankPaths(object):
gpu_device=("Use GPU", "option", "g", int),
use_oracle_segments=("Use oracle segments", "flag", "G", int),
vectors_dir=(
"Path to directory with pre-trained vectors, named e.g. en/",
"Path to directory with pretrained vectors, named e.g. en/",
"option",
"v",
Path,

View File

@ -38,10 +38,10 @@ def create_kb(
# check the length of the nlp vectors
if "vectors" in nlp.meta and nlp.vocab.vectors.size:
input_dim = nlp.vocab.vectors_length
logger.info("Loaded pre-trained vectors of size %s" % input_dim)
logger.info("Loaded pretrained vectors of size %s" % input_dim)
else:
raise ValueError(
"The `nlp` object should have access to pre-trained word vectors, "
"The `nlp` object should have access to pretrained word vectors, "
" cf. https://spacy.io/usage/models#languages."
)

View File

@ -83,7 +83,7 @@ def main(
# check the length of the nlp vectors
if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
raise ValueError(
"The `nlp` object should have access to pre-trained word vectors, "
"The `nlp` object should have access to pretrained word vectors, "
" cf. https://spacy.io/usage/models#languages."
)

View File

@ -65,7 +65,7 @@ def main(
# check that there is a NER component in the pipeline
if "ner" not in nlp.pipe_names:
raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
raise ValueError("The `nlp` object should have a pretrained `ner` component.")
# STEP 2: create a training dataset from WP
logger.info("STEP 2: reading training dataset from {}".format(training_path))

View File

@ -27,7 +27,7 @@ from bin.wiki_entity_linking.train_descriptions import EntityEncoder
# Q7381115 (Russ Cochran): publisher
ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
INPUT_DIM = 300 # dimension of pre-trained input vectors
INPUT_DIM = 300 # dimension of pretrained input vectors
DESC_WIDTH = 64 # dimension of output entity vectors
@ -39,7 +39,7 @@ DESC_WIDTH = 64 # dimension of output entity vectors
)
def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
"""Load the model, create the KB and pretrain the entity encodings.
Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
If an output_dir is provided, the KB will be stored there in a file 'kb'.
When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
if model is None and vocab_path is None:

View File

@ -1,9 +1,9 @@
"""This script is experimental.
Try pre-training the CNN component of the text categorizer using a cheap
language modelling-like objective. Specifically, we load pre-trained vectors
language modelling-like objective. Specifically, we load pretrained vectors
(from something like word2vec, GloVe, FastText etc), and use the CNN to
predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
predict the tokens' pretrained vectors. This isn't as easy as it sounds:
we're not merely doing compression here, because heavy dropout is applied,
including over the input words. This means the model must often (50% of the time)
use the context in order to predict the word.

View File

@ -2,7 +2,7 @@
# coding: utf8
"""Example of training an additional entity type
This script shows how to add a new entity type to an existing pre-trained NER
This script shows how to add a new entity type to an existing pretrained NER
model. To keep the example short and simple, only four sentences are provided
as examples. In practice, you'll need many more — a few hundred would be a
good start. You will also likely need to mix in examples of other entity

View File

@ -96,9 +96,9 @@ def pretrain(
"""
Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
using an approximate language-modelling objective. Specifically, we load
pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pre-trained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pre-trained weights
pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
vectors which match the pretrained ones. The weights are saved to a directory
after each epoch. You can then pass a path to one of these pretrained weights
files to the 'spacy train' command.
This technique may be especially helpful if you have little labelled data.
@ -156,7 +156,7 @@ def pretrain(
subword_features=True, # Set to False for Chinese etc
),
)
# Load in pre-trained weights
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components))

View File

@ -241,7 +241,7 @@ def train(
nlp._optimizer = None
# Load in pre-trained weights
# Load in pretrained weights
if init_tok2vec is not None:
components = _load_pretrained_tok2vec(nlp, init_tok2vec)
msg.text("Loaded pretrained tok2vec for: {}".format(components))
@ -529,7 +529,7 @@ def _load_vectors(nlp, vectors):
def _load_pretrained_tok2vec(nlp, loc):
"""Load pre-trained weights for the 'token-to-vector' part of the component
"""Load pretrained weights for the 'token-to-vector' part of the component
models, which is typically a CNN. See 'spacy pretrain'. Experimental.
"""
with loc.open("rb") as file_:

View File

@ -356,7 +356,7 @@ class Errors(object):
E113 = ("The newly split token can only have one root (head = 0).")
E114 = ("The newly split token needs to have a root (head = 0).")
E115 = ("All subtokens must have associated heads.")
E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
E116 = ("Cannot currently add labels to pretrained text classifier. Add "
"labels before training begins. This functionality was available "
"in previous versions, but had significant bugs that led to poor "
"performance.")
@ -482,7 +482,7 @@ class Errors(object):
"Current DocBin: {current}\nOther DocBin: {other}")
E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
"happen if the tagger was trained with a different set of "
"morphological features. If you're using a pre-trained model, make "
"morphological features. If you're using a pretrained model, make "
"sure that your models are up to date:\npython -m spacy validate")
E168 = ("Unknown field: {field}")
E169 = ("Can't find module: {module}")
@ -499,13 +499,13 @@ class Errors(object):
@add_codes
class TempErrors(object):
T003 = ("Resizing pre-trained Tagger models is not currently supported.")
T003 = ("Resizing pretrained Tagger models is not currently supported.")
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
T008 = ("Bad configuration of Tagger. This is probably a bug within "
"spaCy. We changed the name of an internal attribute for loading "
"pre-trained vectors, and the class has been passed the old name "
"pretrained vectors, and the class has been passed the old name "
"(pretrained_dims) but not the new name (pretrained_vectors).")

View File

@ -521,7 +521,7 @@ class Language(object):
"""Make a "rehearsal" update to the models in the pipeline, to prevent
forgetting. Rehearsal updates run an initial copy of the model over some
data, and update the model so its current predictions are more like the
initial ones. This is useful for keeping a pre-trained model on-track,
initial ones. This is useful for keeping a pretrained model on-track,
even if you're updating it with a smaller set of examples.
docs (iterable): A batch of `Doc` objects.
@ -627,7 +627,7 @@ class Language(object):
return self._optimizer
def resume_training(self, sgd=None, **cfg):
"""Continue training a pre-trained model.
"""Continue training a pretrained model.
Create and return an optimizer, and initialize "rehearsal" for any pipeline
component that has a .rehearse() method. Rehearsal is used to prevent

View File

@ -125,7 +125,7 @@ class Pipe(object):
def add_label(self, label):
"""Add an output label, to be predicted by the model.
It's possible to extend pre-trained models with new labels,
It's possible to extend pretrained models with new labels,
but care should be taken to avoid the "catastrophic forgetting"
problem.
"""

View File

@ -439,10 +439,10 @@ $ token_vector_width=256 learn_rate=0.0001 spacy train [...]
## Pretrain {#pretrain new="2.1" tag="experimental"}
Pretrain the "token to vector" (`tok2vec`) layer of pipeline components, using
an approximate language-modeling objective. Specifically, we load pre-trained
an approximate language-modeling objective. Specifically, we load pretrained
vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
match the pre-trained ones. The weights are saved to a directory after each
epoch. You can then pass a path to one of these pre-trained weights files to the
match the pretrained ones. The weights are saved to a directory after each
epoch. You can then pass a path to one of these pretrained weights files to the
`spacy train` command.
This technique may be especially helpful if you have little labelled data.
@ -476,7 +476,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
| `--n-save-every`, `-se` | option | Save model every X batches. |
| `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
| `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
### JSONL format for raw text {#pretrain-jsonl}

View File

@ -10,7 +10,7 @@ The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#
objects, which are plausible external identifiers given a certain textual mention.
Each such `Candidate` holds information from the relevant KB entities,
such as its frequency in text and possible aliases.
Each entity in the knowledge base also has a pre-trained entity vector of a fixed size.
Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
@ -55,7 +55,7 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
| --------------- | ------------- | ------------------------------------------------- |
| `entity` | unicode | The unique entity identifier |
| `freq` | float | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pre-trained vector of the entity |
| `entity_vector` | vector | The pretrained vector of the entity |
## KnowledgeBase.set_entities {#set_entities tag="method"}
@ -167,7 +167,7 @@ of type [`Candidate`](/api/kb/#candidate_init).
## KnowledgeBase.get_vector {#get_vector tag="method"}
Given a certain entity ID, retrieve its pre-trained entity vector.
Given a certain entity ID, retrieve its pretrained entity vector.
> #### Example
>
@ -265,4 +265,4 @@ of a `KnowledgeBase`.
| `alias_` | unicode | The alias or textual mention |
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
| `entity_freq` | long | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pre-trained vector of the entity |
| `entity_vector` | vector | The pretrained vector of the entity |

View File

@ -440,7 +440,7 @@ package exposes the data files via language-specific
constructing the `Vocab` and [`Lookups`](/api/lookups). This allows easier
access to the data, serialization with the models and file compression on disk
(so your spaCy installation is smaller). If you want to use the lookup tables
without a pre-trained model, you have to explicitly install spaCy with lookups
without a pretrained model, you have to explicitly install spaCy with lookups
via `pip install spacy[lookups]` or by installing
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) in the
same environment as spaCy.

View File

@ -93,7 +93,7 @@ https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processin
### Training spaCy's Named Entity Recognizer {#training-ner}
This example shows how to update spaCy's entity recognizer with your own
examples, starting off with an existing, pre-trained model, or from scratch
examples, starting off with an existing, pretrained model, or from scratch
using a blank `Language` class.
```python
@ -102,7 +102,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
### Training an additional entity type {#new-entity-type}
This script shows how to add a new entity type to an existing pre-trained NER
This script shows how to add a new entity type to an existing pretrained NER
model. To keep the example short and simple, only four sentences are provided as
examples. In practice, you'll need many more — a few hundred would be a good
start.
@ -114,7 +114,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entit
### Training spaCy's Dependency Parser {#parser}
This example shows how to update spaCy's dependency parser, starting off with an
existing, pre-trained model, or from scratch using a blank `Language` class.
existing, pretrained model, or from scratch using a blank `Language` class.
```python
https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py

View File

@ -137,7 +137,7 @@ pre-processing.
### Model comparison {#spacy-models}
In this section, we provide benchmark accuracies for the pre-trained model
In this section, we provide benchmark accuracies for the pretrained model
pipelines we distribute with spaCy. Evaluations are conducted end-to-end from
raw text, with no "gold standard" pre-processing, over text from a mix of genres
where possible.

View File

@ -56,7 +56,7 @@ run `pip install spacy[lookups]` or install
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
separately. The lookups package is needed to create blank models with
lemmatization data, and to lemmatize in languages that don't yet come with
pre-trained models and aren't powered by third-party libraries.
pretrained models and aren't powered by third-party libraries.
</Infobox>

View File

@ -508,7 +508,7 @@ responsibility for ensuring that the data is left in a consistent state.
<Infobox title="Annotation scheme">
For details on the entity types available in spaCy's pre-trained models, see the
For details on the entity types available in spaCy's pretrained models, see the
[NER annotation scheme](/api/annotation#named-entities).
</Infobox>
@ -998,7 +998,7 @@ can sometimes tokenize things differently for example, `"I'm"` →
In situations like that, you often want to align the tokenization so that you
can merge annotations from different sources together, or take vectors predicted
by a
[pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
[pretrained BERT model](https://github.com/huggingface/pytorch-transformers)
and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
number of misaligned tokens, the one-to-one mappings of token indices in both

View File

@ -860,7 +860,7 @@ def custom_ner_wrapper(doc):
The `custom_ner_wrapper` can then be added to the pipeline of a blank model
using [`nlp.add_pipe`](/api/language#add_pipe). You can also replace the
existing entity recognizer of a pre-trained model with
existing entity recognizer of a pretrained model with
[`nlp.replace_pipe`](/api/language#replace_pipe).
Here's another example of a custom model, `your_custom_model`, that takes a list

View File

@ -1078,7 +1078,7 @@ order to implement more abstract logic.
### Example: Expanding named entities {#models-rules-ner}
When using the a pre-trained
When using a pretrained
[named entity recognition](/usage/linguistic-features/#named-entities) model to
extract information from your texts, you may find that the predicted span only
includes parts of the entity you're looking for. Sometimes, this happens if

View File

@ -321,7 +321,7 @@ the `drop` keyword argument. See the [`Language`](/api/language) and
## Training the named entity recognizer {#ner}
All [spaCy models](/models) support online learning, so you can update a
pre-trained model with new examples. You'll usually need to provide many
pretrained model with new examples. You'll usually need to provide many
**examples** to meaningfully improve the system — a few hundred is a good start,
although more is better.
@ -347,7 +347,7 @@ your data** to find a solution that works best for you.
### Updating the Named Entity Recognizer {#example-train-ner}
This example shows how to update spaCy's entity recognizer with your own
examples, starting off with an existing, pre-trained model, or from scratch
examples, starting off with an existing, pretrained model, or from scratch
using a blank `Language` class. To do this, you'll need **example texts** and
the **character offsets** and **labels** of each entity contained in the texts.
@ -376,7 +376,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
### Training an additional entity type {#example-new-entity-type}
This script shows how to add a new entity type `ANIMAL` to an existing
pre-trained NER model, or an empty `Language` class. To keep the example short
pretrained NER model, or an empty `Language` class. To keep the example short
and simple, only a few sentences are provided as examples. In practice, you'll
need many more — a few hundred would be a good start. You will also likely need
to mix in examples of other entity types, which might be obtained by running the
@ -440,7 +440,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
training the parser.
2. **Add the dependency labels** to the parser using the
[`add_label`](/api/dependencyparser#add_label) method. If you're starting off
with a pre-trained spaCy model, this is usually not necessary but it
with a pretrained spaCy model, this is usually not necessary but it
doesn't hurt either, just to be safe.
3. **Shuffle and loop over** the examples. For each example, **update the
model** by calling [`nlp.update`](/api/language#update), which steps through
@ -624,7 +624,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
a pre-defined [`vocab`](/api/vocab) object.
2. **Pretrain the entity embeddings** by running the descriptions of the
entities through a simple encoder-decoder network. The current implementation
requires the `nlp` model to have access to pre-trained word embeddings, but a
requires the `nlp` model to have access to pretrained word embeddings, but a
custom implementation of this encoding step can also be used.
3. **Construct the KB** by defining all entities with their pretrained vectors,
and all aliases with their prior probabilities.

View File

@ -324,9 +324,9 @@ check if all of your models are up to date, you can run the
- The lemmatization tables have been moved to their own package,
[`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), which
is not installed by default. If you're using pre-trained models, **nothing
is not installed by default. If you're using pretrained models, **nothing
changes**, because the tables are now included in the model packages. If you
want to use the lemmatizer for other languages that don't yet have pre-trained
want to use the lemmatizer for other languages that don't yet have pretrained
models (e.g. Turkish or Croatian) or start off with a blank model that
contains lookup data (e.g. `spacy.blank("en")`), you'll need to **explicitly
install spaCy plus data** via `pip install spacy[lookups]`.

View File

@ -1677,7 +1677,7 @@
{
"id": "spacy-pytorch-transformers",
"title": "spacy-pytorch-transformers",
"slogan": "spaCy pipelines for pre-trained BERT, XLNet and GPT-2",
"slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
"description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
"github": "explosion/spacy-pytorch-transformers",
"url": "https://explosion.ai/blog/spacy-pytorch-transformers",
@ -1855,7 +1855,7 @@
{
"id": "models",
"title": "Models",
"description": "Third-party pre-trained models for different languages and domains"
"description": "Third-party pretrained models for different languages and domains"
}
]
},

View File

@ -345,7 +345,7 @@ const Models = ({ pageContext, repo, children }) => {
return (
<>
<Title title={title} teaser={`Available pre-trained statistical models for ${title}`} />
<Title title={title} teaser={`Available pretrained statistical models for ${title}`} />
<StaticQuery
query={query}
render={({ site }) =>

View File

@ -126,7 +126,7 @@ const Landing = ({ data }) => {
{counts.modelLangs} languages
</Li>
<Li>
Pre-trained <strong>word vectors</strong>
Pretrained <strong>word vectors</strong>
</Li>
<Li>State-of-the-art speed</Li>
<Li>