Use consistent spelling (explosion/spaCy)

parent 208629615d
commit b6670bf0c2
@@ -3,9 +3,9 @@
 # spaCy: Industrial-strength NLP
 
 spaCy is a library for advanced Natural Language Processing in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products. spaCy comes with
-[pre-trained statistical models](https://spacy.io/models) and word vectors, and
+[pretrained statistical models](https://spacy.io/models) and word vectors, and
 currently supports tokenization for **50+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing and **named entity recognition** and easy **deep learning** integration.
@@ -73,7 +73,7 @@ it.
 - Non-destructive **tokenization**
 - **Named entity** recognition
 - Support for **50+ languages**
-- Pre-trained [statistical models](https://spacy.io/models) and word vectors
+- pretrained [statistical models](https://spacy.io/models) and word vectors
 - State-of-the-art speed
 - Easy **deep learning** integration
 - Part-of-speech tagging
@@ -376,7 +376,7 @@ def initialize_pipeline(nlp, docs, golds, config, device):
 
 
 def _load_pretrained_tok2vec(nlp, loc):
-    """Load pre-trained weights for the 'token-to-vector' part of the component
+    """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
    with Path(loc).open("rb") as file_:
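The hunk cuts the helper off after its first body line. As a rough sketch of what a loader like this typically does (the component iteration and attribute checks below are assumptions for illustration, not lines from the commit), it reads the serialized weights and hands them to each component's tok2vec layer:

```python
from pathlib import Path


def _load_pretrained_tok2vec(nlp, loc):
    """Load pretrained weights for the 'token-to-vector' part of the component
    models, which is typically a CNN. See 'spacy pretrain'. Experimental.
    """
    with Path(loc).open("rb") as file_:
        weights_data = file_.read()  # raw bytes written by `spacy pretrain`
    loaded = []
    for name, component in nlp.pipeline:
        # Assumption: trainable components expose the embedding sublayer
        # as `model.tok2vec`, which can deserialize the saved weights.
        if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
            component.model.tok2vec.from_bytes(weights_data)
            loaded.append(name)
    return loaded
```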
@@ -472,7 +472,7 @@ class TreebankPaths(object):
     gpu_device=("Use GPU", "option", "g", int),
     use_oracle_segments=("Use oracle segments", "flag", "G", int),
     vectors_dir=(
-        "Path to directory with pre-trained vectors, named e.g. en/",
+        "Path to directory with pretrained vectors, named e.g. en/",
         "option",
         "v",
         Path,
@@ -38,10 +38,10 @@ def create_kb(
     # check the length of the nlp vectors
     if "vectors" in nlp.meta and nlp.vocab.vectors.size:
         input_dim = nlp.vocab.vectors_length
-        logger.info("Loaded pre-trained vectors of size %s" % input_dim)
+        logger.info("Loaded pretrained vectors of size %s" % input_dim)
     else:
         raise ValueError(
-            "The `nlp` object should have access to pre-trained word vectors, "
+            "The `nlp` object should have access to pretrained word vectors, "
             " cf. https://spacy.io/usage/models#languages."
         )
 
@@ -83,7 +83,7 @@ def main(
     # check the length of the nlp vectors
     if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
         raise ValueError(
-            "The `nlp` object should have access to pre-trained word vectors, "
+            "The `nlp` object should have access to pretrained word vectors, "
             " cf. https://spacy.io/usage/models#languages."
         )
 
@@ -65,7 +65,7 @@ def main(
 
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
-        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
+        raise ValueError("The `nlp` object should have a pretrained `ner` component.")
 
     # STEP 2: create a training dataset from WP
     logger.info("STEP 2: reading training dataset from {}".format(training_path))
@@ -27,7 +27,7 @@ from bin.wiki_entity_linking.train_descriptions import EntityEncoder
 # Q7381115 (Russ Cochran): publisher
 ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
 
-INPUT_DIM = 300  # dimension of pre-trained input vectors
+INPUT_DIM = 300  # dimension of pretrained input vectors
 DESC_WIDTH = 64  # dimension of output entity vectors
 
 
@@ -39,7 +39,7 @@ DESC_WIDTH = 64  # dimension of output entity vectors
 )
 def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
     """Load the model, create the KB and pretrain the entity encodings.
-    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
+    Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
     When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
     if model is None and vocab_path is None:
@@ -1,9 +1,9 @@
 """This script is experimental.
 
 Try pre-training the CNN component of the text categorizer using a cheap
-language modelling-like objective. Specifically, we load pre-trained vectors
+language modelling-like objective. Specifically, we load pretrained vectors
 (from something like word2vec, GloVe, FastText etc), and use the CNN to
-predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
+predict the tokens' pretrained vectors. This isn't as easy as it sounds:
 we're not merely doing compression here, because heavy dropout is applied,
 including over the input words. This means the model must often (50% of the time)
 use the context in order to predict the word.
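The objective the docstring describes boils down to regressing the model's output vectors onto the tokens' pretrained vectors. A minimal sketch of such a loss (illustrative only, not the script's actual code):

```python
import numpy


def get_vector_loss(predicted, target):
    """Squared-error loss between predicted vectors and the tokens'
    pretrained vectors. With heavy dropout on the inputs, the model is
    forced to use context rather than simply copying each word's vector.
    """
    d_vectors = predicted - target  # gradient of the squared error
    loss = (d_vectors ** 2).sum()
    return loss, d_vectors


# Toy usage: 4 tokens, 300-dimensional vectors
predicted = numpy.zeros((4, 300), dtype="f")
target = numpy.ones((4, 300), dtype="f")
loss, d_vectors = get_vector_loss(predicted, target)
```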
@@ -2,7 +2,7 @@
 # coding: utf8
 """Example of training an additional entity type
 
-This script shows how to add a new entity type to an existing pre-trained NER
+This script shows how to add a new entity type to an existing pretrained NER
 model. To keep the example short and simple, only four sentences are provided
 as examples. In practice, you'll need many more — a few hundred would be a
 good start. You will also likely need to mix in examples of other entity
@@ -96,9 +96,9 @@ def pretrain(
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load
-    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
-    vectors which match the pre-trained ones. The weights are saved to a directory
-    after each epoch. You can then pass a path to one of these pre-trained weights
+    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
+    vectors which match the pretrained ones. The weights are saved to a directory
+    after each epoch. You can then pass a path to one of these pretrained weights
     files to the 'spacy train' command.
 
     This technique may be especially helpful if you have little labelled data.
@@ -156,7 +156,7 @@ def pretrain(
             subword_features=True,  # Set to False for Chinese etc
         ),
     )
-    # Load in pre-trained weights
+    # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
@@ -241,7 +241,7 @@ def train(
 
     nlp._optimizer = None
 
-    # Load in pre-trained weights
+    # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))
@@ -529,7 +529,7 @@ def _load_vectors(nlp, vectors):
 
 
 def _load_pretrained_tok2vec(nlp, loc):
-    """Load pre-trained weights for the 'token-to-vector' part of the component
+    """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
     with loc.open("rb") as file_:
@@ -356,7 +356,7 @@ class Errors(object):
     E113 = ("The newly split token can only have one root (head = 0).")
     E114 = ("The newly split token needs to have a root (head = 0).")
     E115 = ("All subtokens must have associated heads.")
-    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
+    E116 = ("Cannot currently add labels to pretrained text classifier. Add "
             "labels before training begins. This functionality was available "
             "in previous versions, but had significant bugs that led to poor "
             "performance.")
@@ -482,7 +482,7 @@ class Errors(object):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
             "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pre-trained model, make "
+            "morphological features. If you're using a pretrained model, make "
             "sure that your models are up to date:\npython -m spacy validate")
     E168 = ("Unknown field: {field}")
     E169 = ("Can't find module: {module}")
@@ -499,13 +499,13 @@ class Errors(object):
 
 @add_codes
 class TempErrors(object):
-    T003 = ("Resizing pre-trained Tagger models is not currently supported.")
+    T003 = ("Resizing pretrained Tagger models is not currently supported.")
     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
             "issue tracker: http://github.com/explosion/spaCy/issues")
     T008 = ("Bad configuration of Tagger. This is probably a bug within "
             "spaCy. We changed the name of an internal attribute for loading "
-            "pre-trained vectors, and the class has been passed the old name "
+            "pretrained vectors, and the class has been passed the old name "
             "(pretrained_dims) but not the new name (pretrained_vectors).")
 
 
@@ -521,7 +521,7 @@ class Language(object):
         """Make a "rehearsal" update to the models in the pipeline, to prevent
         forgetting. Rehearsal updates run an initial copy of the model over some
         data, and update the model so its current predictions are more like the
-        initial ones. This is useful for keeping a pre-trained model on-track,
+        initial ones. This is useful for keeping a pretrained model on-track,
         even if you're updating it with a smaller set of examples.
 
         docs (iterable): A batch of `Doc` objects.
@@ -627,7 +627,7 @@ class Language(object):
         return self._optimizer
 
     def resume_training(self, sgd=None, **cfg):
-        """Continue training a pre-trained model.
+        """Continue training a pretrained model.
 
         Create and return an optimizer, and initialize "rehearsal" for any pipeline
         component that has a .rehearse() method. Rehearsal is used to prevent
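Together, the two docstrings above suggest a pattern along these lines (a sketch; the texts and annotations are illustrative placeholders):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
optimizer = nlp.resume_training()  # also initializes rehearsal for components

# Hypothetical data: new annotated examples, plus unlabelled text from the
# model's original domain to rehearse on.
train_data = [("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]})]
raw_docs = [nlp.make_doc("The weather was nice and the talk went well.")]

losses = {}
for text, annotations in train_data:
    nlp.update([text], [annotations], sgd=optimizer, losses=losses)
    # Nudge predictions on unlabelled text back towards the initial,
    # pretrained model's predictions to limit forgetting.
    nlp.rehearse(raw_docs, sgd=optimizer, losses=losses)
```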
@@ -125,7 +125,7 @@ class Pipe(object):
     def add_label(self, label):
         """Add an output label, to be predicted by the model.
 
-        It's possible to extend pre-trained models with new labels,
+        It's possible to extend pretrained models with new labels,
         but care should be taken to avoid the "catastrophic forgetting"
         problem.
        """
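For example, a short sketch of extending a pretrained pipeline with a new label (the label name is made up for illustration):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")
ner.add_label("ANIMAL")  # hypothetical new entity label

# Resuming training (rather than starting from scratch) keeps the existing
# weights and enables rehearsal, which helps against catastrophic forgetting.
optimizer = nlp.resume_training()
```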
@@ -439,10 +439,10 @@ $ token_vector_width=256 learn_rate=0.0001 spacy train [...]
 ## Pretrain {#pretrain new="2.1" tag="experimental"}
 
 Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using
-an approximate language-modeling objective. Specifically, we load pre-trained
+an approximate language-modeling objective. Specifically, we load pretrained
 vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
-match the pre-trained ones. The weights are saved to a directory after each
-epoch. You can then pass a path to one of these pre-trained weights files to the
+match the pretrained ones. The weights are saved to a directory after each
+epoch. You can then pass a path to one of these pretrained weights files to the
 `spacy train` command.
 
 This technique may be especially helpful if you have little labelled data.
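As a usage sketch of the handoff this section describes, the CLI entry points can also be called from Python (the paths and vectors package below are illustrative, and the importable wrapper is an assumption about the v2.1+ `spacy.cli` module):

```python
# Sketch of the pretrain -> train handoff; file names are placeholders.
from spacy.cli import pretrain

pretrain(
    "texts.jsonl",        # raw text, one JSON object per line
    "en_vectors_web_lg",  # package whose pretrained vectors we predict
    "pretrain_out",       # weights are saved here after each epoch
)
# Afterwards, initialize supervised training with one of the weight files:
#   python -m spacy train en models train.json dev.json --init-tok2vec pretrain_out/model9.bin
```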
@@ -476,7 +476,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--n-save-every`, `-se` | option | Save model every X batches. |
 | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
 | `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
-| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
+| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
 
 ### JSONL format for raw text {#pretrain-jsonl}
 
@@ -6,11 +6,11 @@ source: spacy/kb.pyx
 new: 2.2
 ---
 
-The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
-objects, which are plausible external identifiers given a certain textual mention.
+The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
+objects, which are plausible external identifiers given a certain textual mention.
 Each such `Candidate` holds information from the relevant KB entities,
-such as its frequency in text and possible aliases.
-Each entity in the knowledge base also has a pre-trained entity vector of a fixed size.
+such as its frequency in text and possible aliases.
+Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
 
 ## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
 
@@ -26,9 +26,9 @@ Create the knowledge base.
 | Name                   | Type            | Description                              |
 | ---------------------- | --------------- | ---------------------------------------- |
-| `vocab`                | `Vocab`         | A `Vocab` object.                        |
-| `entity_vector_length` | int             | Length of the fixed-size entity vectors. |
-| **RETURNS**            | `KnowledgeBase` | The newly constructed object.            |
+| `vocab`                | `Vocab`         | A `Vocab` object.                        |
+| `entity_vector_length` | int             | Length of the fixed-size entity vectors. |
+| **RETURNS**            | `KnowledgeBase` | The newly constructed object.            |
 
 
 ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
 
|
|||
|
||||
## KnowledgeBase.add_entity {#add_entity tag="method"}
|
||||
|
||||
Add an entity to the knowledge base, specifying its corpus frequency
|
||||
Add an entity to the knowledge base, specifying its corpus frequency
|
||||
and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
|
||||
|
||||
> #### Example
|
||||
|
@@ -55,11 +55,11 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
 | --------------- | ------- | ------------------------------------------------ |
 | `entity`        | unicode | The unique entity identifier                     |
 | `freq`          | float   | The frequency of the entity in a typical corpus  |
-| `entity_vector` | vector  | The pre-trained vector of the entity             |
+| `entity_vector` | vector  | The pretrained vector of the entity              |
 
 ## KnowledgeBase.set_entities {#set_entities tag="method"}
 
-Define the full list of entities in the knowledge base, specifying the corpus frequency
+Define the full list of entities in the knowledge base, specifying the corpus frequency
 and entity vector for each entity.
 
 > #### Example
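A short sketch tying the two methods together (the IDs, frequencies and vector values are illustrative):

```python
from spacy.kb import KnowledgeBase
from spacy.vocab import Vocab

vocab = Vocab()
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)

# Add entities one at a time ...
kb.add_entity(entity="Q2146908", freq=342, entity_vector=[1.0, 2.0, 3.0])

# ... or define the full list in one call.
kb.set_entities(
    entity_list=["Q2146908", "Q7381115"],
    freq_list=[342, 17],
    vector_list=[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
)
```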
@@ -76,9 +76,9 @@ and entity vector for each entity.
 
 ## KnowledgeBase.add_alias {#add_alias tag="method"}
 
-Add an alias or mention to the knowledge base, specifying its potential KB identifiers
+Add an alias or mention to the knowledge base, specifying its potential KB identifiers
 and their prior probabilities. The entity identifiers should refer to entities previously
-added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
+added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
 The sum of the prior probabilities should not exceed 1.
 
 > #### Example
|
|||
|
||||
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
||||
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb/#candidate_init).
|
||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||
of type [`Candidate`](/api/kb/#candidate_init).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@@ -167,7 +167,7 @@ of type [`Candidate`](/api/kb/#candidate_init).
 
 ## KnowledgeBase.get_vector {#get_vector tag="method"}
 
-Given a certain entity ID, retrieve its pre-trained entity vector.
+Given a certain entity ID, retrieve its pretrained entity vector.
 
 > #### Example
 >
@@ -182,7 +182,7 @@ Given a certain entity ID, retrieve its pre-trained entity vector.
 
 ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
 
-Given a certain entity ID and a certain textual mention, retrieve
+Given a certain entity ID and a certain textual mention, retrieve
 the prior probability of the fact that the mention links to the entity ID.
 
 > #### Example
@@ -213,7 +213,7 @@ Save the current state of the knowledge base to a directory.
 
 ## KnowledgeBase.load_bulk {#load_bulk tag="method"}
 
-Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
+Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
 should also be the same as the one used to create the KB.
 
 > #### Example
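For instance, a round-trip sketch (the path is a placeholder, and the vocab must match the one used when the KB was built):

```python
# Continues the KnowledgeBase sketch above.
kb.dump("/path/to/kb")

kb_restored = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb_restored.load_bulk("/path/to/kb")  # vocab must match the original KB's
```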
@@ -265,4 +265,4 @@ of a `KnowledgeBase`.
 | `alias_`        | unicode | The alias or textual mention                                    |
 | `prior_prob`    | long    | The prior probability of the `alias` referring to the `entity` |
 | `entity_freq`   | long    | The frequency of the entity in a typical corpus                |
-| `entity_vector` | vector  | The pre-trained vector of the entity                           |
+| `entity_vector` | vector  | The pretrained vector of the entity                            |
@@ -440,7 +440,7 @@ package exposes the data files via language-specific
 constructing the `Vocab` and [`Lookups`](/api/lookups). This allows easier
 access to the data, serialization with the models and file compression on disk
 (so your spaCy installation is smaller). If you want to use the lookup tables
-without a pre-trained model, you have to explicitly install spaCy with lookups
+without a pretrained model, you have to explicitly install spaCy with lookups
 via `pip install spacy[lookups]` or by installing
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) in the
 same environment as spaCy.
@@ -93,7 +93,7 @@ https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processin
 ### Training spaCy's Named Entity Recognizer {#training-ner}
 
 This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pre-trained model, or from scratch
+examples, starting off with an existing, pretrained model, or from scratch
 using a blank `Language` class.
 
 ```python
|
|||
|
||||
### Training an additional entity type {#new-entity-type}
|
||||
|
||||
This script shows how to add a new entity type to an existing pre-trained NER
|
||||
This script shows how to add a new entity type to an existing pretrained NER
|
||||
model. To keep the example short and simple, only four sentences are provided as
|
||||
examples. In practice, you'll need many more — a few hundred would be a good
|
||||
start.
|
||||
|
@ -114,7 +114,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entit
|
|||
### Training spaCy's Dependency Parser {#parser}
|
||||
|
||||
This example shows how to update spaCy's dependency parser, starting off with an
|
||||
existing, pre-trained model, or from scratch using a blank `Language` class.
|
||||
existing, pretrained model, or from scratch using a blank `Language` class.
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
|
||||
|
|
|
@@ -137,7 +137,7 @@ pre-processing.
 
 ### Model comparison {#spacy-models}
 
-In this section, we provide benchmark accuracies for the pre-trained model
+In this section, we provide benchmark accuracies for the pretrained model
 pipelines we distribute with spaCy. Evaluations are conducted end-to-end from
 raw text, with no "gold standard" pre-processing, over text from a mix of genres
 where possible.
@@ -56,7 +56,7 @@ run `pip install spacy[lookups]` or install
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
 separately. The lookups package is needed to create blank models with
 lemmatization data, and to lemmatize in languages that don't yet come with
-pre-trained models and aren't powered by third-party libraries.
+pretrained models and aren't powered by third-party libraries.
 
 </Infobox>
 
@@ -508,7 +508,7 @@ responsibility for ensuring that the data is left in a consistent state.
 
 <Infobox title="Annotation scheme">
 
-For details on the entity types available in spaCy's pre-trained models, see the
+For details on the entity types available in spaCy's pretrained models, see the
 [NER annotation scheme](/api/annotation#named-entities).
 
 </Infobox>
@@ -998,7 +998,7 @@ can sometimes tokenize things differently – for example, `"I'm"` →
 In situations like that, you often want to align the tokenization so that you
 can merge annotations from different sources together, or take vectors predicted
 by a
-[pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
+[pretrained BERT model](https://github.com/huggingface/pytorch-transformers)
 and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
 helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
 number of misaligned tokens, the one-to-one mappings of token indices in both
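For example, a small sketch of the alignment helper on two tokenizations of the same string (the token lists are illustrative):

```python
from spacy.gold import align

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]

# cost: number of misaligned tokens; a2b/b2a: one-to-one index mappings;
# a2b_multi/b2a_multi: one-to-many mappings for split or merged tokens.
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
```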
@@ -860,7 +860,7 @@ def custom_ner_wrapper(doc):
 
 The `custom_ner_wrapper` can then be added to the pipeline of a blank model
 using [`nlp.add_pipe`](/api/language#add_pipe). You can also replace the
-existing entity recognizer of a pre-trained model with
+existing entity recognizer of a pretrained model with
 [`nlp.replace_pipe`](/api/language#replace_pipe).
 
 Here's another example of a custom model, `your_custom_model`, that takes a list
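The two options look like this in code (assuming `custom_ner_wrapper` is the function defined in the hunk above):

```python
import spacy

# Option 1: append the wrapper to a blank pipeline.
nlp = spacy.blank("en")
nlp.add_pipe(custom_ner_wrapper)

# Option 2: swap out the entity recognizer of a pretrained model.
nlp_pretrained = spacy.load("en_core_web_sm")
nlp_pretrained.replace_pipe("ner", custom_ner_wrapper)
```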
@@ -1078,7 +1078,7 @@ order to implement more abstract logic.
 
 ### Example: Expanding named entities {#models-rules-ner}
 
-When using the a pre-trained
+When using the a pretrained
 [named entity recognition](/usage/linguistic-features/#named-entities) model to
 extract information from your texts, you may find that the predicted span only
 includes parts of the entity you're looking for. Sometimes, this happens if
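A hedged sketch of the kind of rule-based expansion this section goes on to build (the title list and component wiring are illustrative):

```python
import spacy
from spacy.tokens import Span


def expand_person_entities(doc):
    """Widen PERSON entities to include an immediately preceding title."""
    new_ents = []
    for ent in doc.ents:
        if ent.label_ == "PERSON" and ent.start != 0:
            prev_token = doc[ent.start - 1]
            if prev_token.text in ("Dr", "Dr.", "Mr", "Mr.", "Ms", "Ms."):
                new_ents.append(Span(doc, ent.start - 1, ent.end, label=ent.label))
                continue
        new_ents.append(ent)
    doc.ents = new_ents
    return doc


nlp = spacy.load("en_core_web_sm")
nlp.add_pipe(expand_person_entities, after="ner")
```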
@@ -321,7 +321,7 @@ the `drop` keyword argument. See the [`Language`](/api/language) and
 ## Training the named entity recognizer {#ner}
 
 All [spaCy models](/models) support online learning, so you can update a
-pre-trained model with new examples. You'll usually need to provide many
+pretrained model with new examples. You'll usually need to provide many
 **examples** to meaningfully improve the system — a few hundred is a good start,
 although more is better.
 
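The online-learning loop this section describes looks roughly like the following (the training examples are placeholders; real use needs far more data):

```python
import random
import spacy

nlp = spacy.load("en_core_web_sm")

# Placeholder examples; a few hundred or more are needed in practice.
TRAIN_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
    ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]

optimizer = nlp.resume_training()
for itn in range(20):
    random.shuffle(TRAIN_DATA)
    for text, annotations in TRAIN_DATA:
        # update() steps through the example, adjusting weights via the optimizer
        nlp.update([text], [annotations], drop=0.35, sgd=optimizer)
```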
@@ -347,7 +347,7 @@ your data** to find a solution that works best for you.
 ### Updating the Named Entity Recognizer {#example-train-ner}
 
 This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pre-trained model, or from scratch
+examples, starting off with an existing, pretrained model, or from scratch
 using a blank `Language` class. To do this, you'll need **example texts** and
 the **character offsets** and **labels** of each entity contained in the texts.
 
@@ -376,7 +376,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
 ### Training an additional entity type {#example-new-entity-type}
 
 This script shows how to add a new entity type `ANIMAL` to an existing
-pre-trained NER model, or an empty `Language` class. To keep the example short
+pretrained NER model, or an empty `Language` class. To keep the example short
 and simple, only a few sentences are provided as examples. In practice, you'll
 need many more — a few hundred would be a good start. You will also likely need
 to mix in examples of other entity types, which might be obtained by running the
@@ -440,7 +440,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
    training the parser.
 2. **Add the dependency labels** to the parser using the
    [`add_label`](/api/dependencyparser#add_label) method. If you're starting off
-   with a pre-trained spaCy model, this is usually not necessary – but it
+   with a pretrained spaCy model, this is usually not necessary – but it
    doesn't hurt either, just to be safe.
 3. **Shuffle and loop over** the examples. For each example, **update the
    model** by calling [`nlp.update`](/api/language#update), which steps through
@@ -624,7 +624,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
    a pre-defined [`vocab`](/api/vocab) object.
 2. **Pretrain the entity embeddings** by running the descriptions of the
    entities through a simple encoder-decoder network. The current implementation
-   requires the `nlp` model to have access to pre-trained word embeddings, but a
+   requires the `nlp` model to have access to pretrained word embeddings, but a
    custom implementation of this encoding step can also be used.
 3. **Construct the KB** by defining all entities with their pretrained vectors,
    and all aliases with their prior probabilities.
@@ -324,9 +324,9 @@ check if all of your models are up to date, you can run the
 
 - The lemmatization tables have been moved to their own package,
   [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), which
-  is not installed by default. If you're using pre-trained models, **nothing
+  is not installed by default. If you're using pretrained models, **nothing
   changes**, because the tables are now included in the model packages. If you
-  want to use the lemmatizer for other languages that don't yet have pre-trained
+  want to use the lemmatizer for other languages that don't yet have pretrained
   models (e.g. Turkish or Croatian) or start off with a blank model that
   contains lookup data (e.g. `spacy.blank("en")`), you'll need to **explicitly
   install spaCy plus data** via `pip install spacy[lookups]`.
@@ -1677,7 +1677,7 @@
         {
             "id": "spacy-pytorch-transformers",
             "title": "spacy-pytorch-transformers",
-            "slogan": "spaCy pipelines for pre-trained BERT, XLNet and GPT-2",
+            "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
             "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
             "github": "explosion/spacy-pytorch-transformers",
             "url": "https://explosion.ai/blog/spacy-pytorch-transformers",
@@ -1855,7 +1855,7 @@
         {
             "id": "models",
             "title": "Models",
-            "description": "Third-party pre-trained models for different languages and domains"
+            "description": "Third-party pretrained models for different languages and domains"
         }
     ]
 },
@@ -345,7 +345,7 @@ const Models = ({ pageContext, repo, children }) => {
 
     return (
         <>
-            <Title title={title} teaser={`Available pre-trained statistical models for ${title}`} />
+            <Title title={title} teaser={`Available pretrained statistical models for ${title}`} />
             <StaticQuery
                 query={query}
                 render={({ site }) =>
@@ -126,7 +126,7 @@ const Landing = ({ data }) => {
                     {counts.modelLangs} languages
                 </Li>
                 <Li>
-                    Pre-trained <strong>word vectors</strong>
+                    pretrained <strong>word vectors</strong>
                 </Li>
                 <Li>State-of-the-art speed</Li>
                 <Li>