Mirror of https://github.com/explosion/spaCy.git

Use consistent spelling

commit b6670bf0c2
parent 208629615d

@@ -3,9 +3,9 @@
 # spaCy: Industrial-strength NLP
 
 spaCy is a library for advanced Natural Language Processing in Python and
 Cython. It's built on the very latest research, and was designed from day one to
 be used in real products. spaCy comes with
-[pre-trained statistical models](https://spacy.io/models) and word vectors, and
+[pretrained statistical models](https://spacy.io/models) and word vectors, and
 currently supports tokenization for **50+ languages**. It features
 state-of-the-art speed, convolutional **neural network models** for tagging,
 parsing and **named entity recognition** and easy **deep learning** integration.

@@ -73,7 +73,7 @@ it.
 - Non-destructive **tokenization**
 - **Named entity** recognition
 - Support for **50+ languages**
-- Pre-trained [statistical models](https://spacy.io/models) and word vectors
+- Pretrained [statistical models](https://spacy.io/models) and word vectors
 - State-of-the-art speed
 - Easy **deep learning** integration
 - Part-of-speech tagging

@@ -376,7 +376,7 @@ def initialize_pipeline(nlp, docs, golds, config, device):
 
 
 def _load_pretrained_tok2vec(nlp, loc):
-    """Load pre-trained weights for the 'token-to-vector' part of the component
+    """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
     with Path(loc).open("rb") as file_:

@@ -472,7 +472,7 @@ class TreebankPaths(object):
     gpu_device=("Use GPU", "option", "g", int),
     use_oracle_segments=("Use oracle segments", "flag", "G", int),
     vectors_dir=(
-        "Path to directory with pre-trained vectors, named e.g. en/",
+        "Path to directory with pretrained vectors, named e.g. en/",
         "option",
         "v",
         Path,

@@ -38,10 +38,10 @@ def create_kb(
     # check the length of the nlp vectors
     if "vectors" in nlp.meta and nlp.vocab.vectors.size:
         input_dim = nlp.vocab.vectors_length
-        logger.info("Loaded pre-trained vectors of size %s" % input_dim)
+        logger.info("Loaded pretrained vectors of size %s" % input_dim)
     else:
         raise ValueError(
-            "The `nlp` object should have access to pre-trained word vectors, "
+            "The `nlp` object should have access to pretrained word vectors, "
             " cf. https://spacy.io/usage/models#languages."
         )
 

@@ -83,7 +83,7 @@ def main(
     # check the length of the nlp vectors
     if "vectors" not in nlp.meta or not nlp.vocab.vectors.size:
         raise ValueError(
-            "The `nlp` object should have access to pre-trained word vectors, "
+            "The `nlp` object should have access to pretrained word vectors, "
             " cf. https://spacy.io/usage/models#languages."
         )
 

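For reference, the guard in the two hunks above expects a model that actually ships with word vectors. A minimal sketch of the same check, where the model name is only an example of a package that has vectors:

```python
import spacy

# "en_core_web_md" is just an example of a model package with word
# vectors; any vectors-bearing model passes the same check.
nlp = spacy.load("en_core_web_md")
if "vectors" in nlp.meta and nlp.vocab.vectors.size:
    print("vector width:", nlp.vocab.vectors_length)
else:
    print("this model has no pretrained word vectors")
```
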
@@ -65,7 +65,7 @@ def main(
 
     # check that there is a NER component in the pipeline
     if "ner" not in nlp.pipe_names:
-        raise ValueError("The `nlp` object should have a pre-trained `ner` component.")
+        raise ValueError("The `nlp` object should have a pretrained `ner` component.")
 
     # STEP 2: create a training dataset from WP
     logger.info("STEP 2: reading training dataset from {}".format(training_path))

@@ -27,7 +27,7 @@ from bin.wiki_entity_linking.train_descriptions import EntityEncoder
 # Q7381115 (Russ Cochran): publisher
 ENTITIES = {"Q2146908": ("American golfer", 342), "Q7381115": ("publisher", 17)}
 
-INPUT_DIM = 300  # dimension of pre-trained input vectors
+INPUT_DIM = 300  # dimension of pretrained input vectors
 DESC_WIDTH = 64  # dimension of output entity vectors
 
 

@@ -39,7 +39,7 @@ DESC_WIDTH = 64  # dimension of output entity vectors
 )
 def main(vocab_path=None, model=None, output_dir=None, n_iter=50):
     """Load the model, create the KB and pretrain the entity encodings.
-    Either an nlp model or a vocab is needed to provide access to pre-trained word embeddings.
+    Either an nlp model or a vocab is needed to provide access to pretrained word embeddings.
     If an output_dir is provided, the KB will be stored there in a file 'kb'.
    When providing an nlp model, the updated vocab will also be written to a directory in the output_dir."""
     if model is None and vocab_path is None:

@@ -1,9 +1,9 @@
 """This script is experimental.
 
 Try pre-training the CNN component of the text categorizer using a cheap
-language modelling-like objective. Specifically, we load pre-trained vectors
+language modelling-like objective. Specifically, we load pretrained vectors
 (from something like word2vec, GloVe, FastText etc), and use the CNN to
-predict the tokens' pre-trained vectors. This isn't as easy as it sounds:
+predict the tokens' pretrained vectors. This isn't as easy as it sounds:
 we're not merely doing compression here, because heavy dropout is applied,
 including over the input words. This means the model must often (50% of the time)
 use the context in order to predict the word.

@@ -2,7 +2,7 @@
 # coding: utf8
 """Example of training an additional entity type
 
-This script shows how to add a new entity type to an existing pre-trained NER
+This script shows how to add a new entity type to an existing pretrained NER
 model. To keep the example short and simple, only four sentences are provided
 as examples. In practice, you'll need many more — a few hundred would be a
 good start. You will also likely need to mix in examples of other entity

@@ -96,9 +96,9 @@ def pretrain(
     """
     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components,
     using an approximate language-modelling objective. Specifically, we load
-    pre-trained vectors, and train a component like a CNN, BiLSTM, etc to predict
-    vectors which match the pre-trained ones. The weights are saved to a directory
-    after each epoch. You can then pass a path to one of these pre-trained weights
+    pretrained vectors, and train a component like a CNN, BiLSTM, etc to predict
+    vectors which match the pretrained ones. The weights are saved to a directory
+    after each epoch. You can then pass a path to one of these pretrained weights
     files to the 'spacy train' command.
 
     This technique may be especially helpful if you have little labelled data.

@@ -156,7 +156,7 @@ def pretrain(
             subword_features=True,  # Set to False for Chinese etc
         ),
     )
-    # Load in pre-trained weights
+    # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))

@@ -241,7 +241,7 @@ def train(
 
     nlp._optimizer = None
 
-    # Load in pre-trained weights
+    # Load in pretrained weights
     if init_tok2vec is not None:
         components = _load_pretrained_tok2vec(nlp, init_tok2vec)
         msg.text("Loaded pretrained tok2vec for: {}".format(components))

@@ -529,7 +529,7 @@ def _load_vectors(nlp, vectors):
 
 
 def _load_pretrained_tok2vec(nlp, loc):
-    """Load pre-trained weights for the 'token-to-vector' part of the component
+    """Load pretrained weights for the 'token-to-vector' part of the component
     models, which is typically a CNN. See 'spacy pretrain'. Experimental.
     """
     with loc.open("rb") as file_:

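The hunk cuts the function body off; as context, a sketch of how such a loader can work end to end. This mirrors the docstring above but is an illustration under assumptions, not a verbatim copy of the file:

```python
from pathlib import Path

def load_pretrained_tok2vec(nlp, loc):
    """Sketch: read the raw weights produced by `spacy pretrain` and copy
    them into every pipeline component that exposes a tok2vec sub-model.
    Illustrative only; see spacy/cli/train.py for the real code."""
    with Path(loc).open("rb") as file_:
        weights_data = file_.read()
    loaded = []
    for name, component in nlp.pipeline:
        if hasattr(component, "model") and hasattr(component.model, "tok2vec"):
            component.tok2vec.from_bytes(weights_data)
            loaded.append(name)
    return loaded
```
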
@@ -356,7 +356,7 @@ class Errors(object):
     E113 = ("The newly split token can only have one root (head = 0).")
     E114 = ("The newly split token needs to have a root (head = 0).")
     E115 = ("All subtokens must have associated heads.")
-    E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
+    E116 = ("Cannot currently add labels to pretrained text classifier. Add "
             "labels before training begins. This functionality was available "
             "in previous versions, but had significant bugs that led to poor "
             "performance.")

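E116 is avoided by registering all text classifier labels before the first call to `begin_training()`. A minimal sketch of the correct ordering:

```python
import spacy

nlp = spacy.blank("en")
textcat = nlp.create_pipe("textcat")
nlp.add_pipe(textcat)
# Add every label *before* training begins; adding labels to an already
# trained text classifier raises E116.
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
optimizer = nlp.begin_training()
```
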
@@ -482,7 +482,7 @@ class Errors(object):
             "Current DocBin: {current}\nOther DocBin: {other}")
     E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
             "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pre-trained model, make "
+            "morphological features. If you're using a pretrained model, make "
             "sure that your models are up to date:\npython -m spacy validate")
     E168 = ("Unknown field: {field}")
     E169 = ("Can't find module: {module}")

@@ -499,13 +499,13 @@ class Errors(object):
 
 @add_codes
 class TempErrors(object):
-    T003 = ("Resizing pre-trained Tagger models is not currently supported.")
+    T003 = ("Resizing pretrained Tagger models is not currently supported.")
     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
             "issue tracker: http://github.com/explosion/spaCy/issues")
     T008 = ("Bad configuration of Tagger. This is probably a bug within "
             "spaCy. We changed the name of an internal attribute for loading "
-            "pre-trained vectors, and the class has been passed the old name "
+            "pretrained vectors, and the class has been passed the old name "
             "(pretrained_dims) but not the new name (pretrained_vectors).")
 
 

@@ -521,7 +521,7 @@ class Language(object):
         """Make a "rehearsal" update to the models in the pipeline, to prevent
         forgetting. Rehearsal updates run an initial copy of the model over some
         data, and update the model so its current predictions are more like the
-        initial ones. This is useful for keeping a pre-trained model on-track,
+        initial ones. This is useful for keeping a pretrained model on-track,
         even if you're updating it with a smaller set of examples.
 
         docs (iterable): A batch of `Doc` objects.

@@ -627,7 +627,7 @@ class Language(object):
         return self._optimizer
 
     def resume_training(self, sgd=None, **cfg):
-        """Continue training a pre-trained model.
+        """Continue training a pretrained model.
 
         Create and return an optimizer, and initialize "rehearsal" for any pipeline
         component that has a .rehearse() method. Rehearsal is used to prevent

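Putting `resume_training()` and `rehearse()` together, an update loop might look roughly like the sketch below. The annotated example and the raw text are placeholders:

```python
import random
import spacy

nlp = spacy.load("en_core_web_sm")
optimizer = nlp.resume_training()

# Placeholder data: a new annotated example, plus raw text from the
# original domain to rehearse on.
TRAIN_DATA = [("Uber blew through $1 million", {"entities": [(0, 4, "ORG")]})]
raw_docs = [nlp.make_doc("Some raw text from the original domain.")]

for itn in range(10):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        nlp.update([text], [annotations], sgd=optimizer, losses=losses)
    # Rehearsal nudges current predictions back towards the initial
    # model's, mitigating catastrophic forgetting.
    nlp.rehearse(raw_docs, sgd=optimizer, losses=losses)
```
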
@@ -125,7 +125,7 @@ class Pipe(object):
     def add_label(self, label):
         """Add an output label, to be predicted by the model.
 
-        It's possible to extend pre-trained models with new labels,
+        It's possible to extend pretrained models with new labels,
         but care should be taken to avoid the "catastrophic forgetting"
         problem.
         """

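In practice that means something like the following (the model name is only an example):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
ner = nlp.get_pipe("ner")
ner.add_label("ANIMAL")            # extend the pretrained model
optimizer = nlp.resume_training()  # then update it with new examples
```
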
@@ -439,10 +439,10 @@ $ token_vector_width=256 learn_rate=0.0001 spacy train [...]
 ## Pretrain {#pretrain new="2.1" tag="experimental"}
 
 Pre-train the "token to vector" (`tok2vec`) layer of pipeline components, using
-an approximate language-modeling objective. Specifically, we load pre-trained
+an approximate language-modeling objective. Specifically, we load pretrained
 vectors, and train a component like a CNN, BiLSTM, etc to predict vectors which
-match the pre-trained ones. The weights are saved to a directory after each
-epoch. You can then pass a path to one of these pre-trained weights files to the
+match the pretrained ones. The weights are saved to a directory after each
+epoch. You can then pass a path to one of these pretrained weights files to the
 `spacy train` command.
 
 This technique may be especially helpful if you have little labelled data.

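The same functionality should also be reachable from Python, since `pretrain` is the plac-annotated function behind the CLI and takes the three documented positional arguments. A hedged sketch, with placeholder file names:

```python
from spacy.cli import pretrain

# Roughly equivalent to:
#   python -m spacy pretrain texts.jsonl en_vectors_web_lg ./pretrain_out
pretrain("texts.jsonl", "en_vectors_web_lg", "./pretrain_out")
```
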
@@ -476,7 +476,7 @@ $ python -m spacy pretrain [texts_loc] [vectors_model] [output_dir]
 | `--n-save-every`, `-se` | option | Save model every X batches. |
 | `--init-tok2vec`, `-t2v` <Tag variant="new">2.1</Tag> | option | Path to pretrained weights for the token-to-vector parts of the models. See `spacy pretrain`. Experimental. |
 | `--epoch-start`, `-es` <Tag variant="new">2.1.5</Tag> | option | The epoch to start counting at. Only relevant when using `--init-tok2vec` and the given weight file has been renamed. Prevents unintended overwriting of existing weight files. |
-| **CREATES** | weights | The pre-trained weights that can be used to initialize `spacy train`. |
+| **CREATES** | weights | The pretrained weights that can be used to initialize `spacy train`. |
 
 ### JSONL format for raw text {#pretrain-jsonl}
 

@@ -6,11 +6,11 @@ source: spacy/kb.pyx
 new: 2.2
 ---
 
 The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
 objects, which are plausible external identifiers given a certain textual mention.
 Each such `Candidate` holds information from the relevant KB entities,
 such as its frequency in text and possible aliases.
-Each entity in the knowledge base also has a pre-trained entity vector of a fixed size.
+Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
 
 ## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
 

@@ -26,9 +26,9 @@ Create the knowledge base.
 | Name | Type | Description |
 | ----------------------- | ---------------- | ----------------------------------------- |
 | `vocab` | `Vocab` | A `Vocab` object. |
 | `entity_vector_length` | int | Length of the fixed-size entity vectors. |
 | **RETURNS** | `KnowledgeBase` | The newly constructed object. |
 
 
 ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
 

@@ -41,7 +41,7 @@ The length of the fixed-size entity vectors in the knowledge base.
 
 ## KnowledgeBase.add_entity {#add_entity tag="method"}
 
 Add an entity to the knowledge base, specifying its corpus frequency
 and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
 
 > #### Example

@@ -55,11 +55,11 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
 | --------------- | ------------- | ------------------------------------------------- |
 | `entity` | unicode | The unique entity identifier |
 | `freq` | float | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pre-trained vector of the entity |
+| `entity_vector` | vector | The pretrained vector of the entity |
 
 ## KnowledgeBase.set_entities {#set_entities tag="method"}
 
 Define the full list of entities in the knowledge base, specifying the corpus frequency
 and entity vector for each entity.
 
 > #### Example

@@ -76,9 +76,9 @@ and entity vector for each entity.
 
 ## KnowledgeBase.add_alias {#add_alias tag="method"}
 
 Add an alias or mention to the knowledge base, specifying its potential KB identifiers
 and their prior probabilities. The entity identifiers should refer to entities previously
 added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
 The sum of the prior probabilities should not exceed 1.
 
 > #### Example

|
||||||
|
|
||||||
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
||||||
|
|
||||||
Given a certain textual mention as input, retrieve a list of candidate entities
|
Given a certain textual mention as input, retrieve a list of candidate entities
|
||||||
of type [`Candidate`](/api/kb/#candidate_init).
|
of type [`Candidate`](/api/kb/#candidate_init).
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -167,7 +167,7 @@ of type [`Candidate`](/api/kb/#candidate_init).
|
||||||
|
|
||||||
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
||||||
|
|
||||||
Given a certain entity ID, retrieve its pre-trained entity vector.
|
Given a certain entity ID, retrieve its pretrained entity vector.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
>
|
>
|
||||||
|
@ -182,7 +182,7 @@ Given a certain entity ID, retrieve its pre-trained entity vector.
|
||||||
|
|
||||||
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
|
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
|
||||||
|
|
||||||
Given a certain entity ID and a certain textual mention, retrieve
|
Given a certain entity ID and a certain textual mention, retrieve
|
||||||
the prior probability of the fact that the mention links to the entity ID.
|
the prior probability of the fact that the mention links to the entity ID.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -213,7 +213,7 @@ Save the current state of the knowledge base to a directory.
|
||||||
|
|
||||||
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
|
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
|
||||||
|
|
||||||
Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
|
Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
|
||||||
should also be the same as the one used to create the KB.
|
should also be the same as the one used to create the KB.
|
||||||
|
|
||||||
> #### Example
|
> #### Example
|
||||||
|
@ -265,4 +265,4 @@ of a `KnowledgeBase`.
|
||||||
| `alias_` | unicode | The alias or textual mention |
|
| `alias_` | unicode | The alias or textual mention |
|
||||||
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
|
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
|
||||||
| `entity_freq` | long | The frequency of the entity in a typical corpus |
|
| `entity_freq` | long | The frequency of the entity in a typical corpus |
|
||||||
| `entity_vector` | vector | The pre-trained vector of the entity |
|
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||||
|
|
|
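Taken together, the `KnowledgeBase` methods touched in the hunks above compose as in the sketch below. The entity IDs are the ones used in the diffed example script; the frequencies and 3-dimensional vectors are toy values:

```python
import spacy
from spacy.kb import KnowledgeBase

nlp = spacy.blank("en")
kb = KnowledgeBase(vocab=nlp.vocab, entity_vector_length=3)

# Each entity gets a corpus frequency and a pretrained entity vector of
# the fixed size declared above.
kb.add_entity(entity="Q2146908", freq=342, entity_vector=[1, 9, -3])
kb.add_entity(entity="Q7381115", freq=17, entity_vector=[0, 3, 5])

# An alias maps a mention to candidate entities; the prior probabilities
# must not sum to more than 1.
kb.add_alias(alias="Russ Cochran", entities=["Q2146908", "Q7381115"],
             probabilities=[0.24, 0.7])

for candidate in kb.get_candidates("Russ Cochran"):
    print(candidate.entity_, candidate.prior_prob, candidate.entity_freq)
print(kb.get_vector("Q2146908"))
```
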
@@ -440,7 +440,7 @@ package exposes the data files via language-specific
 constructing the `Vocab` and [`Lookups`](/api/lookups). This allows easier
 access to the data, serialization with the models and file compression on disk
 (so your spaCy installation is smaller). If you want to use the lookup tables
-without a pre-trained model, you have to explicitly install spaCy with lookups
+without a pretrained model, you have to explicitly install spaCy with lookups
 via `pip install spacy[lookups]` or by installing
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data) in the
 same environment as spaCy.

@@ -93,7 +93,7 @@ https://github.com/explosion/spaCy/tree/master/examples/pipeline/multi_processin
 ### Training spaCy's Named Entity Recognizer {#training-ner}
 
 This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pre-trained model, or from scratch
+examples, starting off with an existing, pretrained model, or from scratch
 using a blank `Language` class.
 
 ```python

@@ -102,7 +102,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
 
 ### Training an additional entity type {#new-entity-type}
 
-This script shows how to add a new entity type to an existing pre-trained NER
+This script shows how to add a new entity type to an existing pretrained NER
 model. To keep the example short and simple, only four sentences are provided as
 examples. In practice, you'll need many more — a few hundred would be a good
 start.

@@ -114,7 +114,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_new_entit
 ### Training spaCy's Dependency Parser {#parser}
 
 This example shows how to update spaCy's dependency parser, starting off with an
-existing, pre-trained model, or from scratch using a blank `Language` class.
+existing, pretrained model, or from scratch using a blank `Language` class.
 
 ```python
 https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py

@@ -137,7 +137,7 @@ pre-processing.
 
 ### Model comparison {#spacy-models}
 
-In this section, we provide benchmark accuracies for the pre-trained model
+In this section, we provide benchmark accuracies for the pretrained model
 pipelines we distribute with spaCy. Evaluations are conducted end-to-end from
 raw text, with no "gold standard" pre-processing, over text from a mix of genres
 where possible.

@@ -56,7 +56,7 @@ run `pip install spacy[lookups]` or install
 [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data)
 separately. The lookups package is needed to create blank models with
 lemmatization data, and to lemmatize in languages that don't yet come with
-pre-trained models and aren't powered by third-party libraries.
+pretrained models and aren't powered by third-party libraries.
 
 </Infobox>
 

@@ -508,7 +508,7 @@ responsibility for ensuring that the data is left in a consistent state.
 
 <Infobox title="Annotation scheme">
 
-For details on the entity types available in spaCy's pre-trained models, see the
+For details on the entity types available in spaCy's pretrained models, see the
 [NER annotation scheme](/api/annotation#named-entities).
 
 </Infobox>

@@ -998,7 +998,7 @@ can sometimes tokenize things differently – for example, `"I'm"` →
 In situations like that, you often want to align the tokenization so that you
 can merge annotations from different sources together, or take vectors predicted
 by a
-[pre-trained BERT model](https://github.com/huggingface/pytorch-transformers)
+[pretrained BERT model](https://github.com/huggingface/pytorch-transformers)
 and apply them to spaCy tokens. spaCy's [`gold.align`](/api/goldparse#align)
 helper returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the
 number of misaligned tokens, the one-to-one mappings of token indices in both

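A small sketch of that helper in action, with wordpiece-style tokens on one side and spaCy-style tokens on the other:

```python
from spacy.gold import align

other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."]
spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."]
cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens)
print(cost)       # number of misaligned tokens
print(a2b)        # mapping of indices in other_tokens to spacy_tokens
print(b2a_multi)  # many-to-one mappings, e.g. "'s" covering "'" and "s"
```
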
@@ -860,7 +860,7 @@ def custom_ner_wrapper(doc):
 
 The `custom_ner_wrapper` can then be added to the pipeline of a blank model
 using [`nlp.add_pipe`](/api/language#add_pipe). You can also replace the
-existing entity recognizer of a pre-trained model with
+existing entity recognizer of a pretrained model with
 [`nlp.replace_pipe`](/api/language#replace_pipe).
 
 Here's another example of a custom model, `your_custom_model`, that takes a list

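For illustration, swapping the packaged entity recognizer for a custom wrapper might look like this; the wrapper body here is only a stand-in for the one discussed above:

```python
import spacy

def custom_ner_wrapper(doc):
    # Stand-in for the wrapper above: set doc.ents from an external
    # model's predictions, then return the doc.
    return doc

nlp = spacy.load("en_core_web_sm")
nlp.replace_pipe("ner", custom_ner_wrapper)
```
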
@@ -1078,7 +1078,7 @@ order to implement more abstract logic.
 
 ### Example: Expanding named entities {#models-rules-ner}
 
-When using the a pre-trained
+When using a pretrained
 [named entity recognition](/usage/linguistic-features/#named-entities) model to
 extract information from your texts, you may find that the predicted span only
 includes parts of the entity you're looking for. Sometimes, this happens if

@@ -321,7 +321,7 @@ the `drop` keyword argument. See the [`Language`](/api/language) and
 ## Training the named entity recognizer {#ner}
 
 All [spaCy models](/models) support online learning, so you can update a
-pre-trained model with new examples. You'll usually need to provide many
+pretrained model with new examples. You'll usually need to provide many
 **examples** to meaningfully improve the system — a few hundred is a good start,
 although more is better.
 

@@ -347,7 +347,7 @@ your data** to find a solution that works best for you.
 ### Updating the Named Entity Recognizer {#example-train-ner}
 
 This example shows how to update spaCy's entity recognizer with your own
-examples, starting off with an existing, pre-trained model, or from scratch
+examples, starting off with an existing, pretrained model, or from scratch
 using a blank `Language` class. To do this, you'll need **example texts** and
 the **character offsets** and **labels** of each entity contained in the texts.
 

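In outline, such an update run can look like the following sketch; the two training examples are toy data:

```python
import random
import spacy
from spacy.util import minibatch

# Toy data: texts plus character offsets and labels for each entity.
TRAIN_DATA = [
    ("Uber blew through $1 million a week", {"entities": [(0, 4, "ORG")]}),
    ("Google rebrands its business apps", {"entities": [(0, 6, "ORG")]}),
]

nlp = spacy.load("en_core_web_sm")  # an existing pretrained model
optimizer = nlp.resume_training()
# Disable everything except the entity recognizer during the updates.
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
with nlp.disable_pipes(*other_pipes):
    for itn in range(10):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for batch in minibatch(TRAIN_DATA, size=2):
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, drop=0.5, sgd=optimizer, losses=losses)
        print("Losses", losses)
```
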
@@ -376,7 +376,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_ner.py
 ### Training an additional entity type {#example-new-entity-type}
 
 This script shows how to add a new entity type `ANIMAL` to an existing
-pre-trained NER model, or an empty `Language` class. To keep the example short
+pretrained NER model, or an empty `Language` class. To keep the example short
 and simple, only a few sentences are provided as examples. In practice, you'll
 need many more — a few hundred would be a good start. You will also likely need
 to mix in examples of other entity types, which might be obtained by running the

@@ -440,7 +440,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/train_parser.py
    training the parser.
 2. **Add the dependency labels** to the parser using the
    [`add_label`](/api/dependencyparser#add_label) method. If you're starting off
-   with a pre-trained spaCy model, this is usually not necessary – but it
+   with a pretrained spaCy model, this is usually not necessary – but it
    doesn't hurt either, just to be safe.
 3. **Shuffle and loop over** the examples. For each example, **update the
    model** by calling [`nlp.update`](/api/language#update), which steps through

@@ -624,7 +624,7 @@ https://github.com/explosion/spaCy/tree/master/examples/training/pretrain_kb.py
    a pre-defined [`vocab`](/api/vocab) object.
 2. **Pretrain the entity embeddings** by running the descriptions of the
    entities through a simple encoder-decoder network. The current implementation
-   requires the `nlp` model to have access to pre-trained word embeddings, but a
+   requires the `nlp` model to have access to pretrained word embeddings, but a
    custom implementation of this encoding step can also be used.
 3. **Construct the KB** by defining all entities with their pretrained vectors,
    and all aliases with their prior probabilities.

@@ -324,9 +324,9 @@ check if all of your models are up to date, you can run the
 
 - The lemmatization tables have been moved to their own package,
   [`spacy-lookups-data`](https://github.com/explosion/spacy-lookups-data), which
-  is not installed by default. If you're using pre-trained models, **nothing
+  is not installed by default. If you're using pretrained models, **nothing
   changes**, because the tables are now included in the model packages. If you
-  want to use the lemmatizer for other languages that don't yet have pre-trained
+  want to use the lemmatizer for other languages that don't yet have pretrained
   models (e.g. Turkish or Croatian) or start off with a blank model that
   contains lookup data (e.g. `spacy.blank("en")`), you'll need to **explicitly
   install spaCy plus data** via `pip install spacy[lookups]`.

@@ -1677,7 +1677,7 @@
         {
             "id": "spacy-pytorch-transformers",
             "title": "spacy-pytorch-transformers",
-            "slogan": "spaCy pipelines for pre-trained BERT, XLNet and GPT-2",
+            "slogan": "spaCy pipelines for pretrained BERT, XLNet and GPT-2",
             "description": "This package provides spaCy model pipelines that wrap [Hugging Face's `pytorch-transformers`](https://github.com/huggingface/pytorch-transformers) package, so you can use them in spaCy. The result is convenient access to state-of-the-art transformer architectures, such as BERT, GPT-2, XLNet, etc.",
             "github": "explosion/spacy-pytorch-transformers",
             "url": "https://explosion.ai/blog/spacy-pytorch-transformers",

@@ -1855,7 +1855,7 @@
             {
                 "id": "models",
                 "title": "Models",
-                "description": "Third-party pre-trained models for different languages and domains"
+                "description": "Third-party pretrained models for different languages and domains"
             }
         ]
     },

@@ -345,7 +345,7 @@ const Models = ({ pageContext, repo, children }) => {
 
     return (
         <>
-            <Title title={title} teaser={`Available pre-trained statistical models for ${title}`} />
+            <Title title={title} teaser={`Available pretrained statistical models for ${title}`} />
             <StaticQuery
                 query={query}
                 render={({ site }) =>

@@ -126,7 +126,7 @@ const Landing = ({ data }) => {
                     {counts.modelLangs} languages
                 </Li>
                 <Li>
-                    Pre-trained <strong>word vectors</strong>
+                    Pretrained <strong>word vectors</strong>
                 </Li>
                 <Li>State-of-the-art speed</Li>
                 <Li>