Mirror of https://github.com/explosion/spaCy.git, synced 2025-01-12 18:26:30 +03:00
Document TokenVectorEncoder
This commit is contained in:
parent b687ad109d
commit 0fc05e54e4
@@ -38,21 +38,47 @@ from .parts_of_speech import X
 class TokenVectorEncoder(object):
-    '''Assign position-sensitive vectors to tokens, using a CNN or RNN.'''
+    """Assign position-sensitive vectors to tokens, using a CNN or RNN."""
     name = 'tok2vec'

     @classmethod
     def Model(cls, width=128, embed_size=5000, **cfg):
+        """Create a new statistical model for the class.
+
+        width (int): Output size of the model.
+        embed_size (int): Number of vectors in the embedding table.
+        **cfg: Config parameters.
+        RETURNS (Model): A `thinc.neural.Model` or similar instance.
+        """
         width = util.env_opt('token_vector_width', width)
         embed_size = util.env_opt('embed_size', embed_size)
         return Tok2Vec(width, embed_size, preprocess=None)

     def __init__(self, vocab, model=True, **cfg):
+        """Construct a new statistical model. Weights are not allocated on
+        initialisation.
+
+        vocab (Vocab): A `Vocab` instance. The model must share the same `Vocab`
+            instance with the `Doc` objects it will process.
+        model (Model): A `Model` instance, or `True` to allocate one later.
+        **cfg: Config parameters.
+
+        EXAMPLE:
+            >>> from spacy.pipeline import TokenVectorEncoder
+            >>> tok2vec = TokenVectorEncoder(nlp.vocab)
+            >>> tok2vec.model = tok2vec.Model(128, 5000)
+        """
         self.vocab = vocab
         self.doc2feats = doc2feats()
         self.model = model

     def __call__(self, docs, state=None):
+        """Add context-sensitive vectors to a `Doc`, e.g. from a CNN or LSTM
+        model. Vectors are set to the `Doc.tensor` attribute.
+
+        docs (Doc or iterable): One or more documents to add vectors to.
+        RETURNS (dict or None): Intermediate computations.
+        """
         if isinstance(docs, Doc):
             docs = [docs]
         tokvecs = self.predict(docs)
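The docstrings above describe the basic lifecycle: construct the component with a shared `Vocab`, attach a model, then call it on documents. A minimal usage sketch in the same doctest style as the new EXAMPLE block, assuming an `nlp` object obtained via `spacy.load('en')` (the model name and the call to `nlp.make_doc` are illustrative assumptions, not part of this commit):

    >>> import spacy
    >>> from spacy.pipeline import TokenVectorEncoder
    >>> nlp = spacy.load('en')                       # assumed model package
    >>> tok2vec = TokenVectorEncoder(nlp.vocab)      # weights not allocated yet
    >>> tok2vec.model = tok2vec.Model(width=128, embed_size=5000)
    >>> doc = nlp.make_doc(u'A sentence to encode.')
    >>> state = tok2vec(doc)                         # sets doc.tensor as a side effect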
@@ -62,6 +88,13 @@ class TokenVectorEncoder(object):
         return state

     def pipe(self, stream, batch_size=128, n_threads=-1):
+        """Process `Doc` objects as a stream.
+
+        stream (iterator): A sequence of `Doc` objects to process.
+        batch_size (int): Number of `Doc` objects to group.
+        n_threads (int): Number of threads.
+        YIELDS (tuple): Tuples of `(Doc, state)`.
+        """
         for batch in cytoolz.partition_all(batch_size, stream):
             docs, states = zip(*batch)
             tokvecs = self.predict(docs)
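Note that although the new docstring describes `stream` as a sequence of `Doc` objects, the body unpacks each batch with `docs, states = zip(*batch)`, so in this version the stream is expected to carry `(Doc, state)` pairs, which is also what `pipe` yields. A sketch under that assumption, continuing from the objects above:

    >>> texts = [u'First text.', u'Second text.']
    >>> stream = ((nlp.make_doc(text), None) for text in texts)   # (Doc, state) pairs
    >>> for doc, state in tok2vec.pipe(stream, batch_size=2):
    ...     print(len(doc), state)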
@@ -71,18 +104,35 @@ class TokenVectorEncoder(object):
         yield from zip(docs, states)

     def predict(self, docs):
+        """Return a single tensor for a batch of documents.
+
+        docs (iterable): A sequence of `Doc` objects.
+        RETURNS (object): Vector representations for each token in the documents.
+        """
         feats = self.doc2feats(docs)
         tokvecs = self.model(feats)
         return tokvecs

     def set_annotations(self, docs, tokvecs):
+        """Set the tensor attribute for a batch of documents.
+
+        docs (iterable): A sequence of `Doc` objects.
+        tokvecs (object): Vector representation for each token in the documents.
+        """
         start = 0
         for doc in docs:
             doc.tensor = tokvecs[start : start + len(doc)]
             start += len(doc)

-    def update(self, docs, golds, state=None,
-               drop=0., sgd=None):
+    def update(self, docs, golds, state=None, drop=0., sgd=None):
+        """Update the model.
+
+        docs (iterable): A batch of `Doc` objects.
+        golds (iterable): A batch of `GoldParse` objects.
+        drop (float): The dropout rate.
+        sgd (function): An optimizer.
+        RETURNS (dict): Results from the update.
+        """
         if isinstance(docs, Doc):
             docs = [docs]
             golds = [golds]
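`predict` and `set_annotations` split the work `__call__` does in one step: the batch is encoded into a single tensor, which is then sliced back onto each document by its length. A sketch of that relationship, continuing from the earlier setup and assuming the returned tensor behaves like a 2D array with one row per token:

    >>> docs = [nlp.make_doc(u'One sentence.'), nlp.make_doc(u'Another, longer sentence.')]
    >>> tokvecs = tok2vec.predict(docs)           # one tensor for the whole batch
    >>> tok2vec.set_annotations(docs, tokvecs)    # doc.tensor = tokvecs[start : start + len(doc)]
    >>> len(docs[0].tensor) == len(docs[0])
    True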
@@ -95,14 +145,26 @@ class TokenVectorEncoder(object):
         return state

     def get_loss(self, docs, golds, scores):
+        # TODO: implement
         raise NotImplementedError

     def begin_training(self, gold_tuples, pipeline=None):
+        """Allocate models, pre-process training data and acquire a trainer and
+        optimizer.
+
+        gold_tuples (iterable): Gold-standard training data.
+        pipeline (list): The pipeline the model is part of.
+        """
         self.doc2feats = doc2feats()
         if self.model is True:
             self.model = self.Model()

     def use_params(self, params):
+        """Replace weights of models in the pipeline with those provided in the
+        params dictionary.
+
+        params (dict): A dictionary of parameters keyed by model ID.
+        """
         with self.model.use_params(params):
             yield
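`begin_training` is what turns the `model=True` placeholder from `__init__` into an allocated model, via the `if self.model is True` check above. A small sketch of that behaviour (the empty `gold_tuples` argument is only an illustrative placeholder):

    >>> tok2vec = TokenVectorEncoder(nlp.vocab)   # model defaults to True
    >>> tok2vec.model is True
    True
    >>> tok2vec.begin_training(gold_tuples=[])
    >>> tok2vec.model is True                     # replaced by a freshly allocated Model
    False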