mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Add support for character features to Tok2Vec
This commit is contained in:
parent
eae384ebb2
commit
e1a83d15ed
103
spacy/_ml.py
103
spacy/_ml.py
|
@ -15,7 +15,7 @@ from thinc.api import uniqued, wrap, noop
|
||||||
from thinc.api import with_square_sequences
|
from thinc.api import with_square_sequences
|
||||||
from thinc.linear.linear import LinearModel
|
from thinc.linear.linear import LinearModel
|
||||||
from thinc.neural.ops import NumpyOps, CupyOps
|
from thinc.neural.ops import NumpyOps, CupyOps
|
||||||
from thinc.neural.util import get_array_module
|
from thinc.neural.util import get_array_module, copy_array
|
||||||
from thinc.neural.optimizers import Adam
|
from thinc.neural.optimizers import Adam
|
||||||
|
|
||||||
from thinc import describe
|
from thinc import describe
|
||||||
|
@ -273,6 +273,9 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
pretrained_vectors = kwargs.get("pretrained_vectors", None)
|
||||||
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
|
cnn_maxout_pieces = kwargs.get("cnn_maxout_pieces", 3)
|
||||||
subword_features = kwargs.get("subword_features", True)
|
subword_features = kwargs.get("subword_features", True)
|
||||||
|
char_embed = kwargs.get("char_embed", False)
|
||||||
|
if char_embed:
|
||||||
|
subword_features = False
|
||||||
conv_depth = kwargs.get("conv_depth", 4)
|
conv_depth = kwargs.get("conv_depth", 4)
|
||||||
bilstm_depth = kwargs.get("bilstm_depth", 0)
|
bilstm_depth = kwargs.get("bilstm_depth", 0)
|
||||||
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
cols = [ID, NORM, PREFIX, SUFFIX, SHAPE, ORTH]
|
||||||
|
@ -295,7 +298,7 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
if pretrained_vectors is not None:
|
if pretrained_vectors is not None:
|
||||||
glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
|
glove = StaticVectors(pretrained_vectors, width, column=cols.index(ID))
|
||||||
|
|
||||||
if subword_features:
|
if subword_features:
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(glove | norm | prefix | suffix | shape)
|
(glove | norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width * 5, pieces=3)),
|
>> LN(Maxout(width, width * 5, pieces=3)),
|
||||||
|
@ -310,8 +313,14 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
embed = uniqued(
|
embed = uniqued(
|
||||||
(norm | prefix | suffix | shape)
|
(norm | prefix | suffix | shape)
|
||||||
>> LN(Maxout(width, width * 4, pieces=3)),
|
>> LN(Maxout(width, width * 4, pieces=3)),
|
||||||
column=cols.index(ORTH),
|
column=cols.index(ORTH)
|
||||||
)
|
)
|
||||||
|
elif char_embed:
|
||||||
|
embed = concatenate_lists(
|
||||||
|
CharacterEmbed(nM=64, nC=8),
|
||||||
|
FeatureExtracter(cols) >> with_flatten(norm)
|
||||||
|
)
|
||||||
|
reduce_dimensions = LN(Maxout(width, 64*8+width, pieces=cnn_maxout_pieces))
|
||||||
else:
|
else:
|
||||||
embed = norm
|
embed = norm
|
||||||
|
|
||||||
|
@ -319,9 +328,23 @@ def Tok2Vec(width, embed_size, **kwargs):
|
||||||
ExtractWindow(nW=1)
|
ExtractWindow(nW=1)
|
||||||
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
|
>> LN(Maxout(width, width * 3, pieces=cnn_maxout_pieces))
|
||||||
)
|
)
|
||||||
tok2vec = FeatureExtracter(cols) >> with_flatten(
|
if char_embed:
|
||||||
embed >> convolution ** conv_depth, pad=conv_depth
|
tok2vec = (
|
||||||
)
|
embed
|
||||||
|
>> with_flatten(
|
||||||
|
reduce_dimensions
|
||||||
|
>> convolution ** conv_depth, pad=conv_depth
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
tok2vec = (
|
||||||
|
FeatureExtracter(cols)
|
||||||
|
>> with_flatten(
|
||||||
|
embed
|
||||||
|
>> convolution ** conv_depth, pad=conv_depth
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
if bilstm_depth >= 1:
|
if bilstm_depth >= 1:
|
||||||
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
tok2vec = tok2vec >> PyTorchBiLSTM(width, width, bilstm_depth)
|
||||||
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
# Work around thinc API limitations :(. TODO: Revise in Thinc 7
|
||||||
|
@ -537,7 +560,7 @@ def build_morphologizer_model(class_nums, **cfg):
|
||||||
else:
|
else:
|
||||||
token_vector_width = util.env_opt("token_vector_width", 128)
|
token_vector_width = util.env_opt("token_vector_width", 128)
|
||||||
pretrained_vectors = cfg.get("pretrained_vectors")
|
pretrained_vectors = cfg.get("pretrained_vectors")
|
||||||
subword_features = cfg.get("subword_features", True)
|
char_embed = cfg.get("char_embed", True)
|
||||||
with Model.define_operators({">>": chain, "+": add}):
|
with Model.define_operators({">>": chain, "+": add}):
|
||||||
if "tok2vec" in cfg:
|
if "tok2vec" in cfg:
|
||||||
tok2vec = cfg["tok2vec"]
|
tok2vec = cfg["tok2vec"]
|
||||||
|
@ -545,7 +568,7 @@ def build_morphologizer_model(class_nums, **cfg):
|
||||||
tok2vec = Tok2Vec(
|
tok2vec = Tok2Vec(
|
||||||
token_vector_width,
|
token_vector_width,
|
||||||
embed_size,
|
embed_size,
|
||||||
subword_features=subword_features,
|
char_embed=char_embed,
|
||||||
pretrained_vectors=pretrained_vectors,
|
pretrained_vectors=pretrained_vectors,
|
||||||
)
|
)
|
||||||
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
|
softmax = with_flatten(MultiSoftmax(class_nums, token_vector_width))
|
||||||
|
@ -688,7 +711,8 @@ def concatenate_lists(*layers, **kwargs): # pragma: no cover
|
||||||
concat = concatenate(*layers)
|
concat = concatenate(*layers)
|
||||||
|
|
||||||
def concatenate_lists_fwd(Xs, drop=0.0):
|
def concatenate_lists_fwd(Xs, drop=0.0):
|
||||||
drop *= drop_factor
|
if drop is not None:
|
||||||
|
drop *= drop_factor
|
||||||
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
|
lengths = ops.asarray([len(X) for X in Xs], dtype="i")
|
||||||
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
flat_y, bp_flat_y = concat.begin_update(Xs, drop=drop)
|
||||||
ys = ops.unflatten(flat_y, lengths)
|
ys = ops.unflatten(flat_y, lengths)
|
||||||
|
@ -776,3 +800,64 @@ def _replace_word(word, random_words, mask="[MASK]"):
|
||||||
return random_words.next()
|
return random_words.next()
|
||||||
else:
|
else:
|
||||||
return word
|
return word
|
||||||
|
|
||||||
|
|
||||||
|
def _uniform_init(lo, hi):
    """Build a weight initializer that samples uniformly between lo and hi.

    lo (float): Lower bound passed to the backend's random.uniform.
    hi (float): Upper bound passed to the backend's random.uniform.
    RETURNS (callable): A function taking (W, ops) that overwrites W with
        freshly sampled values of the same shape.
    """

    def init_uniform(W, ops):
        # Sample with the array module of the given ops, then hand the values
        # to copy_array together with the destination W (presumably an
        # in-place copy into W -- confirm against thinc.neural.util).
        samples = ops.xp.random.uniform(lo, hi, W.shape)
        copy_array(W, samples)

    return init_uniform
|
||||||
|
|
||||||
|
|
||||||
|
@describe.attributes(
    nM=Dimension("Vector dimensions"),
    nC=Dimension("Number of characters per word"),
    vectors=Synapses(
        "Embed matrix",
        lambda obj: (obj.nC, obj.nV, obj.nM),
        _uniform_init(-0.1, 0.1),
    ),
    d_vectors=Gradient("vectors"),
)
class CharacterEmbed(Model):
    """Embed each token as the concatenation of per-position character vectors.

    The weights table has shape (nC, nV, nM): one (nV, nM) embedding table for
    each of the nC character positions. For every token, the ID found at
    position j selects a row from table j, and the nC selected rows are
    concatenated into a single vector of width nO = nM * nC.

    nM (int): Width of each individual character vector.
    nC (int): Number of character positions used per token.
    """

    def __init__(self, nM=None, nC=None, **kwargs):
        Model.__init__(self, **kwargs)
        self.nM = nM
        self.nC = nC

    @property
    def nO(self):
        """Output width per token: nC character vectors of width nM each."""
        return self.nM * self.nC

    @property
    def nV(self):
        """Number of rows in each per-position table.

        NOTE(review): 256 presumably corresponds to the possible values of a
        single UTF-8 byte -- confirm against Doc.to_utf8_array.
        """
        return 256

    def begin_update(self, docs, drop=0.0):
        """Embed a batch of docs and return the outputs with a backprop callback.

        docs (list): Objects providing to_utf8_array(nr_char=...) and len().
        drop (float): Accepted for API compatibility; not used by this layer.
        RETURNS (tuple): (output, backprop), where output holds one array of
            shape (len(doc), nO) per doc, and backprop accumulates the
            gradient of the embedding table.
        """
        if not docs:
            # Keep the (output, callback) contract for empty batches, so that
            # callers which unpack the result do not fail.
            def backprop_nothing(d_vectors, sgd=None):
                return None

            return [], backprop_nothing
        ids = []
        output = []
        weights = self.vectors
        # This assists in indexing; it's like looping over this dimension.
        # Still consider this weird witchcraft... But thanks to Mark Neumann
        # for the tip.
        nCv = self.ops.xp.arange(self.nC)
        for doc in docs:
            # Assumed to be a (len(doc), nC) array of integer IDs below nV --
            # TODO confirm against Doc.to_utf8_array.
            doc_ids = doc.to_utf8_array(nr_char=self.nC)
            doc_vectors = self.ops.allocate((len(doc), self.nC, self.nM))
            # Let's say I have a 2d array of indices, and a 3d table of data.
            # What numpy incantation do I chant to get
            # output[i, j, k] == data[j, ids[i, j], k]?
            doc_vectors[:, nCv] = weights[nCv, doc_ids[:, nCv]]
            output.append(doc_vectors.reshape((len(doc), self.nO)))
            ids.append(doc_ids)

        def backprop_character_embed(d_vectors, sgd=None):
            gradient = self.d_vectors
            for doc_ids, d_doc_vectors in zip(ids, d_vectors):
                d_doc_vectors = d_doc_vectors.reshape(
                    (len(doc_ids), self.nC, self.nM)
                )
                # A fancy-indexed `+=` applies only one update per repeated
                # index, so tokens sharing the same ID at the same position
                # would silently lose gradient. scatter_add accumulates every
                # contribution instead.
                self.ops.scatter_add(
                    gradient, (nCv, doc_ids[:, nCv]), d_doc_vectors[:, nCv]
                )
            if sgd is not None:
                sgd(self._mem.weights, self._mem.gradient, key=self.id)
            # The inputs are docs rather than arrays, so there is no gradient
            # to pass further back.
            return None

        return output, backprop_character_embed
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user