spaCy/spacy/ml/_character_embed.py
Sofie Van Landeghem 5847be6022
Tok2Vec: extract-embed-encode (#5102)
* avoid changing original config

* fix elif structure, batch with just int crashes otherwise

* tok2vec example with doc2feats, encode and embed architectures

* further clean up MultiHashEmbed

* further generalize Tok2Vec to work with extract-embed-encode parts

* avoid initializing the charembed layer with Docs (for now ?)

* small fixes for bilstm config (still does not run)

* rename to core layer

* move new configs

* walk model to set nI instead of using core ref

* fix senter overfitting test to be more similar to the training data (avoid flakey behaviour)
2020-03-08 13:23:18 +01:00

55 lines
1.7 KiB
Python

from thinc.api import Model
def CharacterEmbed(nM, nC):
# nM: Number of dimensions per character. nC: Number of characters.
nO = nM * nC if (nM is not None and nC is not None) else None
return Model(
"charembed",
forward,
init=init,
dims={"nM": nM, "nC": nC, "nO": nO, "nV": 256},
params={"E": None},
).initialize()
def init(model, X=None, Y=None):
vectors_table = model.ops.alloc3f(
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
)
model.set_param("E", vectors_table)
def forward(model, docs, is_train):
if docs is None:
return []
ids = []
output = []
E = model.get_param("E")
nC = model.get_dim("nC")
nM = model.get_dim("nM")
nO = model.get_dim("nO")
# This assists in indexing; it's like looping over this dimension.
# Still consider this weird witch craft...But thanks to Mark Neumann
# for the tip.
nCv = model.ops.xp.arange(nC)
for doc in docs:
doc_ids = doc.to_utf8_array(nr_char=nC)
doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
# incantation do I chant to get
# output[i, j, k] == data[j, ids[i, j], k]?
doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
output.append(doc_vectors.reshape((len(doc), nO)))
ids.append(doc_ids)
def backprop(d_output):
dE = model.ops.alloc(E.shape, dtype=E.dtype)
for doc_ids, d_doc_vectors in zip(ids, d_output):
d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
model.inc_grad("E", dE)
return []
return output, backprop