from typing import List

from thinc.api import Model
from thinc.types import Floats2d

from ..tokens import Doc


def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
    # nM: Number of dimensions per character embedding.
    # nC: Number of characters (UTF-8 bytes) embedded per token.
    # nV: Size of the byte "vocabulary" (256 possible values).
    return Model(
        "charembed",
        forward,
        init=init,
        dims={"nM": nM, "nC": nC, "nO": nM * nC, "nV": 256},
        params={"E": None},
    )


def init(model: Model, X=None, Y=None):
    # One embedding table per character position: shape (nC, nV, nM).
    vectors_table = model.ops.alloc3f(
        model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
    )
    model.set_param("E", vectors_table)


def forward(model: Model, docs: List[Doc], is_train: bool):
    if docs is None:
        # Thinc's forward contract is (output, backprop), even for the empty case.
        return [], lambda d_output: []
    ids = []
    output = []
    E = model.get_param("E")
    nC = model.get_dim("nC")
    nM = model.get_dim("nM")
    nO = model.get_dim("nO")
    # This array assists in indexing: broadcasting against it is equivalent to
    # looping over the character dimension (thanks to Mark Neumann for the tip).
    nCv = model.ops.xp.arange(nC)
    for doc in docs:
        doc_ids = model.ops.asarray(doc.to_utf8_array(nr_char=nC))
        doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
        # Advanced indexing over a 2d array of IDs and a 3d embedding table:
        # for token i and character slot j, pick row doc_ids[i, j] from the
        # j-th table, so doc_vectors[i, j, k] == E[j, doc_ids[i, j], k].
        doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]
        output.append(doc_vectors.reshape((len(doc), nO)))
        ids.append(doc_ids)

    def backprop(d_output):
        # Route the gradients back to the embedding table rows that were
        # selected in the forward pass, then report them via inc_grad.
        dE = model.ops.alloc(E.shape, dtype=E.dtype)
        for doc_ids, d_doc_vectors in zip(ids, d_output):
            d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
            dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
        model.inc_grad("E", dE)
        return []

    return output, backprop
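
# ---------------------------------------------------------------------------
# Minimal sketch, not part of the layer: it only runs when the module is
# executed as a script (e.g. via `python -m`). It assumes spaCy is installed;
# the shapes and hyperparameters below are illustrative, not defaults.
# Part 1 shows how the layer might be constructed and applied; part 2 verifies
# the advanced-indexing identity used in forward() with plain numpy.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import numpy
    import spacy

    # 1) Usage sketch: embed the first nC UTF-8 bytes of each token.
    nlp = spacy.blank("en")
    docs = [nlp("hello world"), nlp("character embeddings")]
    model = CharacterEmbed(nM=16, nC=4)
    model.initialize(X=docs)
    outputs = model.predict(docs)
    # One (len(doc), nC * nM) array per doc.
    assert outputs[0].shape == (len(docs[0]), 4 * 16)

    # 2) The indexing trick from forward(), checked against explicit loops:
    #    doc_vectors[i, j, k] == E[j, doc_ids[i, j], k]
    n_tokens, nC, nV, nM = 5, 4, 256, 16
    rng = numpy.random.default_rng(0)
    E = rng.normal(size=(nC, nV, nM)).astype("float32")
    doc_ids = rng.integers(0, nV, size=(n_tokens, nC))
    nCv = numpy.arange(nC)

    # Vectorised lookup: one embedding table per character position.
    doc_vectors = numpy.zeros((n_tokens, nC, nM), dtype="float32")
    doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]]

    # Reference implementation with explicit loops over tokens and slots.
    expected = numpy.zeros_like(doc_vectors)
    for i in range(n_tokens):
        for j in range(nC):
            expected[i, j] = E[j, doc_ids[i, j]]
    assert numpy.allclose(doc_vectors, expected)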