mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-27 02:16:32 +03:00
e2b70df012
* Use isort with Black profile * isort all the things * Fix import cycles as a result of import sorting * Add DOCBIN_ALL_ATTRS type definition * Add isort to requirements * Remove isort from build dependencies check * Typo
61 lines
1.9 KiB
Python
61 lines
1.9 KiB
Python
from typing import List
|
|
|
|
from thinc.api import Model
|
|
from thinc.types import Floats2d
|
|
|
|
from ..tokens import Doc
|
|
from ..util import registry
|
|
|
|
|
|
@registry.layers("spacy.CharEmbed.v1")
|
|
def CharacterEmbed(nM: int, nC: int) -> Model[List[Doc], List[Floats2d]]:
|
|
# nM: Number of dimensions per character. nC: Number of characters.
|
|
return Model(
|
|
"charembed",
|
|
forward,
|
|
init=init,
|
|
dims={"nM": nM, "nC": nC, "nO": nM * nC, "nV": 256},
|
|
params={"E": None},
|
|
)
|
|
|
|
|
|
def init(model: Model, X=None, Y=None):
|
|
vectors_table = model.ops.alloc3f(
|
|
model.get_dim("nC"), model.get_dim("nV"), model.get_dim("nM")
|
|
)
|
|
model.set_param("E", vectors_table)
|
|
|
|
|
|
def forward(model: Model, docs: List[Doc], is_train: bool):
|
|
if docs is None:
|
|
return []
|
|
ids = []
|
|
output = []
|
|
E = model.get_param("E")
|
|
nC = model.get_dim("nC")
|
|
nM = model.get_dim("nM")
|
|
nO = model.get_dim("nO")
|
|
# This assists in indexing; it's like looping over this dimension.
|
|
# Still consider this weird witch craft...But thanks to Mark Neumann
|
|
# for the tip.
|
|
nCv = model.ops.xp.arange(nC)
|
|
for doc in docs:
|
|
doc_ids = model.ops.asarray(doc.to_utf8_array(nr_char=nC))
|
|
doc_vectors = model.ops.alloc3f(len(doc), nC, nM)
|
|
# Let's say I have a 2d array of indices, and a 3d table of data. What numpy
|
|
# incantation do I chant to get
|
|
# output[i, j, k] == data[j, ids[i, j], k]?
|
|
doc_vectors[:, nCv] = E[nCv, doc_ids[:, nCv]] # type: ignore[call-overload, index]
|
|
output.append(doc_vectors.reshape((len(doc), nO)))
|
|
ids.append(doc_ids)
|
|
|
|
def backprop(d_output):
|
|
dE = model.ops.alloc(E.shape, dtype=E.dtype)
|
|
for doc_ids, d_doc_vectors in zip(ids, d_output):
|
|
d_doc_vectors = d_doc_vectors.reshape((len(doc_ids), nC, nM))
|
|
dE[nCv, doc_ids[:, nCv]] += d_doc_vectors[:, nCv]
|
|
model.inc_grad("E", dE)
|
|
return []
|
|
|
|
return output, backprop
|