Add experimental NeuralLabeller

Matthew Honnibal 2017-05-21 17:52:30 -05:00
parent 9b1b0742fd
commit 8d1e64be69
2 changed files with 45 additions and 1 deletion

View File

@@ -16,6 +16,7 @@ from .syntax.parser import get_templates
 from .syntax.nonproj import PseudoProjectivity
 from .pipeline import NeuralDependencyParser, EntityRecognizer
 from .pipeline import TokenVectorEncoder, NeuralTagger, NeuralEntityRecognizer
+from .pipeline import NeuralLabeller
 from .compat import json_dumps
 from .attrs import IS_STOP
 from .lang.punctuation import TOKENIZER_PREFIXES, TOKENIZER_SUFFIXES, TOKENIZER_INFIXES
@@ -230,7 +231,7 @@ class Language(object):
         for doc, gold in docs_golds:
             yield doc, gold
 
-    def begin_training(self, gold_tuples, **cfg):
+    def begin_training(self, get_gold_tuples, **cfg):
         """Allocate models, pre-process training data and acquire a trainer and
         optimizer. Used as a contextmanager.
@@ -244,6 +245,7 @@ class Language(object):
         >>> for docs, golds in epoch:
         >>>     state = nlp.update(docs, golds, sgd=optimizer)
         """
+        self.pipeline.append(NeuralLabeller(self.vocab))
         # Populate vocab
         for _, annots_brackets in get_gold_tuples():
            for annots, _ in annots_brackets:
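
Note the signature change in the second hunk: `begin_training` now takes `get_gold_tuples` and, as the third hunk shows, the body calls it as `get_gold_tuples()`. Callers must therefore pass a zero-argument callable that returns the training tuples, not the materialized tuples themselves. A minimal sketch of the new calling convention, with toy data (the annotation values and the bare `Language()` setup are illustrative assumptions, not part of the commit):

    from spacy.language import Language

    def get_gold_tuples():
        # One (raw_text, [(annotations, brackets), ...]) pair; each
        # annotations tuple is (ids, words, tags, heads, deps, ents).
        ids = [0, 2]
        words = ['A', 'sentence']
        tags = ['DT', 'NN']
        heads = [1, 1]
        deps = ['det', 'ROOT']
        ents = ['O', 'O']
        return [('A sentence', [((ids, words, tags, heads, deps, ents), [])])]

    nlp = Language()
    # Pass the callable, not its result: begin_training() can then iterate
    # the data more than once, which an exhausted generator would not allow.
    nlp.begin_training(get_gold_tuples)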

View File

@@ -31,6 +31,7 @@ from .syntax.stateclass cimport StateClass
 from .gold cimport GoldParse
 from .morphology cimport Morphology
 from .vocab cimport Vocab
+from .syntax.nonproj import PseudoProjectivity
 
 from .attrs import ID, LOWER, PREFIX, SUFFIX, SHAPE, TAG, DEP, POS
 from ._ml import rebatch, Tok2Vec, flatten, get_col, doc2feats
@@ -148,6 +149,7 @@ class TokenVectorEncoder(object):
         if self.model is True:
             self.model = self.Model()
+
 
     def use_params(self, params):
         """Replace weights of models in the pipeline with those provided in the
         params dictionary.
@@ -252,6 +254,46 @@ class NeuralTagger(object):
         with self.model.use_params(params):
             yield
 
+
+class NeuralLabeller(NeuralTagger):
+    name = 'nn_labeller'
+    def __init__(self, vocab, model=True):
+        self.vocab = vocab
+        self.model = model
+        self.labels = {}
+
+    def set_annotations(self, docs, dep_ids):
+        pass
+
+    def begin_training(self, gold_tuples, pipeline=None):
+        gold_tuples = PseudoProjectivity.preprocess_training_data(gold_tuples)
+        for raw_text, annots_brackets in gold_tuples:
+            for annots, brackets in annots_brackets:
+                ids, words, tags, heads, deps, ents = annots
+                for dep in deps:
+                    if dep not in self.labels:
+                        self.labels[dep] = len(self.labels)
+        token_vector_width = pipeline[0].model.nO
+        self.model = with_flatten(
+            Softmax(len(self.labels), token_vector_width))
+
+    def get_loss(self, docs, golds, scores):
+        scores = self.model.ops.flatten(scores)
+        cdef int idx = 0
+        correct = numpy.zeros((scores.shape[0],), dtype='i')
+        guesses = scores.argmax(axis=1)
+        for gold in golds:
+            for tag in gold.labels:
+                if tag is None:
+                    correct[idx] = guesses[idx]
+                else:
+                    correct[idx] = self.labels[tag]
+                idx += 1
+        correct = self.model.ops.xp.array(correct, dtype='i')
+        d_scores = scores - to_categorical(correct, nb_classes=scores.shape[1])
+        loss = (d_scores**2).sum()
+        d_scores = self.model.ops.unflatten(d_scores, [len(d) for d in docs])
+        return float(loss), d_scores
 
 
 cdef class EntityRecognizer(LinearParser):
     """Annotate named entities on Doc objects."""