Update train_ner_standalone example

2025-07-08 13:53:13 +03:00 · 2017-09-15 10:36:46 +02:00 · 2017-09-15 10:36:46 +02:00 · 027a5d8b75
commit 027a5d8b75
parent d84607f6bb
1 changed files with 80 additions and 112 deletions
--- a/examples/training/train_ner_standalone.py
+++ b/examples/training/train_ner_standalone.py
@ -13,24 +13,27 @@ Input data:
 https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip
 Developed for: spaCy 1.7.1
-Last tested for: spaCy 1.7.1
+Last tested for: spaCy 2.0.0a13
 '''
 from __future__ import unicode_literals, print_function
 import plac
 from pathlib import Path
 import random
 import json
 from thinc.neural.optimizers import Adam
 from thinc.neural.ops import NumpyOps
 import tqdm
 import spacy.orth as orth_funcs
 from spacy.vocab import Vocab
-from spacy.pipeline import BeamEntityRecognizer
+from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
 from spacy.pipeline import EntityRecognizer
 from spacy.tokenizer import Tokenizer
 from spacy.tokens import Doc
 from spacy.attrs import *
 from spacy.gold import GoldParse
-from spacy.gold import _iob_to_biluo as iob_to_biluo
+from spacy.gold import iob_to_biluo
 from spacy.gold import minibatch
 from spacy.scorer import Scorer
 import spacy.util
 try:
    unicode
@ -38,95 +41,40 @@ except NameError:
    unicode = str
 spacy.util.set_env_log(True)
 def init_vocab():
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
-            SHAPE: orth_funcs.word_shape,
+            NORM: lambda string: string.lower(),
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
            CLUSTER: lambda string: 0,
            IS_ALPHA: orth_funcs.is_alpha,
            IS_ASCII: orth_funcs.is_ascii,
            IS_DIGIT: lambda string: string.isdigit(),
            IS_LOWER: orth_funcs.is_lower,
            IS_PUNCT: orth_funcs.is_punct,
            IS_SPACE: lambda string: string.isspace(),
            IS_TITLE: orth_funcs.is_title,
            IS_UPPER: orth_funcs.is_upper,
            IS_STOP: lambda string: False,
            IS_OOV: lambda string: True
        })
 def save_vocab(vocab, path):
    path = Path(path)
    if not path.exists():
        path.mkdir()
    elif not path.is_dir():
        raise IOError("Can't save vocab to %s\nNot a directory" % path)
    with (path / 'strings.json').open('w') as file_:
        vocab.strings.dump(file_)
    vocab.dump((path / 'lexemes.bin').as_posix())
 def load_vocab(path):
    path = Path(path)
    if not path.exists():
        raise IOError("Cannot load vocab from %s\nDoes not exist" % path)
    if not path.is_dir():
        raise IOError("Cannot load vocab from %s\nNot a directory" % path)
    return Vocab.load(path)
 def init_ner_model(vocab, features=None):
    if features is None:
        features = tuple(EntityRecognizer.feature_templates)
    return EntityRecognizer(vocab, features=features)
 def save_ner_model(model, path):
    path = Path(path)
    if not path.exists():
        path.mkdir()
    if not path.is_dir():
        raise IOError("Can't save model to %s\nNot a directory" % path)
    model.model.dump((path / 'model').as_posix())
    with (path / 'config.json').open('w') as file_:
        data = json.dumps(model.cfg)
        if not isinstance(data, unicode):
            data = data.decode('utf8')
        file_.write(data)
 def load_ner_model(vocab, path):
    return EntityRecognizer.load(path, vocab)
 class Pipeline(object):
-    @classmethod
+    def __init__(self, vocab=None, tokenizer=None, tensorizer=None, entity=None):
    def load(cls, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        vocab = load_vocab(path)
        tokenizer = Tokenizer(vocab, {}, None, None, None)
        ner_model = load_ner_model(vocab, path / 'ner')
        return cls(vocab, tokenizer, ner_model)
    def __init__(self, vocab=None, tokenizer=None, entity=None):
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if tensorizer is None:
            tensorizer = TokenVectorEncoder(vocab)
        if entity is None:
-            entity = init_ner_model(self.vocab)
+            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.tensorizer = tensorizer
        self.entity = entity
-        self.pipeline = [self.entity]
+        self.pipeline = [tensorizer, self.entity]
    def begin_training(self):
        for model in self.pipeline:
            model.begin_training([])
        optimizer = Adam(NumpyOps(), 0.001)
        return optimizer
    def __call__(self, input_):
        doc = self.make_doc(input_)
@ -147,14 +95,18 @@ class Pipeline(object):
        gold = GoldParse(doc, entities=annotations)
        return gold
-    def update(self, input_, annot):
+    def update(self, inputs, annots, sgd, losses=None, drop=0.):
-        doc = self.make_doc(input_)
+        if losses is None:
-        gold = self.make_gold(input_, annot)
+            losses = {}
-        for ner in gold.ner:
+        docs = [self.make_doc(input_) for input_ in inputs]
-            if ner not in (None, '-', 'O'):
+        golds = [self.make_gold(input_, annot) for input_, annot in
-                action, label = ner.split('-', 1)
+                 zip(inputs, annots)]
-                self.entity.add_label(label)
+
-        return self.entity.update(doc, gold)
+        tensors, bp_tensors = self.tensorizer.update(docs, golds, drop=drop)
        d_tensors = self.entity.update((docs, tensors), golds, drop=drop,
                                      sgd=sgd, losses=losses)
        bp_tensors(d_tensors, sgd=sgd)
        return losses
    def evaluate(self, examples):
        scorer = Scorer()
@ -164,34 +116,38 @@ class Pipeline(object):
            scorer.score(doc, gold)
        return scorer.scores
-    def average_weights(self):
+    def to_disk(self, path):
        self.entity.model.end_training()
    def save(self, path):
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
-        save_vocab(self.vocab, path / 'vocab')
+        self.vocab.to_disk(path / 'vocab')
-        save_ner_model(self.entity, path / 'ner')
+        self.tensorizer.to_disk(path / 'tensorizer')
        self.entity.to_disk(path / 'ner')
    def from_disk(self, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
        self.tensorizer = self.tensorizer.from_disk(path / 'tensorizer')
        self.entity = self.entity.from_disk(path / 'ner')
-def train(nlp, train_examples, dev_examples, ctx, nr_epoch=5):
+def train(nlp, train_examples, dev_examples, nr_epoch=5):
-    next_epoch = train_examples
+    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):
-        this_epoch = next_epoch
+        random.shuffle(train_examples)
-        next_epoch = []
+        losses = {}
-        loss = 0
+        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
-        for input_, annot in this_epoch:
+            inputs, annots = zip(*batch)
-            loss += nlp.update(input_, annot)
+            nlp.update(list(inputs), list(annots), sgd, losses=losses)
            if (i+1) < nr_epoch:
                next_epoch.append((input_, annot))
        random.shuffle(next_epoch)
        scores = nlp.evaluate(dev_examples)
-        report_scores(i, loss, scores)
+        report_scores(i, losses['ner'], scores)
    nlp.average_weights()
    scores = nlp.evaluate(dev_examples)
    report_scores(channels, i+1, loss, scores)
@ -208,7 +164,8 @@ def read_examples(path):
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
-            if not sent.strip():
+            sent = sent.strip()
            if not sent:
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
@ -217,28 +174,39 @@ def read_examples(path):
            iob = []
            for token in tokens:
                if token.strip():
-                    pieces = token.split()
+                    pieces = token.split('\t')
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)
 def get_labels(examples):
    labels = set()
    for words, tags in examples:
        for tag in tags:
            if '-' in tag:
                labels.add(tag.split('-')[1])
    return sorted(labels)
@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
 )
-def main(model_dir=Path('/home/matt/repos/spaCy/spacy/data/de-1.0.0'),
+def main(model_dir, train_loc, dev_loc, nr_epoch=30):
-        train_loc=None, dev_loc=None, nr_epoch=30):
+    print(model_dir, train_loc, dev_loc)
-    
+    train_examples = list(read_examples(train_loc))
    train_examples = read_examples(train_loc)
    dev_examples = read_examples(dev_loc)
-    nlp = Pipeline.load(model_dir)
+    nlp = Pipeline()
    for label in get_labels(train_examples):
        nlp.entity.add_label(label)
        print("Add label", label)
-    train(nlp, train_examples, list(dev_examples), ctx, nr_epoch)
+    train(nlp, train_examples, list(dev_examples), nr_epoch)
-    nlp.save(model_dir)
+    nlp.to_disk(model_dir)
 if __name__ == '__main__':
-    main()
+    plac.call(main)