mirror of https://github.com/explosion/spaCy.git (synced 2025-04-27 20:33:42 +03:00)
Remove old example
This commit is contained in:
parent
e6536d231f
commit
c30258c3a2
@@ -1,206 +0,0 @@
#!/usr/bin/env python
'''Example of training a named entity recognition system from scratch using spaCy.

This example is written to be self-contained and reasonably transparent.
To achieve that, it duplicates some of spaCy's internal functionality.

Specifically, in this example, we don't use spaCy's built-in Language class to
wire together the Vocab, Tokenizer and EntityRecognizer. Instead, we write
our own simple Pipeline class, so that it's easier to see how the pieces
interact.

Input data:
https://www.lt.informatik.tu-darmstadt.de/fileadmin/user_upload/Group_LangTech/data/GermEval2014_complete_data.zip

Developed for: spaCy 1.7.1
Last tested for: spaCy 2.0.0a13
'''
from __future__ import unicode_literals, print_function

import plac
from pathlib import Path
import random
import json
import tqdm

from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from spacy.vocab import Vocab
from spacy.pipeline import TokenVectorEncoder, NeuralEntityRecognizer
from spacy.tokenizer import Tokenizer
from spacy.tokens import Doc
from spacy.attrs import *
from spacy.gold import GoldParse
from spacy.gold import iob_to_biluo
from spacy.gold import minibatch
from spacy.scorer import Scorer
import spacy.util

# Python 2/3 compatibility: 'unicode' only exists on Python 2.
try:
    unicode
except NameError:
    unicode = str


spacy.util.set_env_log(True)


def init_vocab():
    # Build a blank Vocab with simple lexical attribute getters, instead of
    # loading a language-specific one.
    return Vocab(
        lex_attr_getters={
            LOWER: lambda string: string.lower(),
            NORM: lambda string: string.lower(),
            PREFIX: lambda string: string[0],
            SUFFIX: lambda string: string[-3:],
        })


class Pipeline(object):
    '''Minimal stand-in for spaCy's Language class: wires together the Vocab,
    Tokenizer and EntityRecognizer and runs them in sequence.'''
    def __init__(self, vocab=None, tokenizer=None, entity=None):
        if vocab is None:
            vocab = init_vocab()
        if tokenizer is None:
            tokenizer = Tokenizer(vocab, {}, None, None, None)
        if entity is None:
            entity = NeuralEntityRecognizer(vocab)
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.entity = entity
        self.pipeline = [self.entity]

    def begin_training(self):
        for model in self.pipeline:
            model.begin_training([])
        optimizer = Adam(NumpyOps(), 0.001)
        return optimizer

    def __call__(self, input_):
        doc = self.make_doc(input_)
        for process in self.pipeline:
            process(doc)
        return doc

    def make_doc(self, input_):
        # Accept raw text (bytes or unicode) or a pre-tokenized list of words.
        if isinstance(input_, bytes):
            input_ = input_.decode('utf8')
        if isinstance(input_, unicode):
            return self.tokenizer(input_)
        else:
            return Doc(self.vocab, words=input_)

    def make_gold(self, input_, annotations):
        doc = self.make_doc(input_)
        gold = GoldParse(doc, entities=annotations)
        return gold

    def update(self, inputs, annots, sgd, losses=None, drop=0.):
        if losses is None:
            losses = {}
        docs = [self.make_doc(input_) for input_ in inputs]
        golds = [self.make_gold(input_, annot) for input_, annot in
                 zip(inputs, annots)]

        self.entity.update(docs, golds, drop=drop,
                           sgd=sgd, losses=losses)
        return losses

    def evaluate(self, examples):
        scorer = Scorer()
        for input_, annot in examples:
            gold = self.make_gold(input_, annot)
            doc = self(input_)
            scorer.score(doc, gold)
        return scorer.scores

    def to_disk(self, path):
        path = Path(path)
        if not path.exists():
            path.mkdir()
        elif not path.is_dir():
            raise IOError("Can't save pipeline to %s\nNot a directory" % path)
        self.vocab.to_disk(path / 'vocab')
        self.entity.to_disk(path / 'ner')

    def from_disk(self, path):
        path = Path(path)
        if not path.exists():
            raise IOError("Cannot load pipeline from %s\nDoes not exist" % path)
        if not path.is_dir():
            raise IOError("Cannot load pipeline from %s\nNot a directory" % path)
        self.vocab = self.vocab.from_disk(path / 'vocab')
        self.entity = self.entity.from_disk(path / 'ner')


def train(nlp, train_examples, dev_examples, nr_epoch=5):
    # Each epoch: shuffle, update on minibatches of 8 examples, then score on
    # the development set.
    sgd = nlp.begin_training()
    print("Iter", "Loss", "P", "R", "F")
    for i in range(nr_epoch):
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(tqdm.tqdm(train_examples, leave=False), size=8):
            inputs, annots = zip(*batch)
            nlp.update(list(inputs), list(annots), sgd, losses=losses)
        scores = nlp.evaluate(dev_examples)
        report_scores(i + 1, losses['ner'], scores)


def report_scores(i, loss, scores):
    precision = '%.2f' % scores['ents_p']
    recall = '%.2f' % scores['ents_r']
    f_measure = '%.2f' % scores['ents_f']
    print('Epoch %d: %d %s %s %s' % (
        i, int(loss), precision, recall, f_measure))


def read_examples(path):
    # Read GermEval 2014-style data: sentences separated by blank lines, one
    # tab-separated token per line with the word in column 1 and the IOB tag
    # in column 2; lines starting with '#' are comments.
    path = Path(path)
    with path.open() as file_:
        sents = file_.read().strip().split('\n\n')
        for sent in sents:
            sent = sent.strip()
            if not sent:
                continue
            tokens = sent.split('\n')
            while tokens and tokens[0].startswith('#'):
                tokens.pop(0)
            words = []
            iob = []
            for token in tokens:
                if token.strip():
                    pieces = token.split('\t')
                    words.append(pieces[1])
                    iob.append(pieces[2])
            yield words, iob_to_biluo(iob)


def get_labels(examples):
    # Collect the set of entity types seen in the BILUO tags.
    labels = set()
    for words, tags in examples:
        for tag in tags:
            if '-' in tag:
                labels.add(tag.split('-')[1])
    return sorted(labels)


@plac.annotations(
    model_dir=("Path to save the model", "positional", None, Path),
    train_loc=("Path to your training data", "positional", None, Path),
    dev_loc=("Path to your development data", "positional", None, Path),
)
def main(model_dir, train_loc, dev_loc, nr_epoch=30):
    print(model_dir, train_loc, dev_loc)
    train_examples = list(read_examples(train_loc))
    dev_examples = read_examples(dev_loc)
    nlp = Pipeline()
    for label in get_labels(train_examples):
        nlp.entity.add_label(label)
        print("Add label", label)

    train(nlp, train_examples, list(dev_examples), nr_epoch)

    nlp.to_disk(model_dir)


if __name__ == '__main__':
    plac.call(main)
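The script is driven by plac, so it takes three positional arguments: the output model directory, the training file and the development file. Below is a brief usage sketch for the trained pipeline; it is not part of the removed file, and the script name, model path and example sentence are hypothetical:

# Hypothetical follow-up usage, assuming the script has been run as e.g.
#   python train_ner_standalone.py /tmp/germeval-ner NER-de-train.tsv NER-de-dev.tsv
# (script and data file names are assumptions, not taken from the diff).
nlp = Pipeline()
nlp.from_disk('/tmp/germeval-ner')  # directory written by nlp.to_disk() in main()
doc = nlp('Angela Merkel besuchte im Mai die Stadt Paris .'.split())  # pre-tokenized words
print([(token.text, token.ent_type_) for token in doc])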