Update parser training example

ines 2017-10-26 15:15:37 +02:00
parent 586b9047fd
commit b5c74dbb34


@@ -1,40 +1,28 @@
+#!/usr/bin/env python
+# coding: utf8
+"""
+Example of training spaCy dependency parser, starting off with an existing model
+or a blank model.
+For more details, see the documentation:
+* Training: https://alpha.spacy.io/usage/training
+* Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
+
+Developed for: spaCy 2.0.0a18
+Last updated for: spaCy 2.0.0a18
+"""
 from __future__ import unicode_literals, print_function
-import json
-import pathlib
 import random
+from pathlib import Path
 import spacy
-from spacy.pipeline import DependencyParser
 from spacy.gold import GoldParse
 from spacy.tokens import Doc
-def train_parser(nlp, train_data, left_labels, right_labels):
-    parser = DependencyParser(
-        nlp.vocab,
-        left_labels=left_labels,
-        right_labels=right_labels)
-    for itn in range(1000):
-        random.shuffle(train_data)
-        loss = 0
-        for words, heads, deps in train_data:
-            doc = Doc(nlp.vocab, words=words)
-            gold = GoldParse(doc, heads=heads, deps=deps)
-            loss += parser.update(doc, gold)
-    parser.model.end_training()
-    return parser
-def main(model_dir=None):
-    if model_dir is not None:
-        model_dir = pathlib.Path(model_dir)
-        if not model_dir.exists():
-            model_dir.mkdir()
-        assert model_dir.is_dir()
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False)
-    train_data = [
+# training data
+TRAIN_DATA = [
     (
         ['They', 'trade', 'mortgage', '-', 'backed', 'securities', '.'],
         [1, 1, 4, 4, 5, 1, 1],
@@ -46,30 +34,79 @@ def main(model_dir=None):
         ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
     )
 ]
-    left_labels = set()
-    right_labels = set()
-    for _, heads, deps in train_data:
-        for i, (head, dep) in enumerate(zip(heads, deps)):
-            if i < head:
-                left_labels.add(dep)
-            elif i > head:
-                right_labels.add(dep)
-    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
-    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
-    parser(doc)
-    for word in doc:
-        print(word.text, word.dep_, word.head.text)
-    if model_dir is not None:
-        with (model_dir / 'config.json').open('w') as file_:
-            json.dump(parser.cfg, file_)
-        parser.model.dump(str(model_dir / 'model'))
+
+
+def main(model=None, output_dir=None, n_iter=1000):
+    """Load the model, set up the pipeline and train the parser.
+
+    model (unicode): Model name to start off with. If None, a blank English
+        Language class is created.
+    output_dir (unicode / Path): Optional output directory. If None, no model
+        will be saved.
+    n_iter (int): Number of iterations during training.
+    """
+    if model is not None:
+        nlp = spacy.load(model)  # load existing spaCy model
+        print("Loaded model '%s'" % model)
+    else:
+        nlp = spacy.blank('en')  # create blank Language class
+        print("Created blank 'en' model")
+
+    # add the parser to the pipeline if it doesn't exist
+    # nlp.create_pipe works for built-ins that are registered with spaCy
+    if 'parser' not in nlp.pipe_names:
+        parser = nlp.create_pipe('parser')
+        nlp.add_pipe(parser, first=True)
+    # otherwise, get it, so we can add labels to it
+    else:
+        parser = nlp.get_pipe('parser')
+
+    # add labels to the parser
+    for _, heads, deps in TRAIN_DATA:
+        for dep in deps:
+            parser.add_label(dep)
+
+    # get names of other pipes to disable them during training
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
+    with nlp.disable_pipes(*other_pipes) as disabled:  # only train parser
+        optimizer = nlp.begin_training(lambda: [])
+        for itn in range(n_iter):
+            random.shuffle(TRAIN_DATA)
+            losses = {}
+            for words, heads, deps in TRAIN_DATA:
+                doc = Doc(nlp.vocab, words=words)
+                gold = GoldParse(doc, heads=heads, deps=deps)
+                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
+            print(losses)
+
+    # test the trained model
+    test_text = "I like securities."
+    doc = nlp(test_text)
+    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
+
+    # save model to output directory
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        if not output_dir.exists():
+            output_dir.mkdir()
+        nlp.to_disk(output_dir)
+        print("Saved model to", output_dir)
+
+        # test the saved model
+        print("Loading from", output_dir)
+        nlp2 = spacy.load(output_dir)
+        doc = nlp2(test_text)
+        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 if __name__ == '__main__':
-    main()
-    # I nsubj like
-    # like ROOT like
-    # securities dobj like
-    # . cc securities
+    import plac
+    plac.call(main)
+
+    # expected result:
+    # [
+    #   ('I', 'nsubj', 'like'),
+    #   ('like', 'ROOT', 'like'),
+    #   ('securities', 'dobj', 'like'),
+    #   ('.', 'punct', 'like')
+    # ]
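
A note on the data format used above: each TRAIN_DATA entry is a (words, heads, deps) triple, where heads[i] is the absolute index of token i's syntactic head within the sentence and the root token attaches to itself. A minimal sketch that checks and prints that structure, assuming the example above is saved as train_parser.py on the import path:

    # illustrative only, not part of the commit
    from train_parser import TRAIN_DATA

    for words, heads, deps in TRAIN_DATA:
        # one head index and one dependency label per token
        assert len(words) == len(heads) == len(deps)
        # exactly one self-attached token, and it carries the ROOT label
        roots = [i for i, head in enumerate(heads) if head == i]
        assert len(roots) == 1 and deps[roots[0]] == 'ROOT'
        for word, head, dep in zip(words, heads, deps):
            print('%s -%s-> %s' % (word, dep, words[head]))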
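
To experiment without the plac command line, main() can also be called directly; the model name and output path below are illustrative assumptions, and starting from an existing model requires that it is installed:

    # illustrative only, not part of the commit
    from train_parser import main

    # train a blank 'en' model for a few iterations without saving
    main(model=None, output_dir=None, n_iter=50)

    # or start from an installed model and save the result:
    # main(model='en_core_web_sm', output_dir='/tmp/parser_model', n_iter=1000)

From a shell, python train_parser.py runs the same entry point through plac.call(main).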