Update parser training example

2025-07-17 19:52:18 +03:00 · 2017-10-26 15:15:37 +02:00 · 2017-10-26 15:15:37 +02:00 · b5c74dbb34
commit b5c74dbb34
parent 586b9047fd
1 changed file with 95 additions and 58 deletions
--- a/examples/training/train_parser.py
+++ b/examples/training/train_parser.py
@@ -1,75 +1,112 @@
 #!/usr/bin/env python
 # coding: utf8
 """
 Example of training spaCy dependency parser, starting off with an existing model
 or a blank model.
 For more details, see the documentation:
 * Training: https://alpha.spacy.io/usage/training
 * Dependency Parse: https://alpha.spacy.io/usage/linguistic-features#dependency-parse
 Developed for: spaCy 2.0.0a18
 Last updated for: spaCy 2.0.0a18
 """
 from __future__ import unicode_literals, print_function
-import json
+
 import pathlib
 import random
 from pathlib import Path
 import spacy
 from spacy.pipeline import DependencyParser
 from spacy.gold import GoldParse
 from spacy.tokens import Doc
-def train_parser(nlp, train_data, left_labels, right_labels):
+# training data
-    parser = DependencyParser(
+TRAIN_DATA = [
-                nlp.vocab,
+    (
-                left_labels=left_labels,
+        ['They', 'trade',  'mortgage', '-', 'backed', 'securities', '.'],
-                right_labels=right_labels)
+        [1, 1, 4, 4, 5, 1, 1],
-    for itn in range(1000):
+        ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
-        random.shuffle(train_data)
+    ),
-        loss = 0
+    (
-        for words, heads, deps in train_data:
+        ['I', 'like', 'London', 'and', 'Berlin', '.'],
-            doc = Doc(nlp.vocab, words=words)
+        [1, 1, 1, 2, 2, 1],
-            gold = GoldParse(doc, heads=heads, deps=deps)
+        ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
-            loss += parser.update(doc, gold)
+    )
-    parser.model.end_training()
+]
    return parser
-def main(model_dir=None):
+def main(model=None, output_dir=None, n_iter=1000):
-    if model_dir is not None:
+    """Load the model, set up the pipeline and train the parser.
        model_dir = pathlib.Path(model_dir)
        if not model_dir.exists():
            model_dir.mkdir()
        assert model_dir.is_dir()
-    nlp = spacy.load('en', tagger=False, parser=False, entity=False, add_vectors=False)
+    model (unicode): Model name to start off with. If None, a blank English
        Language class is created.
    output_dir (unicode / Path): Optional output directory. If None, no model
        will be saved.
    n_iter (int): Number of iterations during training.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
-    train_data = [
+    # add the parser to the pipeline if it doesn't exist
-        (
+    # nlp.create_pipe works for built-ins that are registered with spaCy
-            ['They', 'trade',  'mortgage', '-', 'backed', 'securities', '.'],
+    if 'parser' not in nlp.pipe_names:
-            [1, 1, 4, 4, 5, 1, 1],
+        parser = nlp.create_pipe('parser')
-            ['nsubj', 'ROOT', 'compound', 'punct', 'nmod', 'dobj', 'punct']
+        nlp.add_pipe(parser, first=True)
-        ),
+    # otherwise, get it, so we can add labels to it
-        (
+    else:
-            ['I', 'like', 'London', 'and', 'Berlin', '.'],
+        parser = nlp.get_pipe('parser')
            [1, 1, 1, 2, 2, 1],
            ['nsubj', 'ROOT', 'dobj', 'cc', 'conj', 'punct']
        )
    ]
    left_labels = set()
    right_labels = set()
    for _, heads, deps in train_data:
        for i, (head, dep) in enumerate(zip(heads, deps)):
            if i < head:
                left_labels.add(dep)
            elif i > head:
                right_labels.add(dep)
    parser = train_parser(nlp, train_data, sorted(left_labels), sorted(right_labels))
-    doc = Doc(nlp.vocab, words=['I', 'like', 'securities', '.'])
+    # add labels to the parser
-    parser(doc)
+    for _, heads, deps in TRAIN_DATA:
-    for word in doc:
+        for dep in deps:
-        print(word.text, word.dep_, word.head.text)
+            parser.add_label(dep)
-    if model_dir is not None:
+    # get names of other pipes to disable them during training
-        with (model_dir / 'config.json').open('w') as file_:
+    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
-            json.dump(parser.cfg, file_)
+    with nlp.disable_pipes(*other_pipes) as disabled:  # only train parser
-        parser.model.dump(str(model_dir / 'model'))
+        optimizer = nlp.begin_training(lambda: [])
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for words, heads, deps in TRAIN_DATA:
                doc = Doc(nlp.vocab, words=words)
                gold = GoldParse(doc, heads=heads, deps=deps)
                nlp.update([doc], [gold], sgd=optimizer, losses=losses)
            print(losses)
    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Dependencies', [(t.text, t.dep_, t.head.text) for t in doc])
 if __name__ == '__main__':
-    main()
+    import plac
-    # I nsubj like
+    plac.call(main)
-    # like ROOT like
+
-    # securities dobj like
+    # expected result:
-    # . cc securities
+    # [
    #   ('I', 'nsubj', 'like'),
    #   ('like', 'ROOT', 'like'),
    #   ('securities', 'dobj', 'like'),
    #   ('.', 'punct', 'like')
    # ]