Update train_tagger script

2025-11-01 00:17:44 +03:00 · 2016-10-16 16:10:23 +02:00 · 2016-10-16 16:10:23 +02:00 · 01b42c531f
commit 01b42c531f
parent a81c5a7abf
1 changed files with 79 additions and 0 deletions
--- a/examples/training/train_tagger.py
+++ b/examples/training/train_tagger.py
@ -0,0 +1,79 @@
+"""A quick example for training a part-of-speech tagger, without worrying
+about the tokenization, or other language-specific customizations."""
+
+from __future__ import unicode_literals
+from __future__ import print_function
+
+import plac
+from pathlib import Path
+
+from spacy.vocab import Vocab
+from spacy.tagger import Tagger
+from spacy.tokens import Doc
+import random
+
+
+# You need to define a mapping from your data's part-of-speech tag names to the
+# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
+# See here for the Universal Tag Set:
+# http://universaldependencies.github.io/docs/u/pos/index.html
+# You may also specify morphological features for your tags, from the universal
+# scheme.
+TAG_MAP = {
+        'N': {"pos": "NOUN"},
+        'V': {"pos": "VERB"},
+        'J': {"pos": "ADJ"}
+    }
+
+# Usually you'll read this in, of course. Data formats vary.
+# Ensure your strings are unicode.
+DATA = [
+    (
+        ["I", "like", "green", "eggs"],
+        ["N", "V",    "J",     "N"]
+    ),
+    (
+        ["Eat", "blue", "ham"],
+        ["V",   "J",    "N"]
+    )
+]
+    
+def ensure_dir(path):
+    if not path.exists():
+        path.mkdir()
+
+
+def main(output_dir=None):
+    if output_dir is not None:
+        output_dir = Path(output_dir)
+        ensure_dir(output_dir)
+        ensure_dir(output_dir / "pos")
+        ensure_dir(output_dir / "vocab")
+    
+    vocab = Vocab(tag_map=TAG_MAP)
+    # The default_templates argument is where features are specified. See
+    # spacy/tagger.pyx for the defaults.
+    tagger = Tagger.blank(vocab, Tagger.default_templates())
+
+    for i in range(5):
+        for words, tags in DATA:
+            doc = Doc(vocab, orths_and_spaces=zip(words, [True] * len(words)))
+            tagger.update(doc, tags)
+        random.shuffle(DATA)
+    tagger.model.end_training()
+    doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True]*4))
+    tagger(doc)
+    for word in doc:
+        print(word.text, word.tag_, word.pos_)
+    if output_dir is not None:
+        tagger.model.dump(str(output_dir / 'pos' / 'model'))
+        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
+            tagger.vocab.strings.dump(file_)
+
+
+if __name__ == '__main__':
+    plac.call(main)
+    # I V VERB
+    # like V VERB
+    # blue N NOUN
+    # eggs N NOUN