Update train_tagger script

This commit is contained in:
Matthew Honnibal 2016-10-16 16:10:23 +02:00
parent a81c5a7abf
commit 01b42c531f

View File

@ -0,0 +1,79 @@
"""A quick example for training a part-of-speech tagger, without worrying
about the tokenization, or other language-specific customizations."""
from __future__ import unicode_literals
from __future__ import print_function
import plac
from pathlib import Path
from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
import random
# You need to define a mapping from your data's part-of-speech tag names to the
# Universal Part-of-Speech tag set, as spaCy includes an enum of these tags.
# See here for the Universal Tag Set:
# http://universaldependencies.github.io/docs/u/pos/index.html
# You may also specify morphological features for your tags, from the universal
# scheme.
# One entry per tag that appears in DATA; each value names the universal
# part-of-speech for that tag.
TAG_MAP = {
    "N": {"pos": "NOUN"},
    "V": {"pos": "VERB"},
    "J": {"pos": "ADJ"},
}
# Usually you'll read this in, of course. Data formats vary.
# Ensure your strings are unicode.
# Each training example is a (words, tags) pair; the two lists are aligned
# so that tags[i] is the tag for words[i].
_GREEN_EGGS = (["I", "like", "green", "eggs"], ["N", "V", "J", "N"])
_BLUE_HAM = (["Eat", "blue", "ham"], ["V", "J", "N"])
DATA = [_GREEN_EGGS, _BLUE_HAM]
def ensure_dir(path):
    """Make sure the directory ``path`` exists, creating it if necessary.

    Missing parent directories are created too, so a nested output path
    works even when none of its ancestors exist yet.

    path: a ``pathlib.Path`` (or compatible) object.

    Raises OSError if the directory cannot be created, including the case
    where ``path`` already exists but is not a directory.
    """
    if path.is_dir():
        return
    try:
        path.mkdir(parents=True)
    except OSError:
        # The directory may have been created by someone else between the
        # check above and the mkdir call; only re-raise if it is still
        # not there.
        if not path.is_dir():
            raise
def main(output_dir=None):
    """Train a toy part-of-speech tagger on DATA and print its predictions.

    The tagger is updated on every example in DATA for five passes, with the
    example order reshuffled after each pass. It is then applied to the
    sentence "I like blue eggs" and each word is printed with its tag and
    part-of-speech.

    output_dir: optional path of a directory. When given, the directory and
        its "pos" and "vocab" sub-directories are created if needed,
        ``tagger.model.dump`` is called with ``<output_dir>/pos/model`` and
        ``tagger.vocab.strings.dump`` with a file opened at
        ``<output_dir>/vocab/strings.json``. When None, nothing is saved.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        for directory in (output_dir, output_dir / "pos", output_dir / "vocab"):
            ensure_dir(directory)

    vocab = Vocab(tag_map=TAG_MAP)
    # The default_templates argument is where features are specified. See
    # spacy/tagger.pyx for the defaults.
    tagger = Tagger.blank(vocab, Tagger.default_templates())

    # Shuffle a private copy between passes so the module-level DATA list
    # keeps its original order for any other code that reads it.
    examples = list(DATA)
    for _ in range(5):
        for words, tags in examples:
            doc = Doc(vocab, orths_and_spaces=zip(words, [True] * len(words)))
            tagger.update(doc, tags)
        random.shuffle(examples)
    tagger.model.end_training()

    test_words = ["I", "like", "blue", "eggs"]
    doc = Doc(vocab,
              orths_and_spaces=zip(test_words, [True] * len(test_words)))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)

    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        # NOTE(review): binary mode is kept from the original; this assumes
        # strings.dump writes bytes to the file object -- confirm against
        # the installed spaCy version.
        with (output_dir / 'vocab' / 'strings.json').open('wb') as file_:
            tagger.vocab.strings.dump(file_)
if __name__ == "__main__":
    # Only run the example when this file is executed as a script.
    plac.call(main)
# Example output:
# I V VERB
# like V VERB
# blue N NOUN
# eggs N NOUN