Wire up new vocab command

Explosion Bot 2017-10-30 16:14:50 +01:00
parent aa64031751
commit 0fc1209421
3 changed files with 52 additions and 1 deletion


@@ -7,7 +7,7 @@ if __name__ == '__main__':
    import plac
    import sys
    from spacy.cli import download, link, info, package, train, convert, model
    from spacy.cli import profile, evaluate, validate
    from spacy.cli import vocab, profile, evaluate, validate
    from spacy.util import prints

    commands = {
@@ -19,6 +19,7 @@ if __name__ == '__main__':
        'convert': convert,
        'package': package,
        'model': model,
        'vocab': vocab,
        'profile': profile,
        'validate': validate
    }

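The hunk above only registers the new command in the commands dict; the dispatch code itself is outside this diff. As a rough sketch of how the entry is expected to be exercised once wired up, assuming the usual plac-based dispatch, with all paths invented for illustration:

import plac
from spacy.cli import vocab

# Roughly what `python -m spacy vocab en /tmp/en_vocab lexemes.jsonl` would
# end up doing; the output directory and JSONL path are hypothetical.
plac.call(vocab, ['en', '/tmp/en_vocab', 'lexemes.jsonl'])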

@@ -7,4 +7,5 @@ from .train import train
from .evaluate import evaluate
from .convert import convert
from .model import model
from .vocab import make_vocab as vocab
from .validate import validate

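The re-export added above is what lets the `from spacy.cli import vocab` line in the first hunk resolve. A quick sanity check of the alias, assuming a source tree with this commit applied:

# The `as vocab` alias makes both names refer to the same function object.
from spacy.cli import vocab
from spacy.cli.vocab import make_vocab

assert vocab is make_vocab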
spacy/cli/vocab.py Normal file

@@ -0,0 +1,49 @@
# coding: utf8
'''Compile a vocabulary from a lexicon jsonl file and word vectors.'''
from __future__ import unicode_literals

import plac
import json
import spacy
import numpy

from spacy.util import ensure_path


@plac.annotations(
    lang=("model language", "positional", None, str),
    output_dir=("output directory to store model in", "positional", None, str),
    lexemes_loc=("location of JSONL-formatted lexical data", "positional",
                 None, str),
    vectors_loc=("location of vectors data, as numpy .npz (optional)",
                 "positional", None, str))
def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None):
    out_dir = ensure_path(output_dir)
    jsonl_loc = ensure_path(lexemes_loc)
    nlp = spacy.blank(lang)
    # Reset all ranks; lexemes listed in the JSONL file get their rank from
    # the 'id' attribute below, so only those receive a vector row.
    for word in nlp.vocab:
        word.rank = 0
    with jsonl_loc.open() as file_:
        for line in file_:
            if line.strip():
                attrs = json.loads(line)
                # Each line is either a settings object or a lexeme entry.
                if 'settings' in attrs:
                    nlp.vocab.cfg.update(attrs['settings'])
                else:
                    lex = nlp.vocab[attrs['orth']]
                    lex.set_attrs(**attrs)
                    assert lex.rank == attrs['id']
    if vectors_loc is not None:
        vector_data = numpy.load(open(vectors_loc, 'rb'))
        nlp.vocab.clear_vectors(width=vector_data.shape[1])
        added = 0
        # Add a vector row for every lexeme that received a rank above.
        for word in nlp.vocab:
            if word.rank:
                nlp.vocab.vectors.add(word.orth_, row=word.rank,
                                      vector=vector_data[word.rank])
                added += 1
    nlp.to_disk(out_dir)
    return nlp
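For context, the parsing loop in make_vocab implies one JSON object per line: either a 'settings' object that is merged into vocab.cfg, or a lexeme entry carrying at least 'orth' and an 'id' that the assert checks against the lexeme's rank. A small illustrative input and a direct Python call, with every value and path invented for the example:

# Hypothetical lexemes.jsonl content in the shape the loop expects:
#   {"settings": {"oov_prob": -20.0}}
#   {"orth": "the", "id": 1}
#   {"orth": "cat", "id": 2}

# Calling the command directly rather than through the CLI; paths are
# illustrative.
from spacy.cli.vocab import make_vocab

nlp = make_vocab('en', '/tmp/en_vocab', 'lexemes.jsonl')
print(len(nlp.vocab))  # number of lexical entries in the compiled vocab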