From 98c35d2585c548e6ff2c25a537cfd81c25482283 Mon Sep 17 00:00:00 2001 From: ines Date: Mon, 30 Oct 2017 18:38:41 +0100 Subject: [PATCH] Fix spacy vocab command --- spacy/__main__.py | 2 +- spacy/cli/vocab.py | 38 ++++++++++++++++++++++---------------- 2 files changed, 23 insertions(+), 17 deletions(-) diff --git a/spacy/__main__.py b/spacy/__main__.py index 770ce5296..f4b5e6715 100644 --- a/spacy/__main__.py +++ b/spacy/__main__.py @@ -19,7 +19,7 @@ if __name__ == '__main__': 'convert': convert, 'package': package, 'model': model, - 'model': vocab, + 'vocab': vocab, 'profile': profile, 'validate': validate } diff --git a/spacy/cli/vocab.py b/spacy/cli/vocab.py index c1bab825c..d05eff3f0 100644 --- a/spacy/cli/vocab.py +++ b/spacy/cli/vocab.py @@ -1,31 +1,33 @@ -'''Compile a vocabulary from a lexicon jsonl file and word vectors.''' # coding: utf8 from __future__ import unicode_literals -from pathlib import Path import plac import json import spacy import numpy -from spacy.util import ensure_path +from pathlib import Path + +from ..util import prints, ensure_path @plac.annotations( lang=("model language", "positional", None, str), - output_dir=("output directory to store model in", "positional", None, str), + output_dir=("model output directory", "positional", None, Path), lexemes_loc=("location of JSONL-formatted lexical data", "positional", - None, str), - vectors_loc=("location of vectors data, as numpy .npz (optional)", - "positional", None, str), - version=("Model version", "option", "V", str), -) -def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None): - out_dir = ensure_path(output_dir) - jsonl_loc = ensure_path(lexemes_loc) + None, Path), + vectors_loc=("optional: location of vectors data, as numpy .npz", + "positional", None, str)) +def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None): + """Compile a vocabulary from a lexicon jsonl file and word vectors.""" + if not lexemes_loc.exists(): + prints(lexemes_loc, title="Can't find lexical data", exits=1) + vectors_loc = ensure_path(vectors_loc) nlp = spacy.blank(lang) for word in nlp.vocab: word.rank = 0 - with jsonl_loc.open() as file_: + lex_added = 0 + vec_added = 0 + with lexemes_loc.open() as file_: for line in file_: if line.strip(): attrs = json.loads(line) @@ -35,14 +37,18 @@ def make_vocab(lang, output_dir, lexemes_loc, vectors_loc=None, version=None): lex = nlp.vocab[attrs['orth']] lex.set_attrs(**attrs) assert lex.rank == attrs['id'] + lex_added += 1 if vectors_loc is not None: vector_data = numpy.load(open(vectors_loc, 'rb')) nlp.vocab.clear_vectors(width=vector_data.shape[1]) - added = 0 for word in nlp.vocab: if word.rank: nlp.vocab.vectors.add(word.orth_, row=word.rank, vector=vector_data[word.rank]) - added += 1 - nlp.to_disk(out_dir) + vec_added += 1 + if not output_dir.exists(): + output_dir.mkdir() + nlp.to_disk(output_dir) + prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir, + title="Sucessfully compiled vocab and vectors, and saved model") return nlp