diff --git a/spacy/cli/_messages.py b/spacy/cli/_messages.py index d55ff8d10..c3c9e496f 100644 --- a/spacy/cli/_messages.py +++ b/spacy/cli/_messages.py @@ -1,6 +1,7 @@ # coding: utf8 from __future__ import unicode_literals + class Messages(object): M001 = ("Download successful but linking failed") M002 = ("Creating a shortcut link for 'en' didn't work (maybe you " @@ -64,7 +65,7 @@ class Messages(object): "flag to overwrite existing directories.") M046 = ("Generating meta.json") M047 = ("Enter the package settings for your model. The following " - "information will be read from your model data: pipeline, vectors.") + "information will be read from your model data: pipeline, vectors.") M048 = ("No '{key}' setting found in meta.json") M049 = ("This setting is required to build your package.") M050 = ("Training data not found") diff --git a/spacy/cli/init_model.py b/spacy/cli/init_model.py index 5dbe39901..87c3033ad 100644 --- a/spacy/cli/init_model.py +++ b/spacy/cli/init_model.py @@ -29,12 +29,14 @@ except ImportError: freqs_loc=("location of words frequencies file", "positional", None, Path), clusters_loc=("optional: location of brown clusters data", "option", "c", str), - vectors_loc=("optional: location of vectors file in GenSim text format", - "option", "v", str), + vectors_loc=("optional: location of vectors file in Word2Vec format " + "(either as .txt or zipped as .zip or .tar.gz)", "option", + "v", str), prune_vectors=("optional: number of vectors to prune to", "option", "V", int) ) -def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=None, prune_vectors=-1): +def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, + vectors_loc=None, prune_vectors=-1): """ Create a new model from raw data, like word frequencies, Brown clusters and word vectors. @@ -114,7 +116,7 @@ def read_vectors(vectors_loc): pieces = line.rsplit(' ', vectors_data.shape[1]+1) word = pieces.pop(0) if len(pieces) != vectors_data.shape[1]: - raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc) + raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)) vectors_data[i] = numpy.asarray(pieces, dtype='f') vectors_keys.append(word) return vectors_data, vectors_keys diff --git a/website/api/cli.jade b/website/api/cli.jade index 85a13c3c0..a34271d81 100644 --- a/website/api/cli.jade +++ b/website/api/cli.jade @@ -533,8 +533,10 @@ p +cell option +cell | Optional location of vectors file. Should be a tab-separated - | file where the first column contains the word and the remaining - | columns the values. + | file in Word2Vec format where the first column contains the word + | and the remaining columns the values. File can be provided in + | #[code .txt] format or as a zipped text file in #[code .zip] or + | #[code .tar.gz] format. +row +cell #[code --prune-vectors], #[code -V]