Merge branch 'master' of https://github.com/explosion/spaCy

2025-10-24 20:51:30 +03:00 · 2018-04-10 22:19:40 +02:00 · 2018-04-10 22:19:40 +02:00 · ed39c75a92
commit ed39c75a92
parent 3836199a83 0299d5fac8
3 changed files with 12 additions and 7 deletions
--- a/spacy/cli/_messages.py
+++ b/spacy/cli/_messages.py
@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+
 class Messages(object):
    M001 = ("Download successful but linking failed")
    M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
@ -64,7 +65,7 @@ class Messages(object):
            "flag to overwrite existing directories.")
    M046 = ("Generating meta.json")
    M047 = ("Enter the package settings for your model. The following "
-           "information will be read from your model data: pipeline, vectors.")
+            "information will be read from your model data: pipeline, vectors.")
    M048 = ("No '{key}' setting found in meta.json")
    M049 = ("This setting is required to build your package.")
    M050 = ("Training data not found")
--- a/spacy/cli/init_model.py
+++ b/spacy/cli/init_model.py
@ -29,12 +29,14 @@ except ImportError:
    freqs_loc=("location of words frequencies file", "positional", None, Path),
    clusters_loc=("optional: location of brown clusters data",
                  "option", "c", str),
-    vectors_loc=("optional: location of vectors file in GenSim text format",
-                 "option", "v", str),
+    vectors_loc=("optional: location of vectors file in Word2Vec format "
+                 "(either as .txt or zipped as .zip or .tar.gz)", "option",
+                 "v", str),
    prune_vectors=("optional: number of vectors to prune to",
                   "option", "V", int)
 )
-def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=None, prune_vectors=-1):
+def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None,
+               vectors_loc=None, prune_vectors=-1):
    """
    Create a new model from raw data, like word frequencies, Brown clusters
    and word vectors.
@ -114,7 +116,7 @@ def read_vectors(vectors_loc):
        pieces = line.rsplit(' ', vectors_data.shape[1]+1)
        word = pieces.pop(0)
        if len(pieces) != vectors_data.shape[1]:
-            raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)
+            raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
        vectors_data[i] = numpy.asarray(pieces, dtype='f')
        vectors_keys.append(word)
    return vectors_data, vectors_keys
--- a/website/api/cli.jade
+++ b/website/api/cli.jade
@ -533,8 +533,10 @@ p
        +cell option
        +cell
            |  Optional location of vectors file. Should be a tab-separated
-            |  file where the first column contains the word and the remaining
-            |  columns the values.
+            |  file in Word2Vec format where the first column contains the word
+            |  and the remaining columns the values. File can be provided in
+            |  #[code .txt] format or as a zipped text file in #[code .zip] or
+            |  #[code .tar.gz] format.

    +row
        +cell #[code --prune-vectors], #[code -V]