This commit is contained in:
Matthew Honnibal 2018-04-10 22:19:40 +02:00
commit ed39c75a92
3 changed files with 12 additions and 7 deletions

View File

@@ -1,6 +1,7 @@
# coding: utf8 # coding: utf8
from __future__ import unicode_literals from __future__ import unicode_literals
class Messages(object): class Messages(object):
M001 = ("Download successful but linking failed") M001 = ("Download successful but linking failed")
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you " M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "

View File

@@ -29,12 +29,14 @@ except ImportError:
freqs_loc=("location of words frequencies file", "positional", None, Path), freqs_loc=("location of words frequencies file", "positional", None, Path),
clusters_loc=("optional: location of brown clusters data", clusters_loc=("optional: location of brown clusters data",
"option", "c", str), "option", "c", str),
vectors_loc=("optional: location of vectors file in GenSim text format", vectors_loc=("optional: location of vectors file in Word2Vec format "
"option", "v", str), "(either as .txt or zipped as .zip or .tar.gz)", "option",
"v", str),
prune_vectors=("optional: number of vectors to prune to", prune_vectors=("optional: number of vectors to prune to",
"option", "V", int) "option", "V", int)
) )
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=None, prune_vectors=-1): def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None,
vectors_loc=None, prune_vectors=-1):
""" """
Create a new model from raw data, like word frequencies, Brown clusters Create a new model from raw data, like word frequencies, Brown clusters
and word vectors. and word vectors.
@@ -114,7 +116,7 @@ def read_vectors(vectors_loc):
pieces = line.rsplit(' ', vectors_data.shape[1]+1) pieces = line.rsplit(' ', vectors_data.shape[1]+1)
word = pieces.pop(0) word = pieces.pop(0)
if len(pieces) != vectors_data.shape[1]: if len(pieces) != vectors_data.shape[1]:
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc) raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
vectors_data[i] = numpy.asarray(pieces, dtype='f') vectors_data[i] = numpy.asarray(pieces, dtype='f')
vectors_keys.append(word) vectors_keys.append(word)
return vectors_data, vectors_keys return vectors_data, vectors_keys

View File

@@ -533,8 +533,10 @@ p
+cell option +cell option
+cell +cell
| Optional location of vectors file. Should be a tab-separated | Optional location of vectors file. Should be a tab-separated
| file where the first column contains the word and the remaining | file in Word2Vec format where the first column contains the word
| columns the values. | and the remaining columns the values. File can be provided in
| #[code .txt] format or as a zipped text file in #[code .zip] or
| #[code .tar.gz] format.
+row +row
+cell #[code --prune-vectors], #[code -V] +cell #[code --prune-vectors], #[code -V]