mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge branch 'master' of https://github.com/explosion/spaCy
This commit is contained in:
commit
ed39c75a92
|
@ -1,6 +1,7 @@
|
||||||
# coding: utf8
|
# coding: utf8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
class Messages(object):
|
class Messages(object):
|
||||||
M001 = ("Download successful but linking failed")
|
M001 = ("Download successful but linking failed")
|
||||||
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
|
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||||
|
|
|
@ -29,12 +29,14 @@ except ImportError:
|
||||||
freqs_loc=("location of words frequencies file", "positional", None, Path),
|
freqs_loc=("location of words frequencies file", "positional", None, Path),
|
||||||
clusters_loc=("optional: location of brown clusters data",
|
clusters_loc=("optional: location of brown clusters data",
|
||||||
"option", "c", str),
|
"option", "c", str),
|
||||||
vectors_loc=("optional: location of vectors file in GenSim text format",
|
vectors_loc=("optional: location of vectors file in Word2Vec format "
|
||||||
"option", "v", str),
|
"(either as .txt or zipped as .zip or .tar.gz)", "option",
|
||||||
|
"v", str),
|
||||||
prune_vectors=("optional: number of vectors to prune to",
|
prune_vectors=("optional: number of vectors to prune to",
|
||||||
"option", "V", int)
|
"option", "V", int)
|
||||||
)
|
)
|
||||||
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=None, prune_vectors=-1):
|
def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None,
|
||||||
|
vectors_loc=None, prune_vectors=-1):
|
||||||
"""
|
"""
|
||||||
Create a new model from raw data, like word frequencies, Brown clusters
|
Create a new model from raw data, like word frequencies, Brown clusters
|
||||||
and word vectors.
|
and word vectors.
|
||||||
|
@ -114,7 +116,7 @@ def read_vectors(vectors_loc):
|
||||||
pieces = line.rsplit(' ', vectors_data.shape[1]+1)
|
pieces = line.rsplit(' ', vectors_data.shape[1]+1)
|
||||||
word = pieces.pop(0)
|
word = pieces.pop(0)
|
||||||
if len(pieces) != vectors_data.shape[1]:
|
if len(pieces) != vectors_data.shape[1]:
|
||||||
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc)
|
raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
|
||||||
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
vectors_data[i] = numpy.asarray(pieces, dtype='f')
|
||||||
vectors_keys.append(word)
|
vectors_keys.append(word)
|
||||||
return vectors_data, vectors_keys
|
return vectors_data, vectors_keys
|
||||||
|
|
|
@ -533,8 +533,10 @@ p
|
||||||
+cell option
|
+cell option
|
||||||
+cell
|
+cell
|
||||||
| Optional location of vectors file. Should be a tab-separated
|
| Optional location of vectors file. Should be a space-separated
|
||||||
| file where the first column contains the word and the remaining
|
| file in Word2Vec format where the first column contains the word
|
||||||
| columns the values.
|
| and the remaining columns the values. The file can be provided in
|
||||||
|
| #[code .txt] format or as a zipped text file in #[code .zip] or
|
||||||
|
| #[code .tar.gz] format.
|
||||||
|
|
||||||
+row
|
+row
|
||||||
+cell #[code --prune-vectors], #[code -V]
|
+cell #[code --prune-vectors], #[code -V]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user