2017-10-26 18:32:59 +03:00
|
|
|
|
#!/usr/bin/env python
|
|
|
|
|
# coding: utf8
|
2017-10-26 19:47:02 +03:00
|
|
|
|
"""Load vectors for a language trained using fastText
|
2017-10-02 00:40:02 +03:00
|
|
|
|
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
2017-11-07 03:22:30 +03:00
|
|
|
|
Compatible with: spaCy v2.0.0+
|
2017-10-26 18:32:59 +03:00
|
|
|
|
"""
|
2017-10-02 00:40:02 +03:00
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
import plac
|
|
|
|
|
import numpy
|
|
|
|
|
|
2017-11-09 16:54:39 +03:00
|
|
|
|
import spacy
|
2017-11-01 02:43:28 +03:00
|
|
|
|
from spacy.language import Language
|
2017-10-02 00:40:02 +03:00
|
|
|
|
|
|
|
|
|
|
2017-10-26 19:47:02 +03:00
|
|
|
|
@plac.annotations(
    vectors_loc=("Path to .vec file", "positional", None, str),
    lang=(
        "Optional language ID. If not set, blank Language() will be used.",
        "positional",
        None,
        str,
    ),
)
def main(vectors_loc, lang=None):
    """Read word vectors from a text file into a blank pipeline's vocab.

    The file is read as bytes. Its first line must hold exactly two
    whitespace-separated fields; the second one is parsed as the vector
    width. Every following non-empty line is decoded as UTF-8 and split
    from the right on single spaces into a word and its vector values.

    vectors_loc: Path of the vectors file to read.
    lang: Optional language ID handed to ``spacy.blank``. When ``None``,
        a plain ``Language()`` object is created instead.

    After loading, the similarity of the two tokens of a fixed sample text
    is printed as a quick check.
    """
    if lang is None:
        nlp = Language()
    else:
        # Create an empty language class - this is required if you're
        # planning to save the model to disk and load it back later (models
        # always need a "lang" setting). Use 'xx' for a blank multi-language
        # class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, "rb") as file_:
        header = file_.readline()
        # Only the dimension count is used; the row count is ignored.
        _nr_row, nr_dim = header.split()
        # Parse the width once instead of on every line of the file.
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        for line in file_:
            line = line.rstrip().decode("utf8")
            if not line:
                # A blank line would produce an empty word with a
                # zero-length vector, so skip it.
                continue
            # Split from the right so that only the last `width` fields are
            # treated as numbers and any spaces in the word are preserved.
            pieces = line.rsplit(" ", width)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = "class colspan"
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
|
2017-10-02 00:40:02 +03:00
|
|
|
|
|
|
|
|
|
|
2018-12-02 06:26:26 +03:00
|
|
|
|
# Script entry point: let plac build the command-line interface from the
# annotations on main() and invoke it with the parsed arguments.
if __name__ == "__main__":
    plac.call(main)
|