spaCy/examples/vectors_fast_text.py
ines c57e05bec1 Make sure nr_dim is an int
In some languages (e.g. Dutch), the nr_dim is extracted as a byte string, causing an error down the line.
2017-11-17 14:56:27 +01:00

45 lines
1.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# coding: utf8
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals
import plac
import numpy
import spacy
from spacy.language import Language
@plac.annotations(
    vectors_loc=("Path to vectors", "positional", None, str),
    lang=("Optional language ID. If not set, blank Language() will be used.",
          "positional", None, str))
def main(vectors_loc, lang=None):
    """Load fastText vectors into a spaCy vocab and print a test similarity.

    The file at `vectors_loc` is read as bytes. Its first line is a header
    of two whitespace-separated fields, the second of which is the vector
    width. Every following non-empty line is a word followed by that many
    space-separated float values.

    vectors_loc (str): Path to the vectors file.
    lang (str): Optional language ID passed to spacy.blank(). If None, a
        plain Language() object is used instead.
    """
    if lang is None:
        nlp = Language()
    else:
        # Create an empty language class. This is required if you're planning
        # to save the model to disk and load it back later (models always need
        # a "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        # The header is read as bytes; convert the width to an int once so
        # the same value is reused for reset_vectors() and for every line.
        _, nr_dim = header.split()
        nr_dim = int(nr_dim)
        nlp.vocab.reset_vectors(width=nr_dim)
        for line in file_:
            line = line.rstrip().decode('utf8')
            if not line:
                # Skip blank lines (e.g. a trailing empty line) instead of
                # registering an empty word with a zero-length vector.
                continue
            # Split from the right so a word that itself contains spaces
            # stays intact in pieces[0].
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
if __name__ == '__main__':
    # Script entry point: hand main() to plac, which is expected to map the
    # command-line arguments onto main's annotated parameters.
    plac.call(main)