mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
334ed433b2
Loading English fastText vectors gave an error because each line ends with a newline, so rsplit was splitting it incorrectly
45 lines
1.5 KiB
Python
45 lines
1.5 KiB
Python
#!/usr/bin/env python
|
||
# coding: utf8
|
||
"""Load vectors for a language trained using fastText
|
||
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
|
||
Compatible with: spaCy v2.0.0+
|
||
"""
|
||
from __future__ import unicode_literals
|
||
import plac
|
||
import numpy
|
||
|
||
import spacy
|
||
from spacy.language import Language
|
||
|
||
|
||
@plac.annotations(
    vectors_loc=("Path to vectors", "positional", None, str),
    lang=("Optional language ID. If not set, blank Language() will be used.",
          "positional", None, str))
def main(vectors_loc, lang=None):
    """Load word vectors from a text file into a spaCy vocab and test them.

    The file is read as bytes. Its first line is a header holding two
    whitespace-separated fields (row count and vector width); every
    following line holds a word followed by its space-separated vector
    values.

    vectors_loc: Path to the vectors file.
    lang: Optional language ID. If None, a blank Language() is used;
        otherwise spacy.blank(lang) creates the pipeline.

    Prints the similarity of the two tokens of a fixed test text.
    """
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        _nr_row, nr_dim = header.split()
        # The header fields are raw tokens, not numbers: convert the width
        # once so it can be used both as the vector width and as the integer
        # maxsplit below (rsplit raises TypeError for a non-integer maxsplit).
        nr_dim = int(nr_dim)
        nlp.vocab.reset_vectors(width=nr_dim)
        for line in file_:
            # Strip the trailing newline/whitespace before splitting,
            # otherwise the final split would yield an empty trailing piece.
            line = line.rstrip().decode('utf8')
            if not line:
                # Blank line (e.g. at end of file): nothing to load.
                continue
            # Split from the right at most nr_dim times so that a word which
            # itself contains spaces stays intact in pieces[0].
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
|
||
|
||
|
||
# Run only when executed as a script: plac builds the command-line
# interface from main()'s annotations and calls it with the parsed arguments.
if __name__ == "__main__":
    plac.call(main)
|