mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 05:31:15 +03:00 
			
		
		
		
	loading english fast text giving error because line contains new line at the end and rsplit is splitting it incorrectly
		
			
				
	
	
		
			45 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			45 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | ||
| # coding: utf8
 | ||
| """Load vectors for a language trained using fastText
 | ||
| https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
 | ||
| Compatible with: spaCy v2.0.0+
 | ||
| """
 | ||
| from __future__ import unicode_literals
 | ||
| import plac
 | ||
| import numpy
 | ||
| 
 | ||
| import spacy
 | ||
| from spacy.language import Language
 | ||
| 
 | ||
| 
 | ||
@plac.annotations(
    vectors_loc=("Path to vectors", "positional", None, str),
    lang=("Optional language ID. If not set, blank Language() will be used.",
          "positional", None, str))
def main(vectors_loc, lang=None):
    """Load fastText-format word vectors into a pipeline's vocab and run a
    quick similarity check.

    vectors_loc: Path to a text vectors file. The first line is a header of
        two whitespace-separated fields (row count, vector width); every
        following line is a word followed by space-separated float values.
    lang: Optional language ID passed to spacy.blank(). If None, a plain
        Language() object is used instead.

    Raises ValueError if the header does not contain exactly two fields or
    the width / a vector component cannot be parsed as a number.
    """
    if lang is None:
        nlp = Language()
    else:
        # create empty language class - this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        # The file is read in binary mode, so the header fields are byte
        # strings. Convert the width to an int once: it is needed both as the
        # vector width and as the integer maxsplit argument of rsplit() below.
        _nr_row, nr_dim = header.split()  # row count is not needed here
        nr_dim = int(nr_dim)
        nlp.vocab.reset_vectors(width=nr_dim)
        for line in file_:
            # Strip the trailing newline (and any trailing space) before
            # splitting, otherwise rsplit() would yield a bogus last piece.
            line = line.rstrip().decode('utf8')
            if not line:
                # Blank line (e.g. at end of file): nothing to add.
                continue
            # Split from the right at most nr_dim times so that a word which
            # itself contains spaces stays intact in pieces[0].
            pieces = line.rsplit(' ', nr_dim)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
 | ||
| 
 | ||
| 
 | ||
# Script entry point: hand main() to plac, which calls it with the
# command-line arguments.
if __name__ == "__main__":
    plac.call(main)
 |