mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-04 01:48:04 +03:00 
			
		
		
		
	
		
			
				
	
	
		
			50 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			50 lines
		
	
	
		
			1.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
#!/usr/bin/env python
 | 
						||
# coding: utf8
 | 
						||
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
"""
 | 
						||
from __future__ import unicode_literals
 | 
						||
import plac
 | 
						||
import numpy
 | 
						||
 | 
						||
import spacy
 | 
						||
from spacy.language import Language
 | 
						||
 | 
						||
 | 
						||
@plac.annotations(
    vectors_loc=("Path to .vec file", "positional", None, str),
    lang=(
        "Optional language ID. If not set, blank Language() will be used.",
        "positional",
        None,
        str,
    ),
)
def main(vectors_loc, lang=None):
    """Load word vectors from a text-format .vec file into a blank pipeline.

    The first line of the file is read as a two-field header whose second
    field is the vector width. Every following non-empty line is a word
    followed by that many space-separated float components.

    vectors_loc (str): Path to the .vec file.
    lang (str): Optional language ID passed to spacy.blank(). If None, a
        plain Language() object is used instead.

    After loading, the similarity between the two tokens of a short sample
    text is printed as a quick check.
    """
    if lang is None:
        nlp = Language()
    else:
        # Create an empty language class. This is required if you're planning
        # to save the model to disk and load it back later (models always
        # need a "lang" setting). Use 'xx' for a blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, "rb") as file_:
        header = file_.readline()
        # The first header field (row count) is not needed here because the
        # vectors are added one at a time below.
        _, nr_dim = header.split()
        width = int(nr_dim)  # parse once rather than on every line
        nlp.vocab.reset_vectors(width=width)
        for line in file_:
            line = line.rstrip().decode("utf8")
            if not line:
                # Skip blank lines (e.g. a trailing newline) instead of
                # registering an empty word with a zero-length vector.
                continue
            # Split from the right so that only the last `width` fields are
            # treated as numbers and the remainder is kept as the word.
            pieces = line.rsplit(" ", width)
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype="f")
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = "class colspan"
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
 | 
						||
 | 
						||
 | 
						||
# Script entry point: let plac build the command-line interface from main's
# annotations and invoke it with the parsed arguments.
if __name__ == "__main__":
    plac.call(main)
 |