spaCy/examples/vectors_fast_text.py

#!/usr/bin/env python
# coding: utf8
"""Load vectors for a language trained using fastText
https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
Compatible with: spaCy v2.0.0+
"""
from __future__ import unicode_literals
import plac
import numpy

import spacy
from spacy.language import Language


@plac.annotations(
    vectors_loc=("Path to .vec file", "positional", None, str),
    lang=("Optional language ID. If not set, blank Language() will be used.",
          "positional", None, str))
def main(vectors_loc, lang=None):
    """Load fastText vectors from a .vec file into a vocab and try them out.

    vectors_loc (str): Path to a text-format .vec file. The first line is a
        header with two whitespace-separated fields (row count and vector
        width); each following line is a word followed by one float per
        dimension, separated by single spaces.
    lang (str): Optional language ID handed to spacy.blank(). If None, a
        plain Language() instance is used instead.

    Blank lines in the file are skipped. Raises ValueError if a non-blank
    line does not consist of exactly one word plus `width` values.
    """
    if lang is None:
        nlp = Language()
    else:
        # create empty language class - this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        # The row count is not used, but unpacking two values still checks
        # that the header has the expected two fields.
        _, nr_dim = header.split()
        # The file is read in binary mode, so the header field is a byte
        # string; convert it once instead of once per line.
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        # Data starts on line 2 of the file (line 1 is the header).
        for line_nr, line in enumerate(file_, 2):
            # Drop the trailing newline/space first, otherwise rsplit would
            # produce an extra empty field at the end.
            line = line.rstrip().decode('utf8')
            if not line:
                # e.g. an empty line at the end of the file
                continue
            # Split from the right so a word that itself contains spaces
            # stays in one piece.
            pieces = line.rsplit(' ', width)
            if len(pieces) != width + 1:
                raise ValueError(
                    "Line {} of {}: expected a word followed by {} values, "
                    "found {} field(s)".format(line_nr, vectors_loc, width,
                                               len(pieces)))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))


# Script entry point: runs only when the file is executed directly, not when
# it is imported as a module.
if __name__ == '__main__':
    # Hand main() to plac, which is expected to fill its annotated parameters
    # (vectors_loc, lang) from the command-line arguments.
    plac.call(main)
-												Clean up examples

											
										
										
											2017-10-26 18:32:59 +03:00
+								#!/usr/bin/env python
 								# coding: utf8
-												Update fastText example and add to examples in docs

											
										
										
											2017-10-26 19:47:02 +03:00
+								"""Load vectors for a language trained using fastText
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
+								https://github.com/facebookresearch/fastText/blob/master/pretrained-vectors.md
-												Update examples

											
										
										
											2017-11-07 03:22:30 +03:00
+								Compatible with: spaCy v2.0.0+
-												Clean up examples

											
										
										
											2017-10-26 18:32:59 +03:00
+								"""
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
+								from __future__ import unicode_literals
 								import plac
 								import numpy
-												Update fastText vectors example (see #1525)

Add option to specify language, and add note on "lang" being required to save out model

											
										
										
											2017-11-09 16:54:39 +03:00
+								import spacy
-												Fix syntax error

											
										
										
											2017-11-01 02:43:28 +03:00
+								from spacy.language import Language
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
-												Update fastText example and add to examples in docs

											
										
										
											2017-10-26 19:47:02 +03:00
+								@plac.annotations(
-												Update vectors_loc description

											
										
										
											2017-11-17 16:57:11 +03:00
+								    vectors_loc=("Path to .vec file", "positional", None, str),
-												Update fastText vectors example (see #1525)

Add option to specify language, and add note on "lang" being required to save out model

											
										
										
											2017-11-09 16:54:39 +03:00
+								    lang=("Optional language ID. If not set, blank Language() will be used.",
 								          "positional", None, str))
 								def main(vectors_loc, lang=None):
 								    if lang is None:
 								        nlp = Language()
 								    else:
 								        # create empty language class – this is required if you're planning to
 								        # save the model to disk and load it back later (models always need a
 								        # "lang" setting). Use 'xx' for blank multi-language class.
 								        nlp = spacy.blank(lang)
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
+								    with open(vectors_loc, 'rb') as file_:
 								        header = file_.readline()
 								        nr_row, nr_dim = header.split()
-												Change clear_vectors to reset_vectors (resolves #1516)

											
										
										
											2017-11-08 20:11:23 +03:00
+								        nlp.vocab.reset_vectors(width=int(nr_dim))
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
+								        for line in file_:
-												rstrip line before rsplit

loading english fast text giving error because line contains new line at the end and rsplit is splitting it incorrectly
											
										
										
											2017-11-15 11:25:08 +03:00
+								            line = line.rstrip().decode('utf8')
-												Make sure nr_dim is an int

In some languages (e.g. Dutch), the nr_dim is extracted as a byte string, causing an error down the line.

											
										
										
											2017-11-17 16:56:27 +03:00
+								            pieces = line.rsplit(' ', int(nr_dim))
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
+								            word = pieces[0]
 								            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
-												Fix formatting

											
										
										
											2017-11-01 02:43:22 +03:00
+								            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
 								    # test the vectors and similarity
 								    text = 'class colspan'
 								    doc = nlp(text)
 								    print(text, doc[0].similarity(doc[1]))
-												Add example loadig Fast Text vectors

											
										
										
											2017-10-02 00:40:02 +03:00
 								if __name__ == '__main__':
 								    plac.call(main)