spaCy/bin/train_word_vectors.py

#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import logging
from pathlib import Path
from collections import defaultdict
from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac
import spacy

logger = logging.getLogger(__name__)


class Corpus(object):
    def __init__(self, directory, nlp):
        self.directory = directory
        self.nlp = nlp

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()

            # This is to keep the input to the blank model (which doesn't
            # sentencize) from being too long. It works particularly well with
            # the output of [WikiExtractor](https://github.com/attardi/wikiextractor)
            paragraphs = text.split('\n\n')
            for par in paragraphs:
                yield [word.orth_ for word in self.nlp(par)]


def iter_dir(loc):
    dir_path = Path(loc)
    for fn_path in dir_path.iterdir():
        if fn_path.is_dir():
            for sub_path in fn_path.iterdir():
                yield sub_path
        else:
            yield fn_path


@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=5,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir, nlp)
    model = Word2Vec(
        sentences=corpus,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    model.save(out_loc)

if __name__ == "__main__":
    plac.call(main)