diff --git a/bin/train_word_vectors.py b/bin/train_word_vectors.py new file mode 100644 index 000000000..8482a7a55 --- /dev/null +++ b/bin/train_word_vectors.py @@ -0,0 +1,107 @@ +#!/usr/bin/env python +from __future__ import print_function, unicode_literals, division + +import logging +from pathlib import Path +from collections import defaultdict +from gensim.models import Word2Vec +from preshed.counter import PreshCounter +import plac +import spacy + +logger = logging.getLogger(__name__) + + +class Corpus(object): + def __init__(self, directory, min_freq=10): + self.directory = directory + self.counts = PreshCounter() + self.strings = {} + self.min_freq = min_freq + + def count_doc(self, doc): + # Get counts for this document + for word in doc: + self.counts.inc(word.orth, 1) + return len(doc) + + def __iter__(self): + for text_loc in iter_dir(self.directory): + with text_loc.open("r", encoding="utf-8") as file_: + text = file_.read() + yield text + + +def iter_dir(loc): + dir_path = Path(loc) + for fn_path in dir_path.iterdir(): + if fn_path.is_dir(): + for sub_path in fn_path.iterdir(): + yield sub_path + else: + yield fn_path + + +@plac.annotations( + lang=("ISO language code"), + in_dir=("Location of input directory"), + out_loc=("Location of output file"), + n_workers=("Number of workers", "option", "n", int), + size=("Dimension of the word vectors", "option", "d", int), + window=("Context window size", "option", "w", int), + min_count=("Min count", "option", "m", int), + negative=("Number of negative samples", "option", "g", int), + nr_iter=("Number of iterations", "option", "i", int), +) +def main( + lang, + in_dir, + out_loc, + negative=5, + n_workers=4, + window=5, + size=128, + min_count=10, + nr_iter=2, +): + logging.basicConfig( + format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO + ) + model = Word2Vec( + size=size, + window=window, + min_count=min_count, + workers=n_workers, + sample=1e-5, + negative=negative, + ) + nlp = spacy.blank(lang) + corpus = Corpus(in_dir) + total_words = 0 + total_sents = 0 + for text_no, text_loc in enumerate(iter_dir(corpus.directory)): + with text_loc.open("r", encoding="utf-8") as file_: + text = file_.read() + total_sents += text.count("\n") + doc = nlp(text) + total_words += corpus.count_doc(doc) + logger.info( + "PROGRESS: at batch #%i, processed %i words, keeping %i word types", + text_no, + total_words, + len(corpus.strings), + ) + model.corpus_count = total_sents + model.raw_vocab = defaultdict(int) + for orth, freq in corpus.counts: + if freq >= min_count: + model.raw_vocab[nlp.vocab.strings[orth]] = freq + model.scale_vocab() + model.finalize_vocab() + model.iter = nr_iter + model.train(corpus) + model.save(out_loc) + + +if __name__ == "__main__": + plac.call(main) diff --git a/website/docs/usage/adding-languages.md b/website/docs/usage/adding-languages.md index a66d71d26..236df6402 100644 --- a/website/docs/usage/adding-languages.md +++ b/website/docs/usage/adding-languages.md @@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also useful by themselves – they power the `.similarity` methods in spaCy. For best results, you should pre-process the text with spaCy before training the Word2vec model. This ensures your tokenization will match. You can use our -[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py), +[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py), which pre-processes the text with your language-specific tokenizer and trains the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin` file should consist of one word and vector per line. ```python -https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py +https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py ``` If you don't have a large sample of text available, you can also convert word