Add vector training script to bin [ci skip]

Ines Montani 2019-03-12 12:07:56 +01:00
parent 3abf0e6b9f
commit 72fb324d95
2 changed files with 109 additions and 2 deletions

bin/train_word_vectors.py (new file, 107 additions)

@@ -0,0 +1,107 @@
#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import logging
from pathlib import Path
from collections import defaultdict
from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac
import spacy

logger = logging.getLogger(__name__)


# Streams texts from a directory tree and accumulates token frequencies
# (keyed by spaCy's hash IDs) in a preshed counter.
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield text


def iter_dir(loc):
    dir_path = Path(loc)
    for fn_path in dir_path.iterdir():
        if fn_path.is_dir():
            for sub_path in fn_path.iterdir():
                yield sub_path
        else:
            yield fn_path


@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    # First pass: tokenize every file with spaCy and count the words.
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        total_sents += text.count("\n")
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )
    # Hand the counts to gensim, mapping spaCy's hash IDs back to strings.
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    # Trim and finalize the vocabulary, then train on a second pass.
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)


if __name__ == "__main__":
    plac.call(main)
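
Once trained and saved, the model can be sanity-checked with gensim directly. A minimal sketch, assuming gensim's standard `Word2Vec` API; `vectors.model` stands in for whatever `out_loc` was passed, and the query word must have survived the `min_count` cut-off:

```python
from gensim.models import Word2Vec

# Load what model.save(out_loc) wrote ("vectors.model" is a hypothetical path).
model = Word2Vec.load("vectors.model")

# Nearest neighbours by cosine similarity over the trained vectors.
print(model.wv.most_similar("apple", topn=5))
```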

@@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
useful by themselves – they power the `.similarity` methods in spaCy. For best
results, you should pre-process the text with spaCy before training the Word2vec
model. This ensures your tokenization will match. You can use our
-[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py),
+[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py),
which pre-processes the text with your language-specific tokenizer and trains
the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
file should consist of one word and vector per line.
```python
-https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py
+https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
```
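
The script above saves a full gensim model rather than a plain-text vectors file. To produce the one-word-and-vector-per-line format described here, the trained vectors can be exported explicitly. A minimal sketch, assuming gensim's standard `KeyedVectors` API and hypothetical file paths:

```python
from gensim.models import Word2Vec

model = Word2Vec.load("vectors.model")  # hypothetical path from the training run

# Writes a header line "<vocab_size> <dimensions>" followed by one
# "word v1 v2 ... vn" entry per line.
model.wv.save_word2vec_format("vectors.txt", binary=False)
```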
If you don't have a large sample of text available, you can also convert word