Add vector training script to bin [ci skip]

Ines Montani 2019-03-12 12:07:56 +01:00
parent 3abf0e6b9f
commit 72fb324d95
2 changed files with 109 additions and 2 deletions

bin/train_word_vectors.py (new file, 107 additions)

@@ -0,0 +1,107 @@
#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import logging
from pathlib import Path
from collections import defaultdict
from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac
import spacy

logger = logging.getLogger(__name__)


# Streams texts from a directory tree and accumulates token frequencies
# (keyed by spaCy's hash IDs) in a preshed counter.
class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield text


def iter_dir(loc):
    dir_path = Path(loc)
    for fn_path in dir_path.iterdir():
        if fn_path.is_dir():
            for sub_path in fn_path.iterdir():
                yield sub_path
        else:
            yield fn_path


@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    # First pass: tokenize every file with spaCy and count the words.
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        total_sents += text.count("\n")
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )
    # Hand the counts to gensim, mapping spaCy's hash IDs back to strings.
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    # Trim and finalize the vocabulary, then train on a second pass.
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)


if __name__ == "__main__":
    plac.call(main)
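
Once trained and saved, the model can be sanity-checked with gensim directly. A minimal sketch, assuming gensim's standard `Word2Vec` API; `vectors.model` stands in for whatever `out_loc` was passed, and the query word must have survived the `min_count` cut-off:

```python
from gensim.models import Word2Vec

# Load what model.save(out_loc) wrote ("vectors.model" is a hypothetical path).
model = Word2Vec.load("vectors.model")

# Nearest neighbours by cosine similarity over the trained vectors.
print(model.wv.most_similar("apple", topn=5))
```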

@@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
useful by themselves – they power the `.similarity` methods in spaCy. For best
results, you should pre-process the text with spaCy before training the Word2vec
model. This ensures your tokenization will match. You can use our
-[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py),
+[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py),
which pre-processes the text with your language-specific tokenizer and trains
the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
file should consist of one word and vector per line.
```python
-https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py
+https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
```
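
The script above saves a full gensim model rather than a plain-text vectors file. To produce the one-word-and-vector-per-line format described here, the trained vectors can be exported explicitly. A minimal sketch, assuming gensim's standard `KeyedVectors` API and hypothetical file paths:

```python
from gensim.models import Word2Vec

model = Word2Vec.load("vectors.model")  # hypothetical path from the training run

# Writes a header line "<vocab_size> <dimensions>" followed by one
# "word v1 v2 ... vn" entry per line.
model.wv.save_word2vec_format("vectors.txt", binary=False)
```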
If you don't have a large sample of text available, you can also convert word