Add vector training script to bin [ci skip]

Ines Montani 2019-03-12 12:07:56 +01:00
parent 3abf0e6b9f
commit 72fb324d95
2 changed files with 109 additions and 2 deletions

bin/train_word_vectors.py (new file, 107 lines)

@@ -0,0 +1,107 @@
#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import logging
from pathlib import Path
from collections import defaultdict
from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac
import spacy

logger = logging.getLogger(__name__)


class Corpus(object):
    def __init__(self, directory, min_freq=10):
        self.directory = directory
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document
        for word in doc:
            self.counts.inc(word.orth, 1)
        return len(doc)

    def __iter__(self):
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield text


def iter_dir(loc):
    # Yield all files in the directory, descending one level into subdirectories
    dir_path = Path(loc)
    for fn_path in dir_path.iterdir():
        if fn_path.is_dir():
            for sub_path in fn_path.iterdir():
                yield sub_path
        else:
            yield fn_path


@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        total_sents += text.count("\n")
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )
    # Build the Word2Vec vocab from the spaCy-tokenized counts, then train
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)


if __name__ == "__main__":
    plac.call(main)
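Not part of the commit, but for context: `model.save(out_loc)` writes a full Gensim model, while the docs change below describes a plain text format with one word and vector per line. A minimal sketch of exporting that format, assuming Gensim 1.0+ and hypothetical paths:

```python
# Sketch only (not in this commit): export the plain text vectors format.
# Assumes Gensim >= 1.0; both paths are hypothetical.
from gensim.models import Word2Vec

model = Word2Vec.load("/path/to/word2vec.model")  # file written by model.save(out_loc)
# Writes a header line ("<n_words> <n_dims>"), then one "word v1 v2 ..." per line
model.wv.save_word2vec_format("vectors.txt", binary=False)
```

The script itself is run from the command line via plac, e.g. something like `python bin/train_word_vectors.py en /path/to/texts /path/to/word2vec.model -d 128` (paths illustrative).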


@@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
 useful by themselves; they power the `.similarity` methods in spaCy. For best
 results, you should pre-process the text with spaCy before training the Word2vec
 model. This ensures your tokenization will match. You can use our
-[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py),
+[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py),
 which pre-processes the text with your language-specific tokenizer and trains
 the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
 file should consist of one word and vector per line.

 ```python
-https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py
+https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
 ```

 If you don't have a large sample of text available, you can also convert word
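As a follow-up to the docs above, a minimal sketch (not part of this commit) of loading exported text-format vectors into a blank spaCy pipeline, assuming spaCy 2.x and a hypothetical `vectors.txt` written with `save_word2vec_format(..., binary=False)`:

```python
# Sketch only: load word2vec text-format vectors into spaCy.
# Assumes spaCy 2.x and a hypothetical "vectors.txt" file.
import numpy
import spacy

nlp = spacy.blank("en")
with open("vectors.txt", "r", encoding="utf-8") as file_:
    file_.readline()  # skip the "<n_words> <n_dims>" header line
    for line in file_:
        pieces = line.rstrip().split(" ")
        word, vector = pieces[0], numpy.asarray(pieces[1:], dtype="float32")
        nlp.vocab.set_vector(word, vector)

doc = nlp("dog cat")
print(doc[0].similarity(doc[1]))  # .similarity is now backed by the new vectors
```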