mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 01:16:28 +03:00
Add vector training script to bin [ci skip]
This commit is contained in:
parent
3abf0e6b9f
commit
72fb324d95
107
bin/train_word_vectors.py
Normal file
107
bin/train_word_vectors.py
Normal file
|
@ -0,0 +1,107 @@
|
|||
#!/usr/bin/env python
|
||||
from __future__ import print_function, unicode_literals, division
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
from gensim.models import Word2Vec
|
||||
from preshed.counter import PreshCounter
|
||||
import plac
|
||||
import spacy
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Corpus(object):
    """A directory of text files plus running per-token frequency counts.

    Iterating over an instance yields the decoded contents of every path
    produced by ``iter_dir(self.directory)``, one string per file.
    """

    def __init__(self, directory, min_freq=10):
        """Record where the corpus lives and start with empty counters.

        directory: Location passed to ``iter_dir`` when iterating.
        min_freq: Kept on the instance; nothing in this class reads it.
        """
        self.directory = directory
        self.min_freq = min_freq
        self.strings = {}
        self.counts = PreshCounter()

    def count_doc(self, doc):
        """Increment the count of every token in `doc`, keyed by ``orth``.

        Returns ``len(doc)`` so the caller can keep a running word total.
        """
        bump = self.counts.inc
        for token in doc:
            bump(token.orth, 1)
        return len(doc)

    def __iter__(self):
        """Yield the full UTF-8 text of each corpus file in turn.

        NOTE(review): each item is one whole-file string, not a list of
        tokens -- confirm this is what the consumer of the iterator expects.
        """
        for path in iter_dir(self.directory):
            with path.open("r", encoding="utf-8") as handle:
                contents = handle.read()
            yield contents
|
||||
|
||||
|
||||
def iter_dir(loc):
    """Yield every non-directory path found anywhere below `loc`.

    loc: Directory to walk; anything ``pathlib.Path`` accepts.

    Subdirectories are descended into at any depth and are never yielded
    themselves, so callers can open each result as a file. Entries come in
    whatever order ``Path.iterdir`` reports them.
    """
    for entry in Path(loc).iterdir():
        if entry.is_dir():
            # Explicit loop rather than ``yield from``: the file still
            # carries ``__future__`` imports for Python 2.
            for nested in iter_dir(entry):
                yield nested
        else:
            yield entry
|
||||
|
||||
|
||||
@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    """Count tokens in a directory of texts, train Word2Vec and save it.

    lang: ISO language code used to create the blank spaCy pipeline.
    in_dir: Location of the input directory.
    out_loc: Location the trained model is saved to.
    negative: Number of negative samples.
    n_workers: Number of workers.
    window: Context window size.
    size: Dimension of the word vectors.
    min_count: Minimum frequency for a word to enter the vocabulary.
    nr_iter: Number of training iterations.
    """
    logging.basicConfig(
        level=logging.INFO, format="%(asctime)s : %(levelname)s : %(message)s"
    )
    w2v_settings = dict(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    model = Word2Vec(**w2v_settings)
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)

    # First pass: tokenize every file once to collect word frequencies.
    total_sents = 0
    total_words = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        # Newlines stand in for sentence boundaries in the sentence count.
        total_sents += text.count("\n")
        total_words += corpus.count_doc(nlp(text))
        # NOTE(review): nothing visible here ever adds to corpus.strings, so
        # the "word types" figure looks like it is always 0 -- confirm.
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )

    # Feed the collected counts to the model in place of its own vocab scan.
    model.corpus_count = total_sents
    raw_vocab = model.raw_vocab = defaultdict(int)
    string_store = nlp.vocab.strings
    for orth, freq in corpus.counts:
        if freq < min_count:
            continue
        raw_vocab[string_store[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()

    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
|
||||
|
||||
|
||||
# Run as a script: hand `main` to plac, which calls it using the
# command-line interface described by the annotations on `main`.
if __name__ == "__main__":
    plac.call(main)
|
|
@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
|
|||
useful by themselves – they power the `.similarity` methods in spaCy. For best
|
||||
results, you should pre-process the text with spaCy before training the Word2vec
|
||||
model. This ensures your tokenization will match. You can use our
|
||||
[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py),
|
||||
[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py),
|
||||
which pre-processes the text with your language-specific tokenizer and trains
|
||||
the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
|
||||
file should consist of one word and vector per line.
|
||||
|
||||
```python
|
||||
https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py
|
||||
https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
|
||||
```
|
||||
|
||||
If you don't have a large sample of text available, you can also convert word
|
||||
|
|
Loading…
Reference in New Issue
Block a user