Add vector training script to bin [ci skip]
parent 3abf0e6b9f
commit 72fb324d95
bin/train_word_vectors.py (new file, 107 lines)

@@ -0,0 +1,107 @@
#!/usr/bin/env python
from __future__ import print_function, unicode_literals, division

import logging
from pathlib import Path
from collections import defaultdict

from gensim.models import Word2Vec
from preshed.counter import PreshCounter
import plac
import spacy

logger = logging.getLogger(__name__)


class Corpus(object):
    def __init__(self, directory, nlp, min_freq=10):
        self.directory = directory
        self.nlp = nlp
        self.counts = PreshCounter()
        self.strings = {}
        self.min_freq = min_freq

    def count_doc(self, doc):
        # Get counts for this document, remembering the text of each word
        # type once it reaches the frequency threshold, so the progress
        # log can report how many types are being kept.
        for word in doc:
            self.counts.inc(word.orth, 1)
            if self.counts[word.orth] >= self.min_freq:
                self.strings[word.orth] = word.text
        return len(doc)

    def __iter__(self):
        # Yield one list of token texts per document: gensim expects
        # pre-tokenized sentences, not raw strings (a raw string would be
        # iterated character by character).
        for text_loc in iter_dir(self.directory):
            with text_loc.open("r", encoding="utf-8") as file_:
                text = file_.read()
            yield [word.text for word in self.nlp.make_doc(text)]


def iter_dir(loc):
    # Walk one level of subdirectories, yielding every file path.
    dir_path = Path(loc)
    for fn_path in dir_path.iterdir():
        if fn_path.is_dir():
            for sub_path in fn_path.iterdir():
                yield sub_path
        else:
            yield fn_path


@plac.annotations(
    lang=("ISO language code"),
    in_dir=("Location of input directory"),
    out_loc=("Location of output file"),
    n_workers=("Number of workers", "option", "n", int),
    size=("Dimension of the word vectors", "option", "d", int),
    window=("Context window size", "option", "w", int),
    min_count=("Min count", "option", "m", int),
    negative=("Number of negative samples", "option", "g", int),
    nr_iter=("Number of iterations", "option", "i", int),
)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir, nlp)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        total_sents += text.count("\n")
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )
    # Build the vocabulary from the spaCy-side counts instead of letting
    # gensim scan the corpus a second time.
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)


if __name__ == "__main__":
    plac.call(main)
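For reference, the script is a plac command line: `lang`, `in_dir` and `out_loc` are positional, and the remaining options use the flags declared in the annotations (`-n`, `-d`, `-w`, `-m`, `-g`, `-i`). A minimal sketch of inspecting the result afterwards, assuming the pre-4.0 gensim API the script itself targets and an illustrative output path:

```python
from gensim.models import Word2Vec

# Illustrative path: whatever was passed to the script as out_loc.
model = Word2Vec.load("vectors/wiki_en.model")

# "dog" is a placeholder query; it must have survived the min_count
# threshold to be present in the vocabulary.
print(model.most_similar("dog", topn=5))
```

Note that `model.save(out_loc)` writes gensim's native model format; the one-word-and-vector-per-line text format mentioned in the documentation change below would instead come from gensim's `save_word2vec_format(out_loc, binary=False)`.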
@@ -631,13 +631,13 @@ of using deep learning for NLP with limited labeled data. The vectors are also
 useful by themselves – they power the `.similarity` methods in spaCy. For best
 results, you should pre-process the text with spaCy before training the Word2vec
 model. This ensures your tokenization will match. You can use our
-[word vectors training script](https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py),
+[word vectors training script](https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py),
 which pre-processes the text with your language-specific tokenizer and trains
 the model using [Gensim](https://radimrehurek.com/gensim/). The `vectors.bin`
 file should consist of one word and vector per line.

 ```python
-https://github.com/explosion/spacy-dev-resources/tree/master/training/word_vectors.py
+https://github.com/explosion/spacy/tree/master/bin/train_word_vectors.py
 ```

 If you don't have a large sample of text available, you can also convert word
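The main technical point of the changed passage is that the tokenization used for vector training should match spaCy's. As a minimal sketch of that pre-processing step, assuming a blank English pipeline, illustrative in-memory texts, and the same pre-4.0 gensim API as the script above:

```python
import spacy
from gensim.models import Word2Vec

# Tokenizer-only pipeline for the target language.
nlp = spacy.blank("en")

# Illustrative corpus; in practice this would stream from disk.
texts = ["The quick brown fox jumps over the lazy dog."]
sentences = [[w.text for w in nlp.make_doc(text)] for text in texts]

# Training on spaCy-tokenized sentences keeps the Word2vec vocabulary
# aligned with spaCy's tokenization.
model = Word2Vec(sentences, size=128, window=5, min_count=1, workers=4)
```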