mirror of
https://github.com/explosion/spaCy.git
synced 2025-07-04 03:43:09 +03:00
Fix pretrain
This commit is contained in:
parent
3e7b214e57
commit
6af6950e46
|
@@ -21,6 +21,7 @@ import time
|
||||||
import ujson as json
|
import ujson as json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import sys
|
import sys
|
||||||
|
from collections import Counter
|
||||||
|
|
||||||
import spacy
|
import spacy
|
||||||
from spacy.attrs import ID
|
from spacy.attrs import ID
|
||||||
|
@@ -179,7 +180,7 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4,
|
||||||
optimizer = create_default_optimizer(model.ops)
|
optimizer = create_default_optimizer(model.ops)
|
||||||
tracker = ProgressTracker()
|
tracker = ProgressTracker()
|
||||||
print('Epoch', '#Words', 'Loss', 'w/s')
|
print('Epoch', '#Words', 'Loss', 'w/s')
|
||||||
texts = stream_texts() if text_loc == '-' else load_texts(texts_loc)
|
texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)
|
||||||
for epoch in range(nr_iter):
|
for epoch in range(nr_iter):
|
||||||
for batch in minibatch_by_words(texts, tuples=False, size=50000):
|
for batch in minibatch_by_words(texts, tuples=False, size=50000):
|
||||||
docs = [nlp.make_doc(text) for text in batch]
|
docs = [nlp.make_doc(text) for text in batch]
|
||||||
|
|
Loading…
Reference in New Issue
Block a user