mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-26 13:41:21 +03:00 
			
		
		
		
	Merge branch 'develop' of https://github.com/explosion/spaCy into develop
This commit is contained in:
		
						commit
						647d1a1efc
					
				|  | @ -2,7 +2,7 @@ cython>=0.25 | ||||||
| numpy>=1.15.0 | numpy>=1.15.0 | ||||||
| cymem>=2.0.2,<2.1.0 | cymem>=2.0.2,<2.1.0 | ||||||
| preshed>=2.0.1,<2.1.0 | preshed>=2.0.1,<2.1.0 | ||||||
| thinc==7.0.0.dev0 | thinc==7.0.0.dev1 | ||||||
| blis>=0.2.2,<0.3.0 | blis>=0.2.2,<0.3.0 | ||||||
| murmurhash>=0.28.0,<1.1.0 | murmurhash>=0.28.0,<1.1.0 | ||||||
| cytoolz>=0.9.0,<0.10.0 | cytoolz>=0.9.0,<0.10.0 | ||||||
|  |  | ||||||
							
								
								
									
										2
									
								
								setup.py
									
									
									
									
									
								
							
							
						
						
									
										2
									
								
								setup.py
									
									
									
									
									
								
							|  | @ -200,7 +200,7 @@ def setup_package(): | ||||||
|                 "murmurhash>=0.28.0,<1.1.0", |                 "murmurhash>=0.28.0,<1.1.0", | ||||||
|                 "cymem>=2.0.2,<2.1.0", |                 "cymem>=2.0.2,<2.1.0", | ||||||
|                 "preshed>=2.0.1,<2.1.0", |                 "preshed>=2.0.1,<2.1.0", | ||||||
|                 "thinc==7.0.0.dev0", |                 "thinc==7.0.0.dev1", | ||||||
|                 "blis>=0.2.2,<0.3.0", |                 "blis>=0.2.2,<0.3.0", | ||||||
|                 "plac<1.0.0,>=0.9.6", |                 "plac<1.0.0,>=0.9.6", | ||||||
|                 "ujson>=1.35", |                 "ujson>=1.35", | ||||||
|  |  | ||||||
							
								
								
									
										12
									
								
								spacy/_ml.py
									
									
									
									
									
								
							
							
						
						
									
										12
									
								
								spacy/_ml.py
									
									
									
									
									
								
							|  | @ -48,11 +48,11 @@ def cosine(vec1, vec2): | ||||||
| 
 | 
 | ||||||
| def create_default_optimizer(ops, **cfg): | def create_default_optimizer(ops, **cfg): | ||||||
|     learn_rate = util.env_opt('learn_rate', 0.001) |     learn_rate = util.env_opt('learn_rate', 0.001) | ||||||
|     beta1 = util.env_opt('optimizer_B1', 0.9) |     beta1 = util.env_opt('optimizer_B1', 0.8) | ||||||
|     beta2 = util.env_opt('optimizer_B2', 0.9) |     beta2 = util.env_opt('optimizer_B2', 0.8) | ||||||
|     eps = util.env_opt('optimizer_eps', 1e-12) |     eps = util.env_opt('optimizer_eps', 0.00001) | ||||||
|     L2 = util.env_opt('L2_penalty', 1e-6) |     L2 = util.env_opt('L2_penalty', 1e-6) | ||||||
|     max_grad_norm = util.env_opt('grad_norm_clip', 1.) |     max_grad_norm = util.env_opt('grad_norm_clip', 5.) | ||||||
|     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, |     optimizer = Adam(ops, learn_rate, L2=L2, beta1=beta1, | ||||||
|                      beta2=beta2, eps=eps) |                      beta2=beta2, eps=eps) | ||||||
|     optimizer.max_grad_norm = max_grad_norm |     optimizer.max_grad_norm = max_grad_norm | ||||||
|  | @ -445,11 +445,11 @@ def getitem(i): | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def build_tagger_model(nr_class, **cfg): | def build_tagger_model(nr_class, **cfg): | ||||||
|     embed_size = util.env_opt('embed_size', 7000) |     embed_size = util.env_opt('embed_size', 2000) | ||||||
|     if 'token_vector_width' in cfg: |     if 'token_vector_width' in cfg: | ||||||
|         token_vector_width = cfg['token_vector_width'] |         token_vector_width = cfg['token_vector_width'] | ||||||
|     else: |     else: | ||||||
|         token_vector_width = util.env_opt('token_vector_width', 128) |         token_vector_width = util.env_opt('token_vector_width', 96) | ||||||
|     pretrained_vectors = cfg.get('pretrained_vectors') |     pretrained_vectors = cfg.get('pretrained_vectors') | ||||||
|     subword_features = cfg.get('subword_features', True) |     subword_features = cfg.get('subword_features', True) | ||||||
|     with Model.define_operators({'>>': chain, '+': add}): |     with Model.define_operators({'>>': chain, '+': add}): | ||||||
|  |  | ||||||
|  | @ -24,10 +24,12 @@ import sys | ||||||
| from collections import Counter | from collections import Counter | ||||||
| 
 | 
 | ||||||
| import spacy | import spacy | ||||||
| from spacy.attrs import ID | from spacy.tokens import Doc | ||||||
|  | from spacy.attrs import ID, HEAD | ||||||
| from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path | from spacy.util import minibatch, minibatch_by_words, use_gpu, compounding, ensure_path | ||||||
| from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer | from spacy._ml import Tok2Vec, flatten, chain, zero_init, create_default_optimizer | ||||||
| from thinc.v2v import Affine | from thinc.v2v import Affine | ||||||
|  | from thinc.api import wrap | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def prefer_gpu(): | def prefer_gpu(): | ||||||
|  | @ -47,13 +49,14 @@ def load_texts(path): | ||||||
|     ''' |     ''' | ||||||
|     path = ensure_path(path) |     path = ensure_path(path) | ||||||
|     with path.open('r', encoding='utf8') as file_: |     with path.open('r', encoding='utf8') as file_: | ||||||
|         texts = [json.loads(line)['text'] for line in file_] |         texts = [json.loads(line) for line in file_] | ||||||
|     random.shuffle(texts) |     random.shuffle(texts) | ||||||
|     return texts |     return texts | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def stream_texts(): | def stream_texts(): | ||||||
|     for line in sys.stdin: |     for line in sys.stdin: | ||||||
|         yield json.loads(line)['text'] |         yield json.loads(line) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def make_update(model, docs, optimizer, drop=0.): | def make_update(model, docs, optimizer, drop=0.): | ||||||
|  | @ -65,11 +68,33 @@ def make_update(model, docs, optimizer, drop=0.): | ||||||
|     RETURNS loss: A float for the loss. |     RETURNS loss: A float for the loss. | ||||||
|     """ |     """ | ||||||
|     predictions, backprop = model.begin_update(docs, drop=drop) |     predictions, backprop = model.begin_update(docs, drop=drop) | ||||||
|     loss, gradients = get_vectors_loss(model.ops, docs, predictions) |     gradients = get_vectors_loss(model.ops, docs, predictions) | ||||||
|     backprop(gradients, sgd=optimizer) |     backprop(gradients, sgd=optimizer) | ||||||
|  |     # Don't want to return a cupy object here | ||||||
|  |     # The gradients are modified in-place by the BERT MLM, | ||||||
|  |     # so we get an accurate loss | ||||||
|  |     loss = float((gradients**2).mean()) | ||||||
|     return loss |     return loss | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def make_docs(nlp, batch): | ||||||
|  |     docs = [] | ||||||
|  |     for record in batch: | ||||||
|  |         text = record["text"] | ||||||
|  |         if "tokens" in record: | ||||||
|  |             doc = Doc(nlp.vocab, words=record["tokens"]) | ||||||
|  |         else: | ||||||
|  |             doc = nlp.make_doc(text) | ||||||
|  |         if "heads" in record: | ||||||
|  |             heads = record["heads"] | ||||||
|  |             heads = numpy.asarray(heads, dtype="uint64") | ||||||
|  |             heads = heads.reshape((len(doc), 1)) | ||||||
|  |             doc = doc.from_array([HEAD], heads) | ||||||
|  |         if len(doc) >= 1 and len(doc) < 200: | ||||||
|  |             docs.append(doc) | ||||||
|  |     return docs | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| def get_vectors_loss(ops, docs, prediction): | def get_vectors_loss(ops, docs, prediction): | ||||||
|     """Compute a mean-squared error loss between the documents' vectors and |     """Compute a mean-squared error loss between the documents' vectors and | ||||||
|     the prediction.     |     the prediction.     | ||||||
|  | @ -84,10 +109,8 @@ def get_vectors_loss(ops, docs, prediction): | ||||||
|     # and look them up all at once. This prevents data copying. |     # and look them up all at once. This prevents data copying. | ||||||
|     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) |     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs]) | ||||||
|     target = docs[0].vocab.vectors.data[ids] |     target = docs[0].vocab.vectors.data[ids] | ||||||
|     d_scores = (prediction - target) / prediction.shape[0] |     d_scores = prediction - target | ||||||
|     # Don't want to return a cupy object here |     return d_scores | ||||||
|     loss = float((d_scores**2).sum()) |  | ||||||
|     return loss, d_scores |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def create_pretraining_model(nlp, tok2vec): | def create_pretraining_model(nlp, tok2vec): | ||||||
|  | @ -107,15 +130,77 @@ def create_pretraining_model(nlp, tok2vec): | ||||||
|         tok2vec, |         tok2vec, | ||||||
|         output_layer |         output_layer | ||||||
|     ) |     ) | ||||||
|  |     model = masked_language_model(nlp.vocab, model) | ||||||
|     model.tok2vec = tok2vec |     model.tok2vec = tok2vec | ||||||
|     model.output_layer = output_layer |     model.output_layer = output_layer | ||||||
|     model.begin_training([nlp.make_doc('Give it a doc to infer shapes')]) |     model.begin_training([nlp.make_doc('Give it a doc to infer shapes')]) | ||||||
|     return model |     return model | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | def masked_language_model(vocab, model, mask_prob=0.15): | ||||||
|  |     '''Convert a model into a BERT-style masked language model''' | ||||||
|  |     vocab_words = [lex.text for lex in vocab if lex.prob != 0.0] | ||||||
|  |     vocab_probs = [lex.prob for lex in vocab if lex.prob != 0.0] | ||||||
|  |     vocab_words = vocab_words[:10000] | ||||||
|  |     vocab_probs = vocab_probs[:10000] | ||||||
|  |     vocab_probs = numpy.exp(numpy.array(vocab_probs, dtype='f')) | ||||||
|  |     vocab_probs /= vocab_probs.sum() | ||||||
|  |      | ||||||
|  |     def mlm_forward(docs, drop=0.): | ||||||
|  |         mask, docs = apply_mask(docs, vocab_words, vocab_probs, | ||||||
|  |                                 mask_prob=mask_prob) | ||||||
|  |         mask = model.ops.asarray(mask).reshape((mask.shape[0], 1)) | ||||||
|  |         output, backprop = model.begin_update(docs, drop=drop) | ||||||
|  | 
 | ||||||
|  |         def mlm_backward(d_output, sgd=None): | ||||||
|  |             d_output *= 1-mask | ||||||
|  |             return backprop(d_output, sgd=sgd) | ||||||
|  | 
 | ||||||
|  |         return output, mlm_backward | ||||||
|  | 
 | ||||||
|  |     return wrap(mlm_forward, model) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def apply_mask(docs, vocab_texts, vocab_probs, mask_prob=0.15): | ||||||
|  |     N = sum(len(doc) for doc in docs) | ||||||
|  |     mask = numpy.random.uniform(0., 1.0, (N,)) | ||||||
|  |     mask = mask >= mask_prob | ||||||
|  |     i = 0 | ||||||
|  |     masked_docs = [] | ||||||
|  |     for doc in docs: | ||||||
|  |         words = [] | ||||||
|  |         for token in doc: | ||||||
|  |             if not mask[i]: | ||||||
|  |                 word = replace_word(token.text, vocab_texts, vocab_probs) | ||||||
|  |             else: | ||||||
|  |                 word = token.text | ||||||
|  |             words.append(word) | ||||||
|  |             i += 1 | ||||||
|  |         spaces = [bool(w.whitespace_) for w in doc] | ||||||
|  |         # NB: If you change this implementation to instead modify | ||||||
|  |         # the docs in place, take care that the IDs reflect the original | ||||||
|  |         # words. Currently we use the original docs to make the vectors | ||||||
|  |         # for the target, so we don't lose the original tokens. But if | ||||||
|  |         # you modified the docs in place here, you would. | ||||||
|  |         masked_docs.append(Doc(doc.vocab, words=words, spaces=spaces)) | ||||||
|  |     return mask, masked_docs | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def replace_word(word, vocab_texts, vocab_probs, mask='[MASK]'): | ||||||
|  |     roll = random.random() | ||||||
|  |     if roll < 0.8: | ||||||
|  |         return mask | ||||||
|  |     elif roll < 0.9: | ||||||
|  |         index = numpy.random.choice(len(vocab_texts), p=vocab_probs) | ||||||
|  |         return vocab_texts[index] | ||||||
|  |     else: | ||||||
|  |         return word | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
| class ProgressTracker(object): | class ProgressTracker(object): | ||||||
|     def __init__(self, frequency=100000): |     def __init__(self, frequency=100000): | ||||||
|         self.loss = 0. |         self.loss = 0.0 | ||||||
|  |         self.prev_loss = 0.0 | ||||||
|         self.nr_word = 0 |         self.nr_word = 0 | ||||||
|         self.words_per_epoch = Counter() |         self.words_per_epoch = Counter() | ||||||
|         self.frequency = frequency |         self.frequency = frequency | ||||||
|  | @ -132,7 +217,15 @@ class ProgressTracker(object): | ||||||
|             wps = words_since_update / (time.time() - self.last_time) |             wps = words_since_update / (time.time() - self.last_time) | ||||||
|             self.last_update = self.nr_word |             self.last_update = self.nr_word | ||||||
|             self.last_time = time.time() |             self.last_time = time.time() | ||||||
|             status = (epoch, self.nr_word, '%.5f' % self.loss, int(wps)) |             loss_per_word = self.loss - self.prev_loss | ||||||
|  |             status = ( | ||||||
|  |                 epoch, | ||||||
|  |                 self.nr_word, | ||||||
|  |                 "%.5f" % self.loss, | ||||||
|  |                 "%.4f" % loss_per_word, | ||||||
|  |                 int(wps), | ||||||
|  |             ) | ||||||
|  |             self.prev_loss = float(self.loss) | ||||||
|             return status |             return status | ||||||
|         else: |         else: | ||||||
|             return None |             return None | ||||||
|  | @ -145,12 +238,13 @@ class ProgressTracker(object): | ||||||
|     width=("Width of CNN layers", "option", "cw", int), |     width=("Width of CNN layers", "option", "cw", int), | ||||||
|     depth=("Depth of CNN layers", "option", "cd", int), |     depth=("Depth of CNN layers", "option", "cd", int), | ||||||
|     embed_rows=("Embedding rows", "option", "er", int), |     embed_rows=("Embedding rows", "option", "er", int), | ||||||
|  |     use_vectors=("Whether to use the static vectors as input features", "flag", "uv"), | ||||||
|     dropout=("Dropout", "option", "d", float), |     dropout=("Dropout", "option", "d", float), | ||||||
|     seed=("Seed for random number generators", "option", "s", float), |     seed=("Seed for random number generators", "option", "s", float), | ||||||
|     nr_iter=("Number of iterations to pretrain", "option", "i", int), |     nr_iter=("Number of iterations to pretrain", "option", "i", int), | ||||||
| ) | ) | ||||||
| def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, | def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, | ||||||
|         embed_rows=1000, dropout=0.2, nr_iter=10, seed=0): |         embed_rows=5000, use_vectors=False, dropout=0.2, nr_iter=100, seed=0): | ||||||
|     """ |     """ | ||||||
|     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, |     Pre-train the 'token-to-vector' (tok2vec) layer of pipeline components, | ||||||
|     using an approximate language-modelling objective. Specifically, we load |     using an approximate language-modelling objective. Specifically, we load | ||||||
|  | @ -175,11 +269,13 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, | ||||||
|     with (output_dir / 'config.json').open('w') as file_: |     with (output_dir / 'config.json').open('w') as file_: | ||||||
|         file_.write(json.dumps(config)) |         file_.write(json.dumps(config)) | ||||||
|     has_gpu = prefer_gpu() |     has_gpu = prefer_gpu() | ||||||
|  |     print("Use GPU?", has_gpu) | ||||||
|     nlp = spacy.load(vectors_model) |     nlp = spacy.load(vectors_model) | ||||||
|  |     pretrained_vectors = None if not use_vectors else nlp.vocab.vectors.name | ||||||
|     model = create_pretraining_model(nlp, |     model = create_pretraining_model(nlp, | ||||||
|                 Tok2Vec(width, embed_rows, |                 Tok2Vec(width, embed_rows, | ||||||
|                     conv_depth=depth, |                     conv_depth=depth, | ||||||
|                     pretrained_vectors=nlp.vocab.vectors.name, |                     pretrained_vectors=pretrained_vectors, | ||||||
|                     bilstm_depth=0, # Requires PyTorch. Experimental. |                     bilstm_depth=0, # Requires PyTorch. Experimental. | ||||||
|                     cnn_maxout_pieces=2, # You can try setting this higher |                     cnn_maxout_pieces=2, # You can try setting this higher | ||||||
|                     subword_features=True)) # Set to False for character models, e.g. Chinese |                     subword_features=True)) # Set to False for character models, e.g. Chinese | ||||||
|  | @ -188,19 +284,19 @@ def pretrain(texts_loc, vectors_model, output_dir, width=128, depth=4, | ||||||
|     print('Epoch', '#Words', 'Loss', 'w/s') |     print('Epoch', '#Words', 'Loss', 'w/s') | ||||||
|     texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)  |     texts = stream_texts() if texts_loc == '-' else load_texts(texts_loc)  | ||||||
|     for epoch in range(nr_iter): |     for epoch in range(nr_iter): | ||||||
|         for batch in minibatch(texts, size=64): |         for batch in minibatch(texts, size=256): | ||||||
|             docs = [nlp.make_doc(text) for text in batch] |             docs = make_docs(nlp, batch) | ||||||
|             loss = make_update(model, docs, optimizer, drop=dropout) |             loss = make_update(model, docs, optimizer, drop=dropout) | ||||||
|             progress = tracker.update(epoch, loss, docs) |             progress = tracker.update(epoch, loss, docs) | ||||||
|             if progress: |             if progress: | ||||||
|                 print(*progress) |                 print(*progress) | ||||||
|                 if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**6: |                 if texts_loc == '-' and tracker.words_per_epoch[epoch] >= 10**7: | ||||||
|                     break |                     break | ||||||
|         with model.use_params(optimizer.averages): |         with model.use_params(optimizer.averages): | ||||||
|             with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: |             with (output_dir / ('model%d.bin' % epoch)).open('wb') as file_: | ||||||
|                 file_.write(model.tok2vec.to_bytes()) |                 file_.write(model.tok2vec.to_bytes()) | ||||||
|             with (output_dir / 'log.jsonl').open('a') as file_: |             with (output_dir / 'log.jsonl').open('a') as file_: | ||||||
|                 file_.write(json.dumps({'nr_word': tracker.nr_word, |                 file_.write(json.dumps({'nr_word': tracker.nr_word, | ||||||
|                     'loss': tracker.loss, 'epoch': epoch})) |                     'loss': tracker.loss, 'epoch': epoch}) + '\n') | ||||||
|         if texts_loc != '-': |         if texts_loc != '-': | ||||||
|             texts = load_texts(texts_loc) |             texts = load_texts(texts_loc) | ||||||
|  |  | ||||||
|  | @ -90,11 +90,11 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, | ||||||
|     # starts high and decays sharply, to force the optimizer to explore. |     # starts high and decays sharply, to force the optimizer to explore. | ||||||
|     # Batch size starts at 1 and grows, so that we make updates quickly |     # Batch size starts at 1 and grows, so that we make updates quickly | ||||||
|     # at the beginning of training. |     # at the beginning of training. | ||||||
|     dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2), |     dropout_rates = util.decaying(util.env_opt('dropout_from', 0.1), | ||||||
|                                   util.env_opt('dropout_to', 0.2), |                                   util.env_opt('dropout_to', 0.1), | ||||||
|                                   util.env_opt('dropout_decay', 0.0)) |                                   util.env_opt('dropout_decay', 0.0)) | ||||||
|     batch_sizes = util.compounding(util.env_opt('batch_from', 1000), |     batch_sizes = util.compounding(util.env_opt('batch_from', 750), | ||||||
|                                    util.env_opt('batch_to', 1000), |                                    util.env_opt('batch_to', 750), | ||||||
|                                    util.env_opt('batch_compound', 1.001)) |                                    util.env_opt('batch_compound', 1.001)) | ||||||
|     lang_class = util.get_lang_class(lang) |     lang_class = util.get_lang_class(lang) | ||||||
|     nlp = lang_class() |     nlp = lang_class() | ||||||
|  |  | ||||||
|  | @ -25,6 +25,7 @@ from .compat import json_dumps | ||||||
| 
 | 
 | ||||||
| from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek | from libc.stdio cimport FILE, fopen, fclose, fread, fwrite, feof, fseek | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
| def tags_to_entities(tags): | def tags_to_entities(tags): | ||||||
|     entities = [] |     entities = [] | ||||||
|     start = None |     start = None | ||||||
|  | @ -110,19 +111,23 @@ class GoldCorpus(object): | ||||||
|         # Write temp directory with one doc per file, so we can shuffle |         # Write temp directory with one doc per file, so we can shuffle | ||||||
|         # and stream |         # and stream | ||||||
|         self.tmp_dir = Path(tempfile.mkdtemp()) |         self.tmp_dir = Path(tempfile.mkdtemp()) | ||||||
|         self.write_msgpack(self.tmp_dir / 'train', train) |         self.write_msgpack(self.tmp_dir / 'train', train, limit=self.limit) | ||||||
|         self.write_msgpack(self.tmp_dir / 'dev', dev) |         self.write_msgpack(self.tmp_dir / 'dev', dev, limit=self.limit) | ||||||
| 
 | 
 | ||||||
|     def __del__(self): |     def __del__(self): | ||||||
|         shutil.rmtree(self.tmp_dir) |         shutil.rmtree(self.tmp_dir) | ||||||
| 
 | 
 | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def write_msgpack(directory, doc_tuples): |     def write_msgpack(directory, doc_tuples, limit=0): | ||||||
|         if not directory.exists(): |         if not directory.exists(): | ||||||
|             directory.mkdir() |             directory.mkdir() | ||||||
|  |         n = 0 | ||||||
|         for i, doc_tuple in enumerate(doc_tuples): |         for i, doc_tuple in enumerate(doc_tuples): | ||||||
|             with open(directory / '{}.msg'.format(i), 'wb') as file_: |             with open(directory / '{}.msg'.format(i), 'wb') as file_: | ||||||
|                 msgpack.dump([doc_tuple], file_, use_bin_type=True, encoding='utf8') |                 msgpack.dump([doc_tuple], file_, use_bin_type=True) | ||||||
|  |             n += len(doc_tuple[1]) | ||||||
|  |             if limit and n >= limit: | ||||||
|  |                 break | ||||||
|      |      | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     def walk_corpus(path): |     def walk_corpus(path): | ||||||
|  | @ -153,7 +158,7 @@ class GoldCorpus(object): | ||||||
|                 gold_tuples = read_json_file(loc) |                 gold_tuples = read_json_file(loc) | ||||||
|             elif loc.parts[-1].endswith('msg'): |             elif loc.parts[-1].endswith('msg'): | ||||||
|                 with loc.open('rb') as file_: |                 with loc.open('rb') as file_: | ||||||
|                     gold_tuples = msgpack.load(file_, encoding='utf8') |                     gold_tuples = msgpack.load(file_, raw=False) | ||||||
|             else: |             else: | ||||||
|                 msg = "Cannot read from file: %s. Supported formats: .json, .msg" |                 msg = "Cannot read from file: %s. Supported formats: .json, .msg" | ||||||
|                 raise ValueError(msg % loc) |                 raise ValueError(msg % loc) | ||||||
|  | @ -350,7 +355,7 @@ def _json_iterate(loc): | ||||||
|                 py_str = py_raw[start : i+1].decode('utf8') |                 py_str = py_raw[start : i+1].decode('utf8') | ||||||
|                 try: |                 try: | ||||||
|                     yield json.loads(py_str) |                     yield json.loads(py_str) | ||||||
|                 except: |                 except Exception: | ||||||
|                     print(py_str) |                     print(py_str) | ||||||
|                     raise |                     raise | ||||||
|                 start = -1 |                 start = -1 | ||||||
|  |  | ||||||
|  | @ -759,7 +759,7 @@ class Tagger(Pipe): | ||||||
|             if self.model is True: |             if self.model is True: | ||||||
|                 token_vector_width = util.env_opt( |                 token_vector_width = util.env_opt( | ||||||
|                     'token_vector_width', |                     'token_vector_width', | ||||||
|                     self.cfg.get('token_vector_width', 128)) |                     self.cfg.get('token_vector_width', 96)) | ||||||
|                 self.model = self.Model(self.vocab.morphology.n_tags, |                 self.model = self.Model(self.vocab.morphology.n_tags, | ||||||
|                                         **self.cfg) |                                         **self.cfg) | ||||||
|             self.model.from_bytes(b) |             self.model.from_bytes(b) | ||||||
|  | @ -878,7 +878,7 @@ class MultitaskObjective(Tagger): | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def Model(cls, n_tags, tok2vec=None, **cfg): |     def Model(cls, n_tags, tok2vec=None, **cfg): | ||||||
|         token_vector_width = util.env_opt('token_vector_width', 128) |         token_vector_width = util.env_opt('token_vector_width', 96) | ||||||
|         softmax = Softmax(n_tags, token_vector_width) |         softmax = Softmax(n_tags, token_vector_width) | ||||||
|         model = chain( |         model = chain( | ||||||
|             tok2vec, |             tok2vec, | ||||||
|  |  | ||||||
|  | @ -63,9 +63,9 @@ cdef class Parser: | ||||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', |         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', | ||||||
|                                             cfg.get('maxout_pieces', 2)) |                                             cfg.get('maxout_pieces', 2)) | ||||||
|         token_vector_width = util.env_opt('token_vector_width', |         token_vector_width = util.env_opt('token_vector_width', | ||||||
|                                            cfg.get('token_vector_width', 128)) |                                            cfg.get('token_vector_width', 96)) | ||||||
|         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 128)) |         hidden_width = util.env_opt('hidden_width', cfg.get('hidden_width', 64)) | ||||||
|         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 5000)) |         embed_size = util.env_opt('embed_size', cfg.get('embed_size', 2000)) | ||||||
|         pretrained_vectors = cfg.get('pretrained_vectors', None) |         pretrained_vectors = cfg.get('pretrained_vectors', None) | ||||||
|         tok2vec = Tok2Vec(token_vector_width, embed_size, |         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||||
|                           conv_depth=conv_depth, |                           conv_depth=conv_depth, | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user