mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Fix conflict on convert.py
This commit is contained in:
		
						commit
						d06f235fc9
					
				|  | @ -57,9 +57,9 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|     # starts high and decays sharply, to force the optimizer to explore. | ||||
|     # Batch size starts at 1 and grows, so that we make updates quickly | ||||
|     # at the beginning of training. | ||||
|     dropout_rates = util.decaying(util.env_opt('dropout_from', 0.5), | ||||
|     dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2), | ||||
|                                   util.env_opt('dropout_to', 0.2), | ||||
|                                   util.env_opt('dropout_decay', 1e-4)) | ||||
|                                   util.env_opt('dropout_decay', 0.0)) | ||||
|     batch_sizes = util.compounding(util.env_opt('batch_from', 1), | ||||
|                                    util.env_opt('batch_to', 64), | ||||
|                                    util.env_opt('batch_compound', 1.001)) | ||||
|  | @ -71,23 +71,30 @@ def train(_, lang, output_dir, train_data, dev_data, n_iter=20, n_sents=0, | |||
|     optimizer = nlp.begin_training(lambda: corpus.train_tuples, use_gpu=use_gpu) | ||||
| 
 | ||||
|     print("Itn.\tDep. Loss\tUAS\tNER P.\tNER R.\tNER F.\tTag %\tToken %") | ||||
|     for i in range(n_iter): | ||||
|         with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: | ||||
|             train_docs = corpus.train_docs(nlp, projectivize=True, | ||||
|                                            gold_preproc=False, shuffle=i) | ||||
|             losses = {} | ||||
|             for batch in minibatch(train_docs, size=batch_sizes): | ||||
|                 docs, golds = zip(*batch) | ||||
|                 nlp.update(docs, golds, sgd=optimizer, | ||||
|                            drop=next(dropout_rates), losses=losses) | ||||
|                 pbar.update(len(docs)) | ||||
|     try: | ||||
|         for i in range(n_iter): | ||||
|             with tqdm.tqdm(total=corpus.count_train(), leave=False) as pbar: | ||||
|                 train_docs = corpus.train_docs(nlp, projectivize=True, | ||||
|                                                gold_preproc=False, max_length=1000) | ||||
|                 losses = {} | ||||
|                 for batch in minibatch(train_docs, size=batch_sizes): | ||||
|                     docs, golds = zip(*batch) | ||||
|                     nlp.update(docs, golds, sgd=optimizer, | ||||
|                                drop=next(dropout_rates), losses=losses) | ||||
|                     pbar.update(len(docs)) | ||||
| 
 | ||||
|         with nlp.use_params(optimizer.averages): | ||||
|             scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) | ||||
|         print_progress(i, losses, scorer.scores) | ||||
|     with (output_path / 'model.bin').open('wb') as file_: | ||||
|         with nlp.use_params(optimizer.averages): | ||||
|             dill.dump(nlp, file_, -1) | ||||
|             with nlp.use_params(optimizer.averages): | ||||
|                 scorer = nlp.evaluate(corpus.dev_docs(nlp, gold_preproc=False)) | ||||
|                 with (output_path / ('model%d.pickle' % i)).open('wb') as file_: | ||||
|                     dill.dump(nlp, file_, -1) | ||||
| 
 | ||||
| 
 | ||||
|             print_progress(i, losses, scorer.scores) | ||||
|     finally: | ||||
|         print("Saving model...") | ||||
|         with (output_path / 'model-final.pickle').open('wb') as file_: | ||||
|             with nlp.use_params(optimizer.averages): | ||||
|                 dill.dump(nlp, file_, -1) | ||||
| 
 | ||||
| 
 | ||||
| def _render_parses(i, to_render): | ||||
|  |  | |||
|  | @ -198,15 +198,15 @@ class GoldCorpus(object): | |||
|             n += 1 | ||||
|         return n | ||||
| 
 | ||||
|     def train_docs(self, nlp, shuffle=0, gold_preproc=False, | ||||
|                    projectivize=False): | ||||
|     def train_docs(self, nlp, gold_preproc=False, | ||||
|                    projectivize=False, max_length=None): | ||||
|         train_tuples = self.train_tuples | ||||
|         if projectivize: | ||||
|             train_tuples = nonproj.preprocess_training_data( | ||||
|                                self.train_tuples) | ||||
|         if shuffle: | ||||
|             random.shuffle(train_tuples) | ||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc) | ||||
|         random.shuffle(train_tuples) | ||||
|         gold_docs = self.iter_gold_docs(nlp, train_tuples, gold_preproc, | ||||
|                                         max_length=max_length) | ||||
|         yield from gold_docs | ||||
| 
 | ||||
|     def dev_docs(self, nlp, gold_preproc=False): | ||||
|  | @ -215,7 +215,7 @@ class GoldCorpus(object): | |||
|         yield from gold_docs | ||||
| 
 | ||||
|     @classmethod | ||||
|     def iter_gold_docs(cls, nlp, tuples, gold_preproc): | ||||
|     def iter_gold_docs(cls, nlp, tuples, gold_preproc, max_length=None): | ||||
|         for raw_text, paragraph_tuples in tuples: | ||||
|             if gold_preproc: | ||||
|                 raw_text = None | ||||
|  | @ -226,7 +226,8 @@ class GoldCorpus(object): | |||
|                                   gold_preproc) | ||||
|             golds = cls._make_golds(docs, paragraph_tuples) | ||||
|             for doc, gold in zip(docs, golds): | ||||
|                 yield doc, gold | ||||
|                 if not max_length or len(doc) < max_length: | ||||
|                     yield doc, gold | ||||
| 
 | ||||
|     @classmethod | ||||
|     def _make_docs(cls, nlp, raw_text, paragraph_tuples, gold_preproc): | ||||
|  |  | |||
|  | @ -223,8 +223,7 @@ class Language(object): | |||
|             tokvecses, bp_tokvecses = tok2vec.model.begin_update(feats, drop=drop) | ||||
|             d_tokvecses = proc.update((docs, tokvecses), golds, | ||||
|                                       drop=drop, sgd=get_grads, losses=losses) | ||||
|             bp_tokvecses(d_tokvecses, sgd=get_grads) | ||||
|             break | ||||
|             bp_tokvecses(d_tokvecses, sgd=sgd) | ||||
|         for key, (W, dW) in grads.items(): | ||||
|             sgd(W, dW, key=key) | ||||
|         # Clear the tensor variable, to free GPU memory. | ||||
|  |  | |||
|  | @ -345,6 +345,7 @@ cdef cppclass StateC: | |||
|         this._s_i = src._s_i | ||||
|         this._e_i = src._e_i | ||||
|         this._break = src._break | ||||
|         this.offset = src.offset | ||||
| 
 | ||||
|     void fast_forward() nogil: | ||||
|         # space token attachment policy: | ||||
|  |  | |||
|  | @ -350,8 +350,15 @@ cdef class ArcEager(TransitionSystem): | |||
|         def __get__(self): | ||||
|             return (SHIFT, REDUCE, LEFT, RIGHT, BREAK) | ||||
| 
 | ||||
|     def has_gold(self, GoldParse gold, start=0, end=None): | ||||
|         end = end or len(gold.heads) | ||||
|         if all([tag is None for tag in gold.heads[start:end]]): | ||||
|             return False | ||||
|         else: | ||||
|             return True | ||||
| 
 | ||||
|     def preprocess_gold(self, GoldParse gold): | ||||
|         if all([h is None for h in gold.heads]): | ||||
|         if not self.has_gold(gold): | ||||
|             return None | ||||
|         for i in range(gold.length): | ||||
|             if gold.heads[i] is None: # Missing values | ||||
|  |  | |||
|  | @ -95,8 +95,15 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|         else: | ||||
|             return MOVE_NAMES[move] + '-' + self.strings[label] | ||||
| 
 | ||||
|     def has_gold(self, GoldParse gold, start=0, end=None): | ||||
|         end = end or len(gold.ner) | ||||
|         if all([tag == '-' for tag in gold.ner[start:end]]): | ||||
|             return False | ||||
|         else: | ||||
|             return True | ||||
| 
 | ||||
|     def preprocess_gold(self, GoldParse gold): | ||||
|         if all([tag == '-' for tag in gold.ner]): | ||||
|         if not self.has_gold(gold): | ||||
|             return None | ||||
|         for i in range(gold.length): | ||||
|             gold.c.ner[i] = self.lookup_transition(gold.ner[i]) | ||||
|  |  | |||
|  | @ -427,8 +427,7 @@ cdef class Parser: | |||
| 
 | ||||
|         cuda_stream = get_cuda_stream() | ||||
| 
 | ||||
|         states, golds = self._init_gold_batch(docs, golds) | ||||
|         max_length = min([len(doc) for doc in docs]) | ||||
|         states, golds, max_length = self._init_gold_batch(docs, golds) | ||||
|         state2vec, vec2scores = self.get_batch_model(len(states), tokvecs, cuda_stream, | ||||
|                                                       0.0) | ||||
|         todo = [(s, g) for (s, g) in zip(states, golds) | ||||
|  | @ -472,46 +471,36 @@ cdef class Parser: | |||
|             backprops, sgd, cuda_stream) | ||||
|         return self.model[0].ops.unflatten(d_tokvecs, [len(d) for d in docs]) | ||||
| 
 | ||||
|     def _init_gold_batch(self, docs, golds): | ||||
|     def _init_gold_batch(self, whole_docs, whole_golds): | ||||
|         """Make a square batch whose segment length N is the length of the | ||||
|         shortest doc, clamped to the range 5-20 tokens. A long doc will get | ||||
|         multiple states. Let's say we have a doc of length 2*N. We'll make | ||||
|         two states, one representing long_doc[:N], and another representing | ||||
|         long_doc[N:].""" | ||||
|         cdef StateClass state | ||||
|         lengths = [len(doc) for doc in docs] | ||||
|         min_length = min(lengths) | ||||
|         offset = 0 | ||||
|         cdef: | ||||
|             StateClass state | ||||
|             Transition action | ||||
|         whole_states = self.moves.init_batch(whole_docs) | ||||
|         max_length = max(5, min(20, min([len(doc) for doc in whole_docs]))) | ||||
|         states = [] | ||||
|         extra_golds = [] | ||||
|         cdef Pool mem = Pool() | ||||
|         costs = <float*>mem.alloc(self.moves.n_moves, sizeof(float)) | ||||
|         is_valid = <int*>mem.alloc(self.moves.n_moves, sizeof(int)) | ||||
|         for doc, gold in zip(docs, golds): | ||||
|         golds = [] | ||||
|         for doc, state, gold in zip(whole_docs, whole_states, whole_golds): | ||||
|             gold = self.moves.preprocess_gold(gold) | ||||
|             state = StateClass(doc, offset=offset) | ||||
|             self.moves.initialize_state(state.c) | ||||
|             if not state.is_final(): | ||||
|                 states.append(state) | ||||
|                 extra_golds.append(gold) | ||||
|             start = min(min_length, len(doc)) | ||||
|             if gold is None: | ||||
|                 continue | ||||
|             oracle_actions = self.moves.get_oracle_sequence(doc, gold) | ||||
|             start = 0 | ||||
|             while start < len(doc): | ||||
|                 length = min(min_length, len(doc)-start) | ||||
|                 state = StateClass(doc, offset=offset) | ||||
|                 self.moves.initialize_state(state.c) | ||||
|                 state = state.copy() | ||||
|                 while state.B(0) < start and not state.is_final(): | ||||
|                     self.moves.set_costs(is_valid, costs, state, gold) | ||||
|                     for i in range(self.moves.n_moves): | ||||
|                         if is_valid[i] and costs[i] <= 0: | ||||
|                             self.moves.c[i].do(state.c, self.moves.c[i].label) | ||||
|                             break | ||||
|                     else: | ||||
|                         raise ValueError("Could not find gold move") | ||||
|                 start += length | ||||
|                 if not state.is_final(): | ||||
|                     action = self.moves.c[oracle_actions.pop(0)] | ||||
|                     action.do(state.c, action.label) | ||||
|                 has_gold = self.moves.has_gold(gold, start=start, | ||||
|                                                end=start+max_length) | ||||
|                 if not state.is_final() and has_gold: | ||||
|                     states.append(state) | ||||
|                     extra_golds.append(gold) | ||||
|             offset += len(doc) | ||||
|         return states, extra_golds | ||||
|                     golds.append(gold) | ||||
|                 start += min(max_length, len(doc)-start) | ||||
|         return states, golds, max_length | ||||
| 
 | ||||
|     def _make_updates(self, d_tokvecs, backprops, sgd, cuda_stream=None): | ||||
|         # Tells CUDA to block, so our async copies complete. | ||||
|  |  | |||
|  | @ -41,6 +41,11 @@ cdef class StateClass: | |||
|     def is_final(self): | ||||
|         return self.c.is_final() | ||||
| 
 | ||||
|     def copy(self): | ||||
|         cdef StateClass new_state = StateClass.init(self.c._sent, self.c.length) | ||||
|         new_state.c.clone(self.c) | ||||
|         return new_state | ||||
| 
 | ||||
|     def print_state(self, words): | ||||
|         words = list(words) + ['_'] | ||||
|         top = words[self.S(0)] + '_%d' % self.S_(0).head | ||||
|  |  | |||
|  | @ -61,6 +61,24 @@ cdef class TransitionSystem: | |||
|             offset += len(doc) | ||||
|         return states | ||||
| 
 | ||||
|     def get_oracle_sequence(self, doc, GoldParse gold): | ||||
|         cdef Pool mem = Pool() | ||||
|         costs = <float*>mem.alloc(self.n_moves, sizeof(float)) | ||||
|         is_valid = <int*>mem.alloc(self.n_moves, sizeof(int)) | ||||
| 
 | ||||
|         cdef StateClass state = StateClass(doc, offset=0) | ||||
|         self.initialize_state(state.c) | ||||
|         history = [] | ||||
|         while not state.is_final(): | ||||
|             self.set_costs(is_valid, costs, state, gold) | ||||
|             for i in range(self.n_moves): | ||||
|                 if is_valid[i] and costs[i] <= 0: | ||||
|                     action = self.c[i] | ||||
|                     history.append(i) | ||||
|                     action.do(state.c, action.label) | ||||
|                     break | ||||
|         return history | ||||
| 
 | ||||
|     cdef int initialize_state(self, StateC* state) nogil: | ||||
|         pass | ||||
| 
 | ||||
|  | @ -92,11 +110,21 @@ cdef class TransitionSystem: | |||
|                        StateClass stcls, GoldParse gold) except -1: | ||||
|         cdef int i | ||||
|         self.set_valid(is_valid, stcls.c) | ||||
|         cdef int n_gold = 0 | ||||
|         for i in range(self.n_moves): | ||||
|             if is_valid[i]: | ||||
|                 costs[i] = self.c[i].get_cost(stcls, &gold.c, self.c[i].label) | ||||
|                 n_gold += costs[i] <= 0 | ||||
|             else: | ||||
|                 costs[i] = 9000 | ||||
|         if n_gold <= 0: | ||||
|             print(gold.words) | ||||
|             print(gold.ner) | ||||
|             raise ValueError( | ||||
|                 "Could not find a gold-standard action to supervise " | ||||
|                 "the entity recognizer\n" | ||||
|                 "The transition system has %d actions.\n" | ||||
|                 "%s" % (self.n_moves)) | ||||
| 
 | ||||
|     def add_action(self, int action, label): | ||||
|         if not isinstance(label, int): | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user