mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 05:01:02 +03:00 
			
		
		
		
	* Refactoring is working for the parser, but we now need to rig up features for NER, and then debug the oracle, etc.
This commit is contained in:
		
							parent
							
								
									4539c70542
								
							
						
					
					
						commit
						ae235e07b9
					
				|  | @ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, | |||
| 
 | ||||
|     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, | ||||
|                  labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) | ||||
|     Config.write(ner_model_dir, 'config', features=feat_set, seed=seed, | ||||
|     Config.write(ner_model_dir, 'config', features='ner', seed=seed, | ||||
|                  labels=Language.EntityTransitionSystem.get_labels(gold_tuples)) | ||||
| 
 | ||||
|     nlp = Language() | ||||
|  | @ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, | |||
|     for itn in range(n_iter): | ||||
|         dep_corr = 0 | ||||
|         pos_corr = 0 | ||||
|         ent_corr = 0 | ||||
|         n_tokens = 0 | ||||
|         for raw_text, segmented_text, annot_tuples in gold_tuples: | ||||
|             if gold_preproc: | ||||
|  | @ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, | |||
|             else: | ||||
|                 sents = [nlp.tokenizer(raw_text)] | ||||
|             for tokens in sents: | ||||
| 
 | ||||
|                 gold = GoldParse(tokens, annot_tuples, nlp.tags, | ||||
|                                  nlp.parser.moves.label_ids, | ||||
|                                  nlp.entity.moves.label_ids) | ||||
| 
 | ||||
|                 gold = GoldParse(tokens, annot_tuples) | ||||
|                 nlp.tagger(tokens) | ||||
|                 #ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold) | ||||
|                 dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold) | ||||
|                 pos_corr += nlp.tagger.train(tokens, gold.tags_) | ||||
|                 pos_corr += nlp.tagger.train(tokens, gold.tags) | ||||
|                 n_tokens += len(tokens) | ||||
|         acc = float(dep_corr) / n_tokens | ||||
|         pos_acc = float(pos_corr) / n_tokens | ||||
|  |  | |||
|  | @ -27,6 +27,13 @@ cdef enum: | |||
|     BREAK | ||||
|     N_MOVES | ||||
| 
 | ||||
| MOVE_NAMES = [None] * N_MOVES | ||||
| MOVE_NAMES[SHIFT] = 'S' | ||||
| MOVE_NAMES[REDUCE] = 'D' | ||||
| MOVE_NAMES[LEFT] = 'L' | ||||
| MOVE_NAMES[RIGHT] = 'R' | ||||
| MOVE_NAMES[BREAK] = 'B' | ||||
| 
 | ||||
| 
 | ||||
| cdef do_func_t[N_MOVES] do_funcs | ||||
| cdef get_cost_func_t[N_MOVES] get_cost_funcs | ||||
|  | @ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem): | |||
|                         move_labels[LEFT][label] = True | ||||
|         return move_labels | ||||
| 
 | ||||
|     cdef int preprocess_gold(self, GoldParse gold) except -1: | ||||
|         for i in range(gold.length): | ||||
|             gold.c_heads[i] = gold.heads[i] | ||||
|             gold.c_labels[i] = self.label_ids[gold.labels[i]] | ||||
| 
 | ||||
| 
 | ||||
|     cdef Transition lookup_transition(self, object name) except *: | ||||
|         if '-' in name: | ||||
|             move_str, label_str = name.split('-', 1) | ||||
|             label = self.label_ids[label_str] | ||||
|         else: | ||||
|             label = 0 | ||||
|         move = MOVE_NAMES.index(move_str) | ||||
|         for i in range(self.n_moves): | ||||
|             if self.c[i].move == move and self.c[i].label == label: | ||||
|                 return self.c[i] | ||||
| 
 | ||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||
|         # constructor with the function pointers | ||||
|  |  | |||
|  | @ -3,31 +3,21 @@ from cymem.cymem cimport Pool | |||
| from ..structs cimport TokenC | ||||
| from .transition_system cimport Transition | ||||
| 
 | ||||
| cimport numpy | ||||
| 
 | ||||
| cdef class GoldParse: | ||||
|     cdef Pool mem | ||||
| 
 | ||||
|     cdef int length | ||||
|     cdef readonly int loss | ||||
|     cdef readonly object ids | ||||
|     cdef readonly object tags | ||||
|     cdef readonly object heads | ||||
|     cdef readonly object labels | ||||
|     cdef readonly list tags | ||||
|     cdef readonly list heads | ||||
|     cdef readonly list labels | ||||
|     cdef readonly list ner | ||||
| 
 | ||||
|     cdef readonly object tags_ | ||||
|     cdef readonly object labels_ | ||||
|     cdef readonly object ner_ | ||||
| 
 | ||||
|     cdef Transition* ner | ||||
|     cdef int* c_tags | ||||
|     cdef int* c_heads | ||||
|     cdef int* c_labels | ||||
|     cdef Transition* c_ner | ||||
| 
 | ||||
|     cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 | ||||
| 
 | ||||
| 
 | ||||
cdef class NERAnnotation:
    # C-level layout of the entity annotation: a memory pool, three int
    # arrays, and the Python-visible (read-only) entity list.
    cdef:
        Pool mem
        int* starts
        int* ends
        int* labels
        readonly list entities
|  |  | |||
|  | @ -34,38 +34,37 @@ def read_docparse_file(loc): | |||
|         sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) | ||||
|     return sents | ||||
| 
 | ||||
| def _parse_line(line): | ||||
|     pieces = line.split() | ||||
|     if len(pieces) == 4: | ||||
|         return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] | ||||
|     else: | ||||
|         id_ = int(pieces[0]) | ||||
|         word = pieces[1] | ||||
|         pos = pieces[3] | ||||
|         iob_ent = pieces[5] | ||||
|         head_idx = int(pieces[6]) | ||||
|         label = pieces[7] | ||||
|         return id_, word, pos, head_idx, label, iob_ent | ||||
| 
 | ||||
| cdef class GoldParse: | ||||
|     def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types): | ||||
|     def __init__(self, tokens, annot_tuples): | ||||
|         self.mem = Pool() | ||||
|         self.loss = 0 | ||||
|         self.length = len(tokens) | ||||
|         self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||
|         self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||
|         self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||
|         self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||
| 
 | ||||
|         self.ids[:] = -1 | ||||
|         self.tags[:] = -1 | ||||
|         self.heads[:] = -1 | ||||
|         self.labels[:] = -1 | ||||
| 
 | ||||
|         self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition)) | ||||
|         # These are filled by the tagger/parser/entity recogniser | ||||
|         self.c_tags = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||
|         self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||
|         self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||
|         self.c_ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition)) | ||||
| 
 | ||||
|         for i in range(len(tokens)): | ||||
|             self.c_heads[i] = -1 | ||||
|             self.c_labels[i] = -1 | ||||
|          | ||||
|         self.tags_ = [None] * len(tokens) | ||||
|         self.labels_ = [None] * len(tokens) | ||||
|         self.ner_ = [None] * len(tokens) | ||||
|         self.tags = [None] * len(tokens) | ||||
|         self.heads = [-1] * len(tokens) | ||||
|         self.labels = ['MISSING'] * len(tokens) | ||||
|         self.ner = [None] * len(tokens) | ||||
| 
 | ||||
|         idx_map = {token.idx: token.i for token in tokens} | ||||
|         print idx_map | ||||
|         # TODO: Fill NER moves | ||||
|         print raw_text | ||||
|         for idx, tag, head, label, ner in zip(*annot_tuples): | ||||
|             if idx < tokens[0].idx: | ||||
|                 pass | ||||
|  | @ -73,16 +72,12 @@ cdef class GoldParse: | |||
|                 break | ||||
|             elif idx in idx_map: | ||||
|                 i = idx_map[idx] | ||||
|                 print i, idx, head, idx_map.get(head, -1) | ||||
|                 self.ids[i] = idx | ||||
|                 self.tags[i] = pos_tags.index(tag) | ||||
|                 self.tags[i] = tag | ||||
|                 self.heads[i] = idx_map.get(head, -1) | ||||
|                 self.labels[i] = dep_labels[label] | ||||
|                 self.c_heads[i] = -1 | ||||
|                 self.c_labels[i] = -1 | ||||
|                 self.tags_[i] = tag | ||||
|                 self.labels_[i] = label | ||||
|                 self.ner_[i] = ner | ||||
|                 self.labels[i] = label | ||||
|                 self.tags[i] = tag | ||||
|                 self.labels[i] = label | ||||
|                 self.ner[i] = ner | ||||
| 
 | ||||
|     @property | ||||
|     def n_non_punct(self): | ||||
|  | @ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads): | |||
|     return mapped | ||||
| 
 | ||||
| 
 | ||||
| def _parse_line(line): | ||||
|     pieces = line.split() | ||||
|     if len(pieces) == 4: | ||||
|         return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] | ||||
|     else: | ||||
|         id_ = int(pieces[0]) | ||||
|         word = pieces[1] | ||||
|         pos = pieces[3] | ||||
|         iob_ent = pieces[5] | ||||
|         head_idx = int(pieces[6]) | ||||
|         label = pieces[7] | ||||
|         return id_, word, pos, head_idx, label, iob_ent | ||||
| 
 | ||||
| 
 | ||||
cdef class NERAnnotation:
    """Entity spans stored as per-token C arrays.

    For every token index covered by an entity, `starts`, `ends` and
    `labels` hold that entity's start index, exclusive end index and label
    ID. Positions not covered by any entity hold -1.
    """
    def __init__(self, entities, length, entity_types):
        """Build the arrays from (start, end, label) triples.

        Args:
            entities: iterable of (start, end, label) integer triples, where
                `end` is exclusive.
            length: number of slots to allocate in each array.
            entity_types: accepted for interface compatibility; not used here.

        Raises:
            ValueError: if a span does not fit inside [0, length).
        """
        cdef int start, end, label
        self.mem = Pool()
        self.starts = <int*>self.mem.alloc(length, sizeof(int))
        self.ends = <int*>self.mem.alloc(length, sizeof(int))
        self.labels = <int*>self.mem.alloc(length, sizeof(int))
        self.entities = entities
        # Filling every byte with 0xFF leaves each int equal to -1.
        memset(self.starts, -1, sizeof(int) * length)
        memset(self.ends, -1, sizeof(int) * length)
        memset(self.labels, -1, sizeof(int) * length)

        for start, end, label in entities:
            # Guard the raw C arrays: an out-of-range span would otherwise
            # write outside the allocated buffers.
            if start < 0 or end > length:
                raise ValueError(
                    "Entity span (%d, %d) out of range for length %d"
                    % (start, end, length))
            for i in range(start, end):
                self.starts[i] = start
                self.ends[i] = end
                self.labels[i] = label

    @property
    def biluo_tags(self):
        # Not implemented yet: evaluates to None.
        pass

    @property
    def iob_tags(self):
        # Not implemented yet: evaluates to None.
        pass

    @classmethod
    def from_iobs(cls, iob_strs, entity_types):
        """Build an annotation from IOB tags by converting them with
        iob_to_biluo and delegating to from_biluos."""
        return cls.from_biluos(iob_to_biluo(iob_strs), entity_types)

    @classmethod
    def from_biluos(cls, tag_strs, entity_types):
        """Build an annotation from BILUO tag strings.

        Tags equal to 'O' or '-' are skipped. Every other tag is split at its
        first hyphen into a move and a label string. 'U' tags produce a
        one-token entity, and a 'B' ... 'L' pair produces an entity spanning
        from the 'B' token through the 'L' token. Other moves add nothing.

        Label strings missing from `entity_types` are appended to it (the
        list is mutated in place), and a label's ID is its index in that list.
        """
        entities = []
        start = None
        for i, tag_str in enumerate(tag_strs):
            if tag_str == 'O' or tag_str == '-':
                continue
            # Split on the first hyphen only, so labels may contain hyphens.
            move, label_str = tag_str.split('-', 1)
            # list.index raises ValueError for unknown items (it never
            # returns -1), so register new labels before looking them up, and
            # store the label string itself rather than its integer ID.
            if label_str not in entity_types:
                entity_types.append(label_str)
            label = entity_types.index(label_str)
            if move == 'U':
                assert start is None
                entities.append((i, i+1, label))
            elif move == 'B':
                assert start is None
                start = i
            elif move == 'L':
                assert start is not None
                entities.append((start, i+1, label))
                start = None
        return cls(entities, len(tag_strs), entity_types)
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -21,6 +21,14 @@ cdef enum: | |||
|     OUT | ||||
|     N_MOVES | ||||
| 
 | ||||
| MOVE_NAMES = [None] * N_MOVES | ||||
| MOVE_NAMES[MISSING] = 'M' | ||||
| MOVE_NAMES[BEGIN] = 'B' | ||||
| MOVE_NAMES[IN] = 'I' | ||||
| MOVE_NAMES[LAST] = 'L' | ||||
| MOVE_NAMES[UNIT] = 'U' | ||||
| MOVE_NAMES[OUT] = 'O' | ||||
| 
 | ||||
| 
 | ||||
| cdef do_func_t[N_MOVES] do_funcs | ||||
| 
 | ||||
|  | @ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|                     move_labels[moves.index(move_str)][label] = True | ||||
|         return move_labels | ||||
| 
 | ||||
|     cdef int preprocess_gold(self, GoldParse gold) except -1: | ||||
|         biluo_strings = iob_to_biluo(gold.ner) | ||||
|         for i in range(gold.length): | ||||
|             gold.c_ner[i] = self.lookup_transition(biluo_strings[i]) | ||||
| 
 | ||||
|     cdef Transition lookup_transition(self, object name) except *: | ||||
|         if '-' in name: | ||||
|             move_str, label_str = name.split('-', 1) | ||||
|             label = self.label_ids[label_str] | ||||
|         else: | ||||
|             move_str = name | ||||
|             label = 0 | ||||
|         move = MOVE_NAMES.index(move_str) | ||||
|         for i in range(self.n_moves): | ||||
|             if self.c[i].move == move and self.c[i].label == label: | ||||
|                 return self.c[i] | ||||
| 
 | ||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||
|         # constructor with the function pointers | ||||
|  | @ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem): | |||
| cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1: | ||||
|     if not _is_valid(self.move, self.label, s): | ||||
|         return 9000 | ||||
|     cdef bint is_sunk = _entity_is_sunk(s, gold.ner) | ||||
|     cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT | ||||
|     return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label, | ||||
|     cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner) | ||||
|     cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT | ||||
|     return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label, | ||||
|                         next_act, is_sunk) | ||||
| 
 | ||||
| cdef bint _is_gold(int act, int tag, int g_act, int g_tag, | ||||
|  |  | |||
|  | @ -34,6 +34,8 @@ from .conll cimport GoldParse | |||
| from . import _parse_features | ||||
| from ._parse_features cimport fill_context, CONTEXT_SIZE | ||||
| 
 | ||||
| from ._ner_features cimport _ner_features | ||||
| 
 | ||||
| 
 | ||||
| DEBUG = False  | ||||
| def set_debug(val): | ||||
|  | @ -55,6 +57,8 @@ def get_templates(name): | |||
|     pf = _parse_features | ||||
|     if name == 'zhang': | ||||
|         return pf.arc_eager | ||||
|     elif name == 'ner': | ||||
|         return _ner_features.basic | ||||
|     else: | ||||
|         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ | ||||
|                 pf.tree_shape + pf.trigrams) | ||||
|  | @ -95,7 +99,8 @@ cdef class GreedyParser: | |||
|             Transition best | ||||
|              | ||||
|             atom_t[CONTEXT_SIZE] context | ||||
|          | ||||
|         | ||||
|         self.moves.preprocess_gold(gold) | ||||
|         cdef Pool mem = Pool() | ||||
|         cdef State* state = init_state(mem, tokens.data, tokens.length) | ||||
|         while not is_final(state): | ||||
|  |  | |||
|  | @ -29,6 +29,10 @@ cdef class TransitionSystem: | |||
|     cdef const Transition* c | ||||
|     cdef readonly int n_moves | ||||
| 
 | ||||
|     cdef int preprocess_gold(self, GoldParse gold) except -1 | ||||
|      | ||||
|     cdef Transition lookup_transition(self, object name) except * | ||||
|      | ||||
|     cdef Transition init_transition(self, int clas, int move, int label) except * | ||||
| 
 | ||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* state) except * | ||||
|  |  | |||
|  | @ -28,6 +28,12 @@ cdef class TransitionSystem: | |||
|         self.label_ids['MISSING'] = -1 | ||||
|         self.c = moves | ||||
| 
 | ||||
|     cdef int preprocess_gold(self, GoldParse gold) except -1: | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     cdef Transition lookup_transition(self, object name) except *: | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||
|         raise NotImplementedError | ||||
| 
 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user