mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	* Refactoring now works for the parser; still need to rig up features for NER, and then debug the oracle, etc.
This commit is contained in:
		
							parent
							
								
									4539c70542
								
							
						
					
					
						commit
						ae235e07b9
					
				|  | @ -206,7 +206,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, | ||||||
| 
 | 
 | ||||||
|     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, |     Config.write(dep_model_dir, 'config', features=feat_set, seed=seed, | ||||||
|                  labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) |                  labels=Language.ParserTransitionSystem.get_labels(gold_tuples)) | ||||||
|     Config.write(ner_model_dir, 'config', features=feat_set, seed=seed, |     Config.write(ner_model_dir, 'config', features='ner', seed=seed, | ||||||
|                  labels=Language.EntityTransitionSystem.get_labels(gold_tuples)) |                  labels=Language.EntityTransitionSystem.get_labels(gold_tuples)) | ||||||
| 
 | 
 | ||||||
|     nlp = Language() |     nlp = Language() | ||||||
|  | @ -214,6 +214,7 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, | ||||||
|     for itn in range(n_iter): |     for itn in range(n_iter): | ||||||
|         dep_corr = 0 |         dep_corr = 0 | ||||||
|         pos_corr = 0 |         pos_corr = 0 | ||||||
|  |         ent_corr = 0 | ||||||
|         n_tokens = 0 |         n_tokens = 0 | ||||||
|         for raw_text, segmented_text, annot_tuples in gold_tuples: |         for raw_text, segmented_text, annot_tuples in gold_tuples: | ||||||
|             if gold_preproc: |             if gold_preproc: | ||||||
|  | @ -221,14 +222,11 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0, | ||||||
|             else: |             else: | ||||||
|                 sents = [nlp.tokenizer(raw_text)] |                 sents = [nlp.tokenizer(raw_text)] | ||||||
|             for tokens in sents: |             for tokens in sents: | ||||||
| 
 |                 gold = GoldParse(tokens, annot_tuples) | ||||||
|                 gold = GoldParse(tokens, annot_tuples, nlp.tags, |  | ||||||
|                                  nlp.parser.moves.label_ids, |  | ||||||
|                                  nlp.entity.moves.label_ids) |  | ||||||
| 
 |  | ||||||
|                 nlp.tagger(tokens) |                 nlp.tagger(tokens) | ||||||
|  |                 #ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold) | ||||||
|                 dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold) |                 dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold) | ||||||
|                 pos_corr += nlp.tagger.train(tokens, gold.tags_) |                 pos_corr += nlp.tagger.train(tokens, gold.tags) | ||||||
|                 n_tokens += len(tokens) |                 n_tokens += len(tokens) | ||||||
|         acc = float(dep_corr) / n_tokens |         acc = float(dep_corr) / n_tokens | ||||||
|         pos_acc = float(pos_corr) / n_tokens |         pos_acc = float(pos_corr) / n_tokens | ||||||
|  |  | ||||||
|  | @ -27,6 +27,13 @@ cdef enum: | ||||||
|     BREAK |     BREAK | ||||||
|     N_MOVES |     N_MOVES | ||||||
| 
 | 
 | ||||||
|  | MOVE_NAMES = [None] * N_MOVES | ||||||
|  | MOVE_NAMES[SHIFT] = 'S' | ||||||
|  | MOVE_NAMES[REDUCE] = 'D' | ||||||
|  | MOVE_NAMES[LEFT] = 'L' | ||||||
|  | MOVE_NAMES[RIGHT] = 'R' | ||||||
|  | MOVE_NAMES[BREAK] = 'B' | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| cdef do_func_t[N_MOVES] do_funcs | cdef do_func_t[N_MOVES] do_funcs | ||||||
| cdef get_cost_func_t[N_MOVES] get_cost_funcs | cdef get_cost_func_t[N_MOVES] get_cost_funcs | ||||||
|  | @ -46,6 +53,23 @@ cdef class ArcEager(TransitionSystem): | ||||||
|                         move_labels[LEFT][label] = True |                         move_labels[LEFT][label] = True | ||||||
|         return move_labels |         return move_labels | ||||||
| 
 | 
 | ||||||
|  |     cdef int preprocess_gold(self, GoldParse gold) except -1: | ||||||
|  |         for i in range(gold.length): | ||||||
|  |             gold.c_heads[i] = gold.heads[i] | ||||||
|  |             gold.c_labels[i] = self.label_ids[gold.labels[i]] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|  |         if '-' in name: | ||||||
|  |             move_str, label_str = name.split('-', 1) | ||||||
|  |             label = self.label_ids[label_str] | ||||||
|  |         else: | ||||||
|  |             label = 0 | ||||||
|  |         move = MOVE_NAMES.index(move_str) | ||||||
|  |         for i in range(self.n_moves): | ||||||
|  |             if self.c[i].move == move and self.c[i].label == label: | ||||||
|  |                 return self.c[i] | ||||||
|  | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|         # constructor with the function pointers |         # constructor with the function pointers | ||||||
|  |  | ||||||
|  | @ -3,31 +3,21 @@ from cymem.cymem cimport Pool | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
| from .transition_system cimport Transition | from .transition_system cimport Transition | ||||||
| 
 | 
 | ||||||
|  | cimport numpy | ||||||
| 
 | 
 | ||||||
| cdef class GoldParse: | cdef class GoldParse: | ||||||
|     cdef Pool mem |     cdef Pool mem | ||||||
| 
 | 
 | ||||||
|     cdef int length |     cdef int length | ||||||
|     cdef readonly int loss |     cdef readonly int loss | ||||||
|     cdef readonly object ids |     cdef readonly list tags | ||||||
|     cdef readonly object tags |     cdef readonly list heads | ||||||
|     cdef readonly object heads |     cdef readonly list labels | ||||||
|     cdef readonly object labels |     cdef readonly list ner | ||||||
| 
 | 
 | ||||||
|     cdef readonly object tags_ |     cdef int* c_tags | ||||||
|     cdef readonly object labels_ |  | ||||||
|     cdef readonly object ner_ |  | ||||||
| 
 |  | ||||||
|     cdef Transition* ner |  | ||||||
|     cdef int* c_heads |     cdef int* c_heads | ||||||
|     cdef int* c_labels |     cdef int* c_labels | ||||||
|  |     cdef Transition* c_ner | ||||||
| 
 | 
 | ||||||
|     cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 |     cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class NERAnnotation: |  | ||||||
|     cdef Pool mem |  | ||||||
|     cdef int* starts |  | ||||||
|     cdef int* ends |  | ||||||
|     cdef int* labels |  | ||||||
|     cdef readonly list entities |  | ||||||
|  |  | ||||||
|  | @ -34,38 +34,37 @@ def read_docparse_file(loc): | ||||||
|         sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) |         sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) | ||||||
|     return sents |     return sents | ||||||
| 
 | 
 | ||||||
|  | def _parse_line(line): | ||||||
|  |     pieces = line.split() | ||||||
|  |     if len(pieces) == 4: | ||||||
|  |         return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] | ||||||
|  |     else: | ||||||
|  |         id_ = int(pieces[0]) | ||||||
|  |         word = pieces[1] | ||||||
|  |         pos = pieces[3] | ||||||
|  |         iob_ent = pieces[5] | ||||||
|  |         head_idx = int(pieces[6]) | ||||||
|  |         label = pieces[7] | ||||||
|  |         return id_, word, pos, head_idx, label, iob_ent | ||||||
| 
 | 
 | ||||||
| cdef class GoldParse: | cdef class GoldParse: | ||||||
|     def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types): |     def __init__(self, tokens, annot_tuples): | ||||||
|         self.mem = Pool() |         self.mem = Pool() | ||||||
|         self.loss = 0 |         self.loss = 0 | ||||||
|         self.length = len(tokens) |         self.length = len(tokens) | ||||||
|         self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) |  | ||||||
|         self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) |  | ||||||
|         self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) |  | ||||||
|         self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) |  | ||||||
| 
 | 
 | ||||||
|         self.ids[:] = -1 |         # These are filled by the tagger/parser/entity recogniser | ||||||
|         self.tags[:] = -1 |         self.c_tags = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||||
|         self.heads[:] = -1 |  | ||||||
|         self.labels[:] = -1 |  | ||||||
| 
 |  | ||||||
|         self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition)) |  | ||||||
|         self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int)) |         self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||||
|         self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int)) |         self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||||
|  |         self.c_ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition)) | ||||||
| 
 | 
 | ||||||
|         for i in range(len(tokens)): |         self.tags = [None] * len(tokens) | ||||||
|             self.c_heads[i] = -1 |         self.heads = [-1] * len(tokens) | ||||||
|             self.c_labels[i] = -1 |         self.labels = ['MISSING'] * len(tokens) | ||||||
|          |         self.ner = [None] * len(tokens) | ||||||
|         self.tags_ = [None] * len(tokens) |  | ||||||
|         self.labels_ = [None] * len(tokens) |  | ||||||
|         self.ner_ = [None] * len(tokens) |  | ||||||
| 
 | 
 | ||||||
|         idx_map = {token.idx: token.i for token in tokens} |         idx_map = {token.idx: token.i for token in tokens} | ||||||
|         print idx_map |  | ||||||
|         # TODO: Fill NER moves |  | ||||||
|         print raw_text |  | ||||||
|         for idx, tag, head, label, ner in zip(*annot_tuples): |         for idx, tag, head, label, ner in zip(*annot_tuples): | ||||||
|             if idx < tokens[0].idx: |             if idx < tokens[0].idx: | ||||||
|                 pass |                 pass | ||||||
|  | @ -73,16 +72,12 @@ cdef class GoldParse: | ||||||
|                 break |                 break | ||||||
|             elif idx in idx_map: |             elif idx in idx_map: | ||||||
|                 i = idx_map[idx] |                 i = idx_map[idx] | ||||||
|                 print i, idx, head, idx_map.get(head, -1) |                 self.tags[i] = tag | ||||||
|                 self.ids[i] = idx |  | ||||||
|                 self.tags[i] = pos_tags.index(tag) |  | ||||||
|                 self.heads[i] = idx_map.get(head, -1) |                 self.heads[i] = idx_map.get(head, -1) | ||||||
|                 self.labels[i] = dep_labels[label] |                 self.labels[i] = label | ||||||
|                 self.c_heads[i] = -1 |                 self.tags[i] = tag | ||||||
|                 self.c_labels[i] = -1 |                 self.labels[i] = label | ||||||
|                 self.tags_[i] = tag |                 self.ner[i] = ner | ||||||
|                 self.labels_[i] = label |  | ||||||
|                 self.ner_[i] = ner |  | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def n_non_punct(self): |     def n_non_punct(self): | ||||||
|  | @ -116,71 +111,3 @@ def _map_indices_to_tokens(ids, heads): | ||||||
|     return mapped |     return mapped | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _parse_line(line): |  | ||||||
|     pieces = line.split() |  | ||||||
|     if len(pieces) == 4: |  | ||||||
|         return 0, pieces[0], pieces[1], int(pieces[2]) - 1, pieces[3] |  | ||||||
|     else: |  | ||||||
|         id_ = int(pieces[0]) |  | ||||||
|         word = pieces[1] |  | ||||||
|         pos = pieces[3] |  | ||||||
|         iob_ent = pieces[5] |  | ||||||
|         head_idx = int(pieces[6]) |  | ||||||
|         label = pieces[7] |  | ||||||
|         return id_, word, pos, head_idx, label, iob_ent |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class NERAnnotation: |  | ||||||
|     def __init__(self, entities, length, entity_types): |  | ||||||
|         self.mem = Pool() |  | ||||||
|         self.starts = <int*>self.mem.alloc(length, sizeof(int)) |  | ||||||
|         self.ends = <int*>self.mem.alloc(length, sizeof(int)) |  | ||||||
|         self.labels = <int*>self.mem.alloc(length, sizeof(int)) |  | ||||||
|         self.entities = entities |  | ||||||
|         memset(self.starts, -1, sizeof(int) * length) |  | ||||||
|         memset(self.ends, -1, sizeof(int) * length) |  | ||||||
|         memset(self.labels, -1, sizeof(int) * length) |  | ||||||
|          |  | ||||||
|         cdef int start, end, label |  | ||||||
|         for start, end, label in entities: |  | ||||||
|             for i in range(start, end): |  | ||||||
|                 self.starts[i] = start |  | ||||||
|                 self.ends[i] = end |  | ||||||
|                 self.labels[i] = label |  | ||||||
|     @property |  | ||||||
|     def biluo_tags(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     @property |  | ||||||
|     def iob_tags(self): |  | ||||||
|         pass |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def from_iobs(cls, iob_strs, entity_types): |  | ||||||
|         return cls.from_biluos(iob_to_biluo(iob_strs), entity_types) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def from_biluos(cls, tag_strs, entity_types): |  | ||||||
|         entities = [] |  | ||||||
|         start = None |  | ||||||
|         for i, tag_str in enumerate(tag_strs): |  | ||||||
|             if tag_str == 'O' or tag_str == '-': |  | ||||||
|                 continue |  | ||||||
|             move, label_str = tag_str.split('-') |  | ||||||
|             label = entity_types.index(label_str) |  | ||||||
|             if label == -1: |  | ||||||
|                 label = len(entity_types) |  | ||||||
|                 entity_types.append(label) |  | ||||||
|             if move == 'U': |  | ||||||
|                 assert start is None |  | ||||||
|                 entities.append((i, i+1, label)) |  | ||||||
|             elif move == 'B': |  | ||||||
|                 assert start is None |  | ||||||
|                 start = i |  | ||||||
|             elif move == 'L': |  | ||||||
|                 assert start is not None |  | ||||||
|                 entities.append((start, i+1, label)) |  | ||||||
|                 start = None |  | ||||||
|         return cls(entities, len(tag_strs), entity_types) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|  |  | ||||||
|  | @ -21,6 +21,14 @@ cdef enum: | ||||||
|     OUT |     OUT | ||||||
|     N_MOVES |     N_MOVES | ||||||
| 
 | 
 | ||||||
|  | MOVE_NAMES = [None] * N_MOVES | ||||||
|  | MOVE_NAMES[MISSING] = 'M' | ||||||
|  | MOVE_NAMES[BEGIN] = 'B' | ||||||
|  | MOVE_NAMES[IN] = 'I' | ||||||
|  | MOVE_NAMES[LAST] = 'L' | ||||||
|  | MOVE_NAMES[UNIT] = 'U' | ||||||
|  | MOVE_NAMES[OUT] = 'O' | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| cdef do_func_t[N_MOVES] do_funcs | cdef do_func_t[N_MOVES] do_funcs | ||||||
| 
 | 
 | ||||||
|  | @ -70,6 +78,23 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|                     move_labels[moves.index(move_str)][label] = True |                     move_labels[moves.index(move_str)][label] = True | ||||||
|         return move_labels |         return move_labels | ||||||
| 
 | 
 | ||||||
|  |     cdef int preprocess_gold(self, GoldParse gold) except -1: | ||||||
|  |         biluo_strings = iob_to_biluo(gold.ner) | ||||||
|  |         for i in range(gold.length): | ||||||
|  |             gold.c_ner[i] = self.lookup_transition(biluo_strings[i]) | ||||||
|  | 
 | ||||||
|  |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|  |         if '-' in name: | ||||||
|  |             move_str, label_str = name.split('-', 1) | ||||||
|  |             label = self.label_ids[label_str] | ||||||
|  |         else: | ||||||
|  |             move_str = name | ||||||
|  |             label = 0 | ||||||
|  |         move = MOVE_NAMES.index(move_str) | ||||||
|  |         for i in range(self.n_moves): | ||||||
|  |             if self.c[i].move == move and self.c[i].label == label: | ||||||
|  |                 return self.c[i] | ||||||
|  | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|         # constructor with the function pointers |         # constructor with the function pointers | ||||||
|  | @ -101,9 +126,9 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
| cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1: | cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1: | ||||||
|     if not _is_valid(self.move, self.label, s): |     if not _is_valid(self.move, self.label, s): | ||||||
|         return 9000 |         return 9000 | ||||||
|     cdef bint is_sunk = _entity_is_sunk(s, gold.ner) |     cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner) | ||||||
|     cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT |     cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT | ||||||
|     return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label, |     return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label, | ||||||
|                         next_act, is_sunk) |                         next_act, is_sunk) | ||||||
| 
 | 
 | ||||||
| cdef bint _is_gold(int act, int tag, int g_act, int g_tag, | cdef bint _is_gold(int act, int tag, int g_act, int g_tag, | ||||||
|  |  | ||||||
|  | @ -34,6 +34,8 @@ from .conll cimport GoldParse | ||||||
| from . import _parse_features | from . import _parse_features | ||||||
| from ._parse_features cimport fill_context, CONTEXT_SIZE | from ._parse_features cimport fill_context, CONTEXT_SIZE | ||||||
| 
 | 
 | ||||||
|  | from ._ner_features cimport _ner_features | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| DEBUG = False  | DEBUG = False  | ||||||
| def set_debug(val): | def set_debug(val): | ||||||
|  | @ -55,6 +57,8 @@ def get_templates(name): | ||||||
|     pf = _parse_features |     pf = _parse_features | ||||||
|     if name == 'zhang': |     if name == 'zhang': | ||||||
|         return pf.arc_eager |         return pf.arc_eager | ||||||
|  |     elif name == 'ner': | ||||||
|  |         return _ner_features.basic | ||||||
|     else: |     else: | ||||||
|         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ |         return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \ | ||||||
|                 pf.tree_shape + pf.trigrams) |                 pf.tree_shape + pf.trigrams) | ||||||
|  | @ -95,7 +99,8 @@ cdef class GreedyParser: | ||||||
|             Transition best |             Transition best | ||||||
|              |              | ||||||
|             atom_t[CONTEXT_SIZE] context |             atom_t[CONTEXT_SIZE] context | ||||||
|          |         | ||||||
|  |         self.moves.preprocess_gold(gold) | ||||||
|         cdef Pool mem = Pool() |         cdef Pool mem = Pool() | ||||||
|         cdef State* state = init_state(mem, tokens.data, tokens.length) |         cdef State* state = init_state(mem, tokens.data, tokens.length) | ||||||
|         while not is_final(state): |         while not is_final(state): | ||||||
|  |  | ||||||
|  | @ -29,6 +29,10 @@ cdef class TransitionSystem: | ||||||
|     cdef const Transition* c |     cdef const Transition* c | ||||||
|     cdef readonly int n_moves |     cdef readonly int n_moves | ||||||
| 
 | 
 | ||||||
|  |     cdef int preprocess_gold(self, GoldParse gold) except -1 | ||||||
|  |      | ||||||
|  |     cdef Transition lookup_transition(self, object name) except * | ||||||
|  |      | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except * |     cdef Transition init_transition(self, int clas, int move, int label) except * | ||||||
| 
 | 
 | ||||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* state) except * |     cdef Transition best_valid(self, const weight_t* scores, const State* state) except * | ||||||
|  |  | ||||||
|  | @ -28,6 +28,12 @@ cdef class TransitionSystem: | ||||||
|         self.label_ids['MISSING'] = -1 |         self.label_ids['MISSING'] = -1 | ||||||
|         self.c = moves |         self.c = moves | ||||||
| 
 | 
 | ||||||
|  |     cdef int preprocess_gold(self, GoldParse gold) except -1: | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|  |     cdef Transition lookup_transition(self, object name) except *: | ||||||
|  |         raise NotImplementedError | ||||||
|  | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||||
|         raise NotImplementedError |         raise NotImplementedError | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user