mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-11-01 00:17:44 +03:00 
			
		
		
		
	* Tmp
This commit is contained in:
		
							parent
							
								
									220ce8bfed
								
							
						
					
					
						commit
						b3eda03c9c
					
				|  | @ -42,9 +42,17 @@ cdef struct PosTag: | ||||||
|     univ_pos_t pos |     univ_pos_t pos | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | cdef struct Entity: | ||||||
|  |     int start | ||||||
|  |     int end | ||||||
|  |     int tag | ||||||
|  |     int label | ||||||
|  |      | ||||||
|  | 
 | ||||||
| cdef struct TokenC: | cdef struct TokenC: | ||||||
|     const LexemeC* lex |     const LexemeC* lex | ||||||
|     Morphology morph |     Morphology morph | ||||||
|  |     Entity ent | ||||||
|     univ_pos_t pos |     univ_pos_t pos | ||||||
|     int tag |     int tag | ||||||
|     int idx |     int idx | ||||||
|  |  | ||||||
|  | @ -2,15 +2,17 @@ from libc.stdint cimport uint32_t | ||||||
| 
 | 
 | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| 
 | 
 | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC, Entity | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef struct State: | cdef struct State: | ||||||
|     TokenC* sent |     TokenC* sent | ||||||
|     int* stack |     int* stack | ||||||
|  |     Entity* ent | ||||||
|     int i |     int i | ||||||
|     int sent_len |     int sent_len | ||||||
|     int stack_len |     int stack_len | ||||||
|  |     int ents_len | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef int add_dep(const State *s, const int head, const int child, const int label) except -1 | cdef int add_dep(const State *s, const int head, const int child, const int label) except -1 | ||||||
|  |  | ||||||
|  | @ -35,16 +35,16 @@ cdef get_cost_func_t[N_MOVES] get_cost_funcs | ||||||
| cdef class ArcEager(TransitionSystem): | cdef class ArcEager(TransitionSystem): | ||||||
|     @classmethod |     @classmethod | ||||||
|     def get_labels(cls, gold_parses): |     def get_labels(cls, gold_parses): | ||||||
|         labels = {SHIFT: {'ROOT': True}, REDUCE: {'ROOT': True}, RIGHT: {}, |         move_labels = {SHIFT: {'ROOT': True}, REDUCE: {'ROOT': True}, RIGHT: {}, | ||||||
|                   LEFT: {}, BREAK: {'ROOT': True}} |                        LEFT: {}, BREAK: {'ROOT': True}} | ||||||
|         for parse in gold_parses: |         for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses: | ||||||
|             for i, (head, label) in enumerate(zip(parse.heads, parse.labels)): |             for i, (head, label) in enumerate(zip(heads, labels)): | ||||||
|                 if label != 'ROOT': |                 if label != 'ROOT': | ||||||
|                     if head > i: |                     if head > i: | ||||||
|                         labels[RIGHT][label] = True |                         move_labels[RIGHT][label] = True | ||||||
|                     elif head < i: |                     elif head < i: | ||||||
|                         labels[LEFT][label] = True |                         move_labels[LEFT][label] = True | ||||||
|         return labels |         return move_labels | ||||||
| 
 | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, int label) except *: |     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|  |  | ||||||
|  | @ -1,22 +1,33 @@ | ||||||
| from cymem.cymem cimport Pool | from cymem.cymem cimport Pool | ||||||
| 
 | 
 | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
|  | from .transition_system cimport Transition | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| cdef class GoldParse: | cdef class GoldParse: | ||||||
|     cdef Pool mem |     cdef Pool mem | ||||||
| 
 | 
 | ||||||
|  |     cdef int length | ||||||
|  |     cdef readonly int loss | ||||||
|  |     cdef readonly object ids | ||||||
|  |     cdef readonly object tags | ||||||
|  |     cdef readonly object heads | ||||||
|  |     cdef readonly object labels | ||||||
|  | 
 | ||||||
|  |     cdef readonly object tags_ | ||||||
|  |     cdef readonly object labels_ | ||||||
|  |     cdef readonly object ner_ | ||||||
|  | 
 | ||||||
|  |     cdef Transition* ner | ||||||
|     cdef int* c_heads |     cdef int* c_heads | ||||||
|     cdef int* c_labels |     cdef int* c_labels | ||||||
| 
 | 
 | ||||||
|     cdef int length |  | ||||||
|     cdef int loss |  | ||||||
| 
 |  | ||||||
|     cdef readonly unicode raw_text |  | ||||||
|     cdef readonly list words |  | ||||||
|     cdef readonly list ids |  | ||||||
|     cdef readonly list tags |  | ||||||
|     cdef readonly list heads |  | ||||||
|     cdef readonly list labels |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
|     cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 |     cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef class NERAnnotation: | ||||||
|  |     cdef Pool mem | ||||||
|  |     cdef int* starts | ||||||
|  |     cdef int* ends | ||||||
|  |     cdef int* labels | ||||||
|  |     cdef readonly list entities | ||||||
|  |  | ||||||
|  | @ -1,67 +1,24 @@ | ||||||
| cdef class GoldParse: | import numpy | ||||||
|     def __init__(self, raw_text, words, ids, tags, heads, labels): | import codecs | ||||||
|         self.mem = Pool() | from .ner_util import iob_to_biluo | ||||||
|         self.loss = 0 |  | ||||||
|         self.length = len(words) |  | ||||||
|         self.raw_text = raw_text |  | ||||||
|         self.words = words |  | ||||||
|         self.ids = ids |  | ||||||
|         self.tags = tags |  | ||||||
|         self.heads = heads |  | ||||||
|         self.labels = labels |  | ||||||
|         self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int)) |  | ||||||
|         self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int)) |  | ||||||
| 
 | 
 | ||||||
|     @property | from libc.string cimport memset | ||||||
|     def n_non_punct(self): |  | ||||||
|         return len([l for l in self.labels if l != 'P']) |  | ||||||
| 
 | 
 | ||||||
|     @property |  | ||||||
|     def py_heads(self): |  | ||||||
|         return [self.c_heads[i] for i in range(self.length)] |  | ||||||
| 
 | 
 | ||||||
|     cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1: | def read_docparse_file(loc): | ||||||
|         n = 0 |     sents = [] | ||||||
|         for i in range(self.length): |     for sent_str in codecs.open(loc, 'r', 'utf8').read().strip().split('\n\n'): | ||||||
|             if not score_punct and self.labels[i] == 'P': |  | ||||||
|                 continue |  | ||||||
|             n += (i + tokens[i].head) == self.c_heads[i] |  | ||||||
|         return n |  | ||||||
| 
 |  | ||||||
|     def is_correct(self, i, head): |  | ||||||
|         return head == self.c_heads[i] |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def from_conll(cls, unicode sent_str): |  | ||||||
|         ids = [] |  | ||||||
|         words = [] |  | ||||||
|         heads = [] |  | ||||||
|         labels = [] |  | ||||||
|         tags = [] |  | ||||||
|         for i, line in enumerate(sent_str.split('\n')): |  | ||||||
|             id_, word, pos_string, head_idx, label = _parse_line(line) |  | ||||||
|             words.append(word) |  | ||||||
|             if head_idx == -1: |  | ||||||
|                 head_idx = i |  | ||||||
|             ids.append(id_) |  | ||||||
|             heads.append(head_idx) |  | ||||||
|             labels.append(label) |  | ||||||
|             tags.append(pos_string) |  | ||||||
|         text = ' '.join(words) |  | ||||||
|         return cls(text, [words], ids, words, tags, heads, labels) |  | ||||||
| 
 |  | ||||||
|     @classmethod |  | ||||||
|     def from_docparse(cls, unicode sent_str): |  | ||||||
|         words = [] |         words = [] | ||||||
|         heads = [] |         heads = [] | ||||||
|         labels = [] |         labels = [] | ||||||
|         tags = [] |         tags = [] | ||||||
|         ids = [] |         ids = [] | ||||||
|  |         iob_ents = [] | ||||||
|         lines = sent_str.strip().split('\n') |         lines = sent_str.strip().split('\n') | ||||||
|         raw_text = lines.pop(0).strip() |         raw_text = lines.pop(0).strip() | ||||||
|         tok_text = lines.pop(0).strip() |         tok_text = lines.pop(0).strip() | ||||||
|         for i, line in enumerate(lines): |         for i, line in enumerate(lines): | ||||||
|             id_, word, pos_string, head_idx, label = _parse_line(line) |             id_, word, pos_string, head_idx, label, iob_ent = _parse_line(line) | ||||||
|             if label == 'root': |             if label == 'root': | ||||||
|                 label = 'ROOT' |                 label = 'ROOT' | ||||||
|             words.append(word) |             words.append(word) | ||||||
|  | @ -71,57 +28,78 @@ cdef class GoldParse: | ||||||
|             heads.append(head_idx) |             heads.append(head_idx) | ||||||
|             labels.append(label) |             labels.append(label) | ||||||
|             tags.append(pos_string) |             tags.append(pos_string) | ||||||
|         tokenized = [sent_str.replace('<SEP>', ' ').split(' ') |             iob_ents.append(iob_ent) | ||||||
|                      for sent_str in tok_text.split('<SENT>')] |         tokenized = [s.replace('<SEP>', ' ').split(' ') | ||||||
|         return cls(raw_text, words, ids, tags, heads, labels) |                      for s in tok_text.split('<SENT>')] | ||||||
|  |         sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) | ||||||
|  |     return sents | ||||||
| 
 | 
 | ||||||
|     def align_to_tokens(self, tokens, label_ids): | 
 | ||||||
|         orig_words = list(self.words) | cdef class GoldParse: | ||||||
|         annot = zip(self.ids, self.tags, self.heads, self.labels) |     def __init__(self, tokens, annot_tuples, pos_tags, dep_labels, entity_types): | ||||||
|         self.ids = [] |         self.mem = Pool() | ||||||
|         self.tags = [] |         self.loss = 0 | ||||||
|         self.heads = [] |  | ||||||
|         self.labels = [] |  | ||||||
|         missed = [] |  | ||||||
|         for token in tokens: |  | ||||||
|             while annot and token.idx > annot[0][0]: |  | ||||||
|                 miss_id, miss_tag, miss_head, miss_label = annot.pop(0) |  | ||||||
|                 if not is_punct_label(miss_label): |  | ||||||
|                     self.loss += 1 |  | ||||||
|             if not annot: |  | ||||||
|                 self.tags.append(None) |  | ||||||
|                 self.heads.append(None) |  | ||||||
|                 self.labels.append(None) |  | ||||||
|                 continue |  | ||||||
|             id_, tag, head, label = annot[0] |  | ||||||
|             if token.idx == id_: |  | ||||||
|                 self.tags.append(tag) |  | ||||||
|                 self.heads.append(head) |  | ||||||
|                 self.labels.append(label) |  | ||||||
|                 annot.pop(0) |  | ||||||
|             elif token.idx < id_: |  | ||||||
|                 self.tags.append(None) |  | ||||||
|                 self.heads.append(None) |  | ||||||
|                 self.labels.append(None) |  | ||||||
|             else: |  | ||||||
|                 raise StandardError |  | ||||||
|         self.length = len(tokens) |         self.length = len(tokens) | ||||||
|         self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int)) |         self.ids = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||||
|         self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int)) |         self.tags = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||||
|         self.ids = [token.idx for token in tokens] |         self.heads = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||||
|         self.map_heads(label_ids) |         self.labels = numpy.empty(shape=(len(tokens), 1), dtype=numpy.int32) | ||||||
|         return self.loss |  | ||||||
| 
 | 
 | ||||||
|     def map_heads(self, label_ids): |         self.ids[:] = -1 | ||||||
|         mapped_heads = _map_indices_to_tokens(self.ids, self.heads) |         self.tags[:] = -1 | ||||||
|         for i in range(self.length): |         self.heads[:] = -1 | ||||||
|             if mapped_heads[i] is None: |         self.labels[:] = -1 | ||||||
|  | 
 | ||||||
|  |         self.ner = <Transition*>self.mem.alloc(len(tokens), sizeof(Transition)) | ||||||
|  |         self.c_heads = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||||
|  |         self.c_labels = <int*>self.mem.alloc(len(tokens), sizeof(int)) | ||||||
|  | 
 | ||||||
|  |         for i in range(len(tokens)): | ||||||
|  |             self.c_heads[i] = -1 | ||||||
|  |             self.c_labels[i] = -1 | ||||||
|  |          | ||||||
|  |         self.tags_ = [None] * len(tokens) | ||||||
|  |         self.labels_ = [None] * len(tokens) | ||||||
|  |         self.ner_ = [None] * len(tokens) | ||||||
|  | 
 | ||||||
|  |         idx_map = {token.idx: token.i for token in tokens} | ||||||
|  |         print idx_map | ||||||
|  |         # TODO: Fill NER moves | ||||||
|  |         print raw_text | ||||||
|  |         for idx, tag, head, label, ner in zip(*annot_tuples): | ||||||
|  |             if idx < tokens[0].idx: | ||||||
|  |                 pass | ||||||
|  |             elif idx > tokens[-1].idx: | ||||||
|  |                 break | ||||||
|  |             elif idx in idx_map: | ||||||
|  |                 i = idx_map[idx] | ||||||
|  |                 print i, idx, head, idx_map.get(head, -1) | ||||||
|  |                 self.ids[i] = idx | ||||||
|  |                 self.tags[i] = pos_tags.index(tag) | ||||||
|  |                 self.heads[i] = idx_map.get(head, -1) | ||||||
|  |                 self.labels[i] = dep_labels[label] | ||||||
|                 self.c_heads[i] = -1 |                 self.c_heads[i] = -1 | ||||||
|                 self.c_labels[i] = -1 |                 self.c_labels[i] = -1 | ||||||
|             else: |                 self.tags_[i] = tag | ||||||
|                 self.c_heads[i] = mapped_heads[i] |                 self.labels_[i] = label | ||||||
|                 self.c_labels[i] = label_ids[self.labels[i]] |                 self.ner_[i] = ner | ||||||
|         return self.loss | 
 | ||||||
|  |     @property | ||||||
|  |     def n_non_punct(self): | ||||||
|  |         return len([l for l in self.labels if l != 'P']) | ||||||
|  | 
 | ||||||
|  |     cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1: | ||||||
|  |         n = 0 | ||||||
|  |         for i in range(self.length): | ||||||
|  |             if not score_punct and self.labels_[i] == 'P': | ||||||
|  |                 continue | ||||||
|  |             if self.heads[i] == -1: | ||||||
|  |                 continue | ||||||
|  |             n += (i + tokens[i].head) == self.heads[i] | ||||||
|  |         return n | ||||||
|  | 
 | ||||||
|  |     def is_correct(self, i, head): | ||||||
|  |         return head == self.c_heads[i] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def is_punct_label(label): | def is_punct_label(label): | ||||||
|  | @ -146,6 +124,63 @@ def _parse_line(line): | ||||||
|         id_ = int(pieces[0]) |         id_ = int(pieces[0]) | ||||||
|         word = pieces[1] |         word = pieces[1] | ||||||
|         pos = pieces[3] |         pos = pieces[3] | ||||||
|  |         iob_ent = pieces[5] | ||||||
|         head_idx = int(pieces[6]) |         head_idx = int(pieces[6]) | ||||||
|         label = pieces[7] |         label = pieces[7] | ||||||
|         return id_, word, pos, head_idx, label |         return id_, word, pos, head_idx, label, iob_ent | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef class NERAnnotation: | ||||||
|  |     def __init__(self, entities, length, entity_types): | ||||||
|  |         self.mem = Pool() | ||||||
|  |         self.starts = <int*>self.mem.alloc(length, sizeof(int)) | ||||||
|  |         self.ends = <int*>self.mem.alloc(length, sizeof(int)) | ||||||
|  |         self.labels = <int*>self.mem.alloc(length, sizeof(int)) | ||||||
|  |         self.entities = entities | ||||||
|  |         memset(self.starts, -1, sizeof(int) * length) | ||||||
|  |         memset(self.ends, -1, sizeof(int) * length) | ||||||
|  |         memset(self.labels, -1, sizeof(int) * length) | ||||||
|  |          | ||||||
|  |         cdef int start, end, label | ||||||
|  |         for start, end, label in entities: | ||||||
|  |             for i in range(start, end): | ||||||
|  |                 self.starts[i] = start | ||||||
|  |                 self.ends[i] = end | ||||||
|  |                 self.labels[i] = label | ||||||
|  |     @property | ||||||
|  |     def biluo_tags(self): | ||||||
|  |         pass | ||||||
|  | 
 | ||||||
|  |     @property | ||||||
|  |     def iob_tags(self): | ||||||
|  |         pass | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def from_iobs(cls, iob_strs, entity_types): | ||||||
|  |         return cls.from_biluos(iob_to_biluo(iob_strs), entity_types) | ||||||
|  | 
 | ||||||
|  |     @classmethod | ||||||
|  |     def from_biluos(cls, tag_strs, entity_types): | ||||||
|  |         entities = [] | ||||||
|  |         start = None | ||||||
|  |         for i, tag_str in enumerate(tag_strs): | ||||||
|  |             if tag_str == 'O' or tag_str == '-': | ||||||
|  |                 continue | ||||||
|  |             move, label_str = tag_str.split('-') | ||||||
|  |             label = entity_types.index(label_str) | ||||||
|  |             if label == -1: | ||||||
|  |                 label = len(entity_types) | ||||||
|  |                 entity_types.append(label) | ||||||
|  |             if move == 'U': | ||||||
|  |                 assert start is None | ||||||
|  |                 entities.append((i, i+1, label)) | ||||||
|  |             elif move == 'B': | ||||||
|  |                 assert start is None | ||||||
|  |                 start = i | ||||||
|  |             elif move == 'L': | ||||||
|  |                 assert start is not None | ||||||
|  |                 entities.append((start, i+1, label)) | ||||||
|  |                 start = None | ||||||
|  |         return cls(entities, len(tag_strs), entity_types) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |  | ||||||
|  | @ -1,28 +1,7 @@ | ||||||
| from cymem.cymem cimport Pool | from .transition_system cimport TransitionSystem | ||||||
| 
 | from .transition_system cimport Transition | ||||||
| from thinc.typedefs cimport weight_t | from ._state cimport State | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| from ._state cimport State  | cdef class BiluoPushDown(TransitionSystem): | ||||||
| 
 |     pass | ||||||
| 
 |  | ||||||
| cdef struct Transition: |  | ||||||
|     int clas |  | ||||||
|     int move |  | ||||||
|     int label |  | ||||||
|     int cost |  | ||||||
|     weight_t score |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| cdef class TransitionSystem: |  | ||||||
|     cdef Pool mem |  | ||||||
|     cdef readonly int n_moves |  | ||||||
|     cdef dict label_ids |  | ||||||
| 
 |  | ||||||
|     cdef const Transition* _moves |  | ||||||
| 
 |  | ||||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* s) except * |  | ||||||
|     cdef Transition best_gold(self, Transition* guess, const weight_t* scores, |  | ||||||
|                               const State* s, |  | ||||||
|                               const int* gold_heads, const int* gold_labels) except * |  | ||||||
|     cdef int transition(self, State *s, const Transition* t) except -1 |  | ||||||
|  |  | ||||||
|  | @ -1,16 +1,15 @@ | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| from ._state cimport State | from ._state cimport State | ||||||
| from ._state cimport has_head, get_idx, get_s0, get_n0 |  | ||||||
| from ._state cimport is_final, at_eol, pop_stack, push_stack, add_dep |  | ||||||
| from ._state cimport head_in_buffer, children_in_buffer |  | ||||||
| from ._state cimport head_in_stack, children_in_stack |  | ||||||
| 
 | 
 | ||||||
| from ..structs cimport TokenC | from .transition_system cimport Transition | ||||||
|  | from .transition_system cimport do_func_t | ||||||
| 
 | 
 | ||||||
|  | from ..structs cimport TokenC, Entity | ||||||
| 
 | 
 | ||||||
| DEF NON_MONOTONIC = True | from thinc.typedefs cimport weight_t | ||||||
| DEF USE_BREAK = True | from .conll cimport GoldParse | ||||||
|  | from .ner_util import iob_to_biluo | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef enum: | cdef enum: | ||||||
|  | @ -23,13 +22,34 @@ cdef enum: | ||||||
|     N_MOVES |     N_MOVES | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef int is_valid(ActionType act, int label, State* s) except -1: | cdef do_func_t[N_MOVES] do_funcs | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef bint entity_is_open(const State *s) except -1: | ||||||
|  |     return s.sent[s.i - 1].ent.tag >= 1 | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1: | ||||||
|  |     if not entity_is_open(s): | ||||||
|  |         return False | ||||||
|  | 
 | ||||||
|  |     cdef const Entity* curr = &s.sent[s.i - 1].ent | ||||||
|  |     cdef const Transition* gold = &golds[(s.i - 1) + curr.start] | ||||||
|  |     if gold.move != BEGIN and gold.move != UNIT: | ||||||
|  |         return True | ||||||
|  |     elif gold.label != s.ent.label: | ||||||
|  |         return True | ||||||
|  |     else: | ||||||
|  |         return False | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef int _is_valid(int act, int label, const State* s) except -1: | ||||||
|     if act == BEGIN: |     if act == BEGIN: | ||||||
|         return not entity_is_open(s) |         return not entity_is_open(s) | ||||||
|     elif act == IN: |     elif act == IN: | ||||||
|         return entity_is_open(s) and s.curr.label == label |         return entity_is_open(s) and s.ent.label == label | ||||||
|     elif act == LAST: |     elif act == LAST: | ||||||
|         return entity_is_open(s) and s.curr.label == label |         return entity_is_open(s) and s.ent.label == label | ||||||
|     elif act == UNIT: |     elif act == UNIT: | ||||||
|         return not entity_is_open(s) |         return not entity_is_open(s) | ||||||
|     elif act == OUT: |     elif act == OUT: | ||||||
|  | @ -38,8 +58,56 @@ cdef int is_valid(ActionType act, int label, State* s) except -1: | ||||||
|         raise UnknownMove(act, label) |         raise UnknownMove(act, label) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag, | cdef class BiluoPushDown(TransitionSystem): | ||||||
|                  ActionType next_act, bint is_sunk): |     @classmethod | ||||||
|  |     def get_labels(cls, gold_tuples): | ||||||
|  |         move_labels = {BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, OUT: {'ROOT': True}} | ||||||
|  |         moves = ('-', 'B', 'I', 'L', 'U') | ||||||
|  |         for (raw_text, toks, (ids, tags, heads, labels, iob)) in gold_tuples: | ||||||
|  |             for i, ner_tag in enumerate(iob_to_biluo(iob)): | ||||||
|  |                 if ner_tag != 'O' and ner_tag != '-': | ||||||
|  |                     move_str, label = ner_tag.split('-') | ||||||
|  |                     move_labels[moves.index(move_str)][label] = True | ||||||
|  |         return move_labels | ||||||
|  | 
 | ||||||
|  |     cdef Transition init_transition(self, int clas, int move, int label) except *: | ||||||
|  |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|  |         # constructor with the function pointers | ||||||
|  |         cdef Transition t | ||||||
|  |         t.score = 0 | ||||||
|  |         t.clas = clas | ||||||
|  |         t.move = move | ||||||
|  |         t.label = label | ||||||
|  |         t.do = do_funcs[move] | ||||||
|  |         t.get_cost = _get_cost | ||||||
|  |         return t | ||||||
|  | 
 | ||||||
|  |     cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: | ||||||
|  |         cdef int best = -1 | ||||||
|  |         cdef weight_t score = -90000 | ||||||
|  |         cdef const Transition* m | ||||||
|  |         cdef int i | ||||||
|  |         for i in range(self.n_moves): | ||||||
|  |             m = &self.c[i] | ||||||
|  |             if _is_valid(m.move, m.label, s) and scores[i] > score: | ||||||
|  |                 best = i | ||||||
|  |                 score = scores[i] | ||||||
|  |         assert best >= 0 | ||||||
|  |         cdef Transition t = self.c[best] | ||||||
|  |         t.score = score | ||||||
|  |         return t | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) except -1: | ||||||
|  |     if not _is_valid(self.move, self.label, s): | ||||||
|  |         return 9000 | ||||||
|  |     cdef bint is_sunk = _entity_is_sunk(s, gold.ner) | ||||||
|  |     cdef int next_act = gold.ner[s.i+1].move if s.i < s.sent_len else OUT | ||||||
|  |     return not _is_gold(self.move, self.label, gold.ner[s.i].move, gold.ner[s.i].label, | ||||||
|  |                         next_act, is_sunk) | ||||||
|  | 
 | ||||||
|  | cdef bint _is_gold(int act, int tag, int g_act, int g_tag, | ||||||
|  |                    int next_act, bint is_sunk): | ||||||
|     if g_act == MISSING: |     if g_act == MISSING: | ||||||
|         return True |         return True | ||||||
|     if act == BEGIN: |     if act == BEGIN: | ||||||
|  | @ -112,98 +180,46 @@ cdef bint is_gold(ActionType act, int tag, ActionType g_act, int g_tag, | ||||||
|             return False |             return False | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef bint entity_is_open(State *s) except -1: | cdef int _do_begin(const Transition* self, State* s) except -1: | ||||||
|     return s.sent[s.i - 1].ent.tag >= 1 |     s.ent += 1 | ||||||
|  |     s.ents_len += 1 | ||||||
|  |     s.ent.start = s.i | ||||||
|  |     s.ent.label = self.label | ||||||
|  |     s.sent[s.i].ent.tag = self.clas  | ||||||
|  |     s.i += 1 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef bint entity_is_sunk(State *s, Move* golds) except -1: | cdef int _do_in(const Transition* self, State* s) except -1: | ||||||
|     if not entity_is_open(s): |     s.sent[s.i].ent.tag = self.clas  | ||||||
|         return False |     s.i += 1 | ||||||
| 
 |  | ||||||
|     cdef const Entity* curr = &s.sent[s.i - 1].ent |  | ||||||
|     cdef Move* gold = &golds[(s.i - 1) + curr.start] |  | ||||||
|     if gold.action != BEGIN and gold.action != UNIT: |  | ||||||
|         return True |  | ||||||
|     elif gold.label != s.curr.label: |  | ||||||
|         return True |  | ||||||
|     else: |  | ||||||
|         return False |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class TransitionSystem: | cdef int _do_last(const Transition* self, State* s) except -1: | ||||||
|     def __init__(self, list entity_type_strs): |     s.ent.end = s.i+1 | ||||||
|         self.mem = Pool() |     s.sent[s.i].ent.tag = self.clas  | ||||||
|  |     s.i += 1 | ||||||
| 
 | 
 | ||||||
|         cdef Move* m |  | ||||||
|         label_names = {'-': 0} |  | ||||||
|         for i, tag_name in enumerate(tag_names): |  | ||||||
|             m = &moves[i] |  | ||||||
|             if '-' in tag_name: |  | ||||||
|                 action_str, label = tag_name.split('-') |  | ||||||
|             elif tag_name == 'O': |  | ||||||
|                 action_str = 'O' |  | ||||||
|                 label = '-' |  | ||||||
|             elif tag_name == 'NULL' or tag_name == 'EOL': |  | ||||||
|                 action_str = '?' |  | ||||||
|                 label = '-' |  | ||||||
|             else: |  | ||||||
|                 raise StandardError(tag_name) |  | ||||||
|             m.action = ACTION_NAMES.index(action_str) |  | ||||||
|             m.label = label_names.setdefault(label, len(label_names)) |  | ||||||
|             m.clas = i |  | ||||||
| 
 | 
 | ||||||
|     cdef int transition(self, State *s, Move* move) except -1: | cdef int _do_unit(const Transition* self, State* s) except -1: | ||||||
|         if move.action == BEGIN: |     s.ent += 1 | ||||||
|             s.curr.start = s.i |     s.ents_len += 1 | ||||||
|             s.curr.label = label |     s.ent.start = s.i | ||||||
|         elif move.action == IN: |     s.ent.label = self.label | ||||||
|             pass |     s.ent.end = s.i+1 | ||||||
|         elif move.action == LAST: |     s.sent[s.i].ent.tag = self.clas  | ||||||
|             s.curr.end = s.i |     s.i += 1 | ||||||
|             s.ents[s.j] = s.curr |  | ||||||
|             s.j += 1 |  | ||||||
|             s.curr.start = 0 |  | ||||||
|             s.curr.label = -1 |  | ||||||
|             s.curr.end = 0 |  | ||||||
|         elif move.action == UNIT: |  | ||||||
|             begin_entity(s, move.label) |  | ||||||
|             end_entity(s) |  | ||||||
|         elif move.action == OUT: |  | ||||||
|             pass |  | ||||||
|         s.tags[s.i] = move.clas  |  | ||||||
|         s.i += 1 |  | ||||||
| 
 | 
 | ||||||
|     cdef Transition best_valid(self, const weight_t* scores, const State* s) except *: |  | ||||||
|         cdef int best = -1 |  | ||||||
|         cdef weight_t score = -90000 |  | ||||||
|         cdef const Transition* m |  | ||||||
|         cdef int i |  | ||||||
|         for i in range(self.n_moves): |  | ||||||
|             m = &self._moves[i] |  | ||||||
|             if _is_valid(s, m.ent_move, m.ent_label) and scores[i] > score: |  | ||||||
|                 best = i |  | ||||||
|                 score = scores[i] |  | ||||||
|         assert best >= 0 |  | ||||||
|         cdef Transition t = self._moves[best] |  | ||||||
|         t.score = score |  | ||||||
|         return t |  | ||||||
| 
 | 
 | ||||||
|     cdef Transition best_gold(self, Transition* guess, const weight_t* scores, | cdef int _do_out(const Transition* self, State* s) except -1: | ||||||
|                               const State* s, Move* golds) except *: |     s.sent[s.i].ent.tag = self.clas  | ||||||
|         cdef Move* g = &golds[s.i] |     s.i += 1 | ||||||
|         cdef ActionType next_act = <ActionType>golds[s.i+1].action if s.i < s.length else OUT | 
 | ||||||
|         cdef bint is_sunk = entity_is_sunk(s, golds) | 
 | ||||||
|         cdef Move* m | do_funcs[BEGIN] = _do_begin | ||||||
|         cdef int n_accept = 0 | do_funcs[IN] = _do_in | ||||||
|         for i in range(1, self.n_classes): | do_funcs[LAST] = _do_last | ||||||
|             m = &moves[i] | do_funcs[UNIT] = _do_unit | ||||||
|             if _is_valid(s, m.move, m.label) and \ | do_funcs[OUT] = _do_out | ||||||
|                _is_gold(s, m.move, m.label, next_act, is_sunk) and \ |  | ||||||
|                scores[i] > score: |  | ||||||
|                 best = i |  | ||||||
|                 score = scores[i] |  | ||||||
|         assert best >= 0 |  | ||||||
|         return self._moves[best] |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class OracleError(Exception): | class OracleError(Exception): | ||||||
|  | @ -212,3 +228,5 @@ class OracleError(Exception): | ||||||
| 
 | 
 | ||||||
| class UnknownMove(Exception): | class UnknownMove(Exception): | ||||||
|     pass |     pass | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  |  | ||||||
|  | @ -35,3 +35,10 @@ cdef class TransitionSystem: | ||||||
| 
 | 
 | ||||||
|     cdef Transition best_gold(self, const weight_t* scores, const State* state, |     cdef Transition best_gold(self, const weight_t* scores, const State* state, | ||||||
|                               GoldParse gold) except * |                               GoldParse gold) except * | ||||||
|  |      | ||||||
|  | 
 | ||||||
|  | #cdef class PyState: | ||||||
|  | #    """Provide a Python class for testing purposes.""" | ||||||
|  | #    cdef Pool mem | ||||||
|  | #    cdef TransitionSystem system | ||||||
|  | #    cdef State* _state | ||||||
|  |  | ||||||
|  | @ -45,3 +45,39 @@ cdef class TransitionSystem: | ||||||
|                 score = scores[i] |                 score = scores[i] | ||||||
|         assert score > MIN_SCORE |         assert score > MIN_SCORE | ||||||
|         return best |         return best | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | #cdef class PyState: | ||||||
|  | #    """Provide a Python class for testing purposes.""" | ||||||
|  | #    def __init__(self, GoldParse gold): | ||||||
|  | #        self.mem = Pool() | ||||||
|  | #        self.system = EntityRecognition(labels) | ||||||
|  | #        self._state = init_state(self.mem, tokens, gold.length) | ||||||
|  | # | ||||||
|  | #    def transition(self, name): | ||||||
|  | #        cdef const Transition* trans = self._transition_by_name(name) | ||||||
|  | #        trans.do(trans, self._state) | ||||||
|  | # | ||||||
|  | #    def is_valid(self, name): | ||||||
|  | #        cdef const Transition* trans = self._transition_by_name(name) | ||||||
|  | #        return _is_valid(trans.move, trans.label, self._state) | ||||||
|  | # | ||||||
|  | #    def is_gold(self, name): | ||||||
|  | #        cdef const Transition* trans = self._transition_by_name(name) | ||||||
|  | #        return _get_const(trans, self._state, self._gold) | ||||||
|  | # | ||||||
|  | #    property ent: | ||||||
|  | #        def __get__(self): | ||||||
|  | #            pass | ||||||
|  | # | ||||||
|  | #    property n_ents: | ||||||
|  | #        def __get__(self): | ||||||
|  | #            pass | ||||||
|  | # | ||||||
|  | #    property i: | ||||||
|  | #        def __get__(self): | ||||||
|  | #            pass | ||||||
|  | # | ||||||
|  | #    property open_entity: | ||||||
|  | #        def __get__(self): | ||||||
|  | #            return entity_is_open(self._s) | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user