* Add words to gold_tuples from gold conll file

This commit is contained in:
Matthew Honnibal 2015-03-24 04:27:20 +01:00
parent 2e12dec76e
commit 3b70b304b2
4 changed files with 17 additions and 10 deletions

View File

@ -44,7 +44,7 @@ cdef class ArcEager(TransitionSystem):
def get_labels(cls, gold_parses): def get_labels(cls, gold_parses):
move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {}, move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
LEFT: {}, BREAK: {'ROOT': True}} LEFT: {}, BREAK: {'ROOT': True}}
for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses: for raw_text, segmented, (ids, words, tags, heads, labels, iob) in gold_parses:
for i, (head, label) in enumerate(zip(heads, labels)): for i, (head, label) in enumerate(zip(heads, labels)):
if label != 'ROOT': if label != 'ROOT':
if head > i: if head > i:
@ -69,6 +69,13 @@ cdef class ArcEager(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label: if self.c[i].move == move and self.c[i].label == label:
return self.c[i] return self.c[i]
def move_name(self, int move, int label):
    """Return the printable name of a transition.

    The label id is resolved through ``self.strings``. When that lookup
    yields a non-empty string, the result is ``MOVE_NAMES[move]`` joined
    to it with ``'-'``; otherwise it is ``MOVE_NAMES[move]`` alone.
    """
    suffix = self.strings[label]
    base = MOVE_NAMES[move]
    if not suffix:
        # Unlabelled move: the bare move name is the whole name.
        return base
    return base + '-' + suffix
cdef Transition init_transition(self, int clas, int move, int label) except *: cdef Transition init_transition(self, int clas, int move, int label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition() # TODO: Apparent Cython bug here when we try to use the Transition()
# constructor with the function pointers # constructor with the function pointers
@ -129,7 +136,7 @@ cdef int _do_right(const Transition* self, State* state) except -1:
cdef int _do_reduce(const Transition* self, State* state) except -1: cdef int _do_reduce(const Transition* self, State* state) except -1:
# TODO: Huh? Is this some weirdness from the non-monotonic? if NON_MONOTONIC and not has_head(get_s0(state)):
add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep) add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
pop_stack(state) pop_stack(state)

View File

@ -13,6 +13,7 @@ cdef class GoldParse:
cdef readonly list tags cdef readonly list tags
cdef readonly list heads cdef readonly list heads
cdef readonly list labels cdef readonly list labels
cdef readonly dict orths
cdef readonly list ner cdef readonly list ner
cdef readonly list ents cdef readonly list ents

View File

@ -30,7 +30,7 @@ def read_docparse_file(loc):
iob_ents.append(iob_ent) iob_ents.append(iob_ent)
tokenized = [s.replace('<SEP>', ' ').split(' ') tokenized = [s.replace('<SEP>', ' ').split(' ')
for s in tok_text.split('<SENT>')] for s in tok_text.split('<SENT>')]
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents))) sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
return sents return sents
def _parse_line(line): def _parse_line(line):
@ -63,12 +63,14 @@ cdef class GoldParse:
self.heads = [-1] * len(tokens) self.heads = [-1] * len(tokens)
self.labels = ['MISSING'] * len(tokens) self.labels = ['MISSING'] * len(tokens)
self.ner = ['O'] * len(tokens) self.ner = ['O'] * len(tokens)
self.orths = {}
idx_map = {token.idx: token.i for token in tokens} idx_map = {token.idx: token.i for token in tokens}
self.ents = [] self.ents = []
ent_start = None ent_start = None
ent_label = None ent_label = None
for idx, tag, head, label, ner in zip(*annot_tuples): for idx, orth, tag, head, label, ner in zip(*annot_tuples):
self.orths[idx] = orth
if idx < tokens[0].idx: if idx < tokens[0].idx:
pass pass
elif idx > tokens[-1].idx: elif idx > tokens[-1].idx:
@ -133,5 +135,3 @@ def _map_indices_to_tokens(ids, heads):
else: else:
mapped.append(ids.index(head)) mapped.append(ids.index(head))
return mapped return mapped

View File

@ -74,7 +74,7 @@ cdef class BiluoPushDown(TransitionSystem):
move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {}, move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
OUT: {'': True}} OUT: {'': True}}
moves = ('M', 'B', 'I', 'L', 'U') moves = ('M', 'B', 'I', 'L', 'U')
for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples: for (raw_text, toks, (ids, words, tags, heads, labels, biluo)) in gold_tuples:
for i, ner_tag in enumerate(biluo): for i, ner_tag in enumerate(biluo):
if ner_tag != 'O' and ner_tag != '-': if ner_tag != 'O' and ner_tag != '-':
move_str, label = ner_tag.split('-') move_str, label = ner_tag.split('-')
@ -87,8 +87,7 @@ cdef class BiluoPushDown(TransitionSystem):
elif move == 'MISSING': elif move == 'MISSING':
return 'M' return 'M'
else: else:
labels = {id_: name for name, id_ in self.label_ids.items()} return MOVE_NAMES[move] + '-' + self.strings[label]
return MOVE_NAMES[move] + '-' + labels[label]
cdef int preprocess_gold(self, GoldParse gold) except -1: cdef int preprocess_gold(self, GoldParse gold) except -1:
for i in range(gold.length): for i in range(gold.length):