mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-25 17:36:30 +03:00
* Add words to gold_tuples from gold conll file
This commit is contained in:
parent
2e12dec76e
commit
3b70b304b2
|
@ -44,7 +44,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
def get_labels(cls, gold_parses):
|
||||
move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
|
||||
LEFT: {}, BREAK: {'ROOT': True}}
|
||||
for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses:
|
||||
for raw_text, segmented, (ids, words, tags, heads, labels, iob) in gold_parses:
|
||||
for i, (head, label) in enumerate(zip(heads, labels)):
|
||||
if label != 'ROOT':
|
||||
if head > i:
|
||||
|
@ -69,6 +69,13 @@ cdef class ArcEager(TransitionSystem):
|
|||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
|
||||
def move_name(self, int move, int label):
|
||||
label_str = self.strings[label]
|
||||
if label_str:
|
||||
return MOVE_NAMES[move] + '-' + label_str
|
||||
else:
|
||||
return MOVE_NAMES[move]
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
# constructor with the function pointers
|
||||
|
@ -129,7 +136,7 @@ cdef int _do_right(const Transition* self, State* state) except -1:
|
|||
|
||||
|
||||
cdef int _do_reduce(const Transition* self, State* state) except -1:
|
||||
# TODO: Huh? Is this some weirdness from the non-monotonic?
|
||||
if NON_MONOTONIC and not has_head(get_s0(state)):
|
||||
add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
|
||||
pop_stack(state)
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ cdef class GoldParse:
|
|||
cdef readonly list tags
|
||||
cdef readonly list heads
|
||||
cdef readonly list labels
|
||||
cdef readonly dict orths
|
||||
cdef readonly list ner
|
||||
cdef readonly list ents
|
||||
|
||||
|
|
|
@ -30,7 +30,7 @@ def read_docparse_file(loc):
|
|||
iob_ents.append(iob_ent)
|
||||
tokenized = [s.replace('<SEP>', ' ').split(' ')
|
||||
for s in tok_text.split('<SENT>')]
|
||||
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
|
||||
sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
|
||||
return sents
|
||||
|
||||
def _parse_line(line):
|
||||
|
@ -63,12 +63,14 @@ cdef class GoldParse:
|
|||
self.heads = [-1] * len(tokens)
|
||||
self.labels = ['MISSING'] * len(tokens)
|
||||
self.ner = ['O'] * len(tokens)
|
||||
self.orths = {}
|
||||
|
||||
idx_map = {token.idx: token.i for token in tokens}
|
||||
self.ents = []
|
||||
ent_start = None
|
||||
ent_label = None
|
||||
for idx, tag, head, label, ner in zip(*annot_tuples):
|
||||
for idx, orth, tag, head, label, ner in zip(*annot_tuples):
|
||||
self.orths[idx] = orth
|
||||
if idx < tokens[0].idx:
|
||||
pass
|
||||
elif idx > tokens[-1].idx:
|
||||
|
@ -133,5 +135,3 @@ def _map_indices_to_tokens(ids, heads):
|
|||
else:
|
||||
mapped.append(ids.index(head))
|
||||
return mapped
|
||||
|
||||
|
||||
|
|
|
@ -74,7 +74,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
|
||||
OUT: {'': True}}
|
||||
moves = ('M', 'B', 'I', 'L', 'U')
|
||||
for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
|
||||
for (raw_text, toks, (ids, words, tags, heads, labels, biluo)) in gold_tuples:
|
||||
for i, ner_tag in enumerate(biluo):
|
||||
if ner_tag != 'O' and ner_tag != '-':
|
||||
move_str, label = ner_tag.split('-')
|
||||
|
@ -87,8 +87,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
elif move == 'MISSING':
|
||||
return 'M'
|
||||
else:
|
||||
labels = {id_: name for name, id_ in self.label_ids.items()}
|
||||
return MOVE_NAMES[move] + '-' + labels[label]
|
||||
return MOVE_NAMES[move] + '-' + self.strings[label]
|
||||
|
||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||
for i in range(gold.length):
|
||||
|
|
Loading…
Reference in New Issue
Block a user