mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 17:24:41 +03:00
* Add words to gold_tuples from gold conll file
This commit is contained in:
parent
2e12dec76e
commit
3b70b304b2
|
@ -44,7 +44,7 @@ cdef class ArcEager(TransitionSystem):
|
||||||
def get_labels(cls, gold_parses):
|
def get_labels(cls, gold_parses):
|
||||||
move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
|
move_labels = {SHIFT: {'': True}, REDUCE: {'': True}, RIGHT: {},
|
||||||
LEFT: {}, BREAK: {'ROOT': True}}
|
LEFT: {}, BREAK: {'ROOT': True}}
|
||||||
for raw_text, segmented, (ids, tags, heads, labels, iob) in gold_parses:
|
for raw_text, segmented, (ids, words, tags, heads, labels, iob) in gold_parses:
|
||||||
for i, (head, label) in enumerate(zip(heads, labels)):
|
for i, (head, label) in enumerate(zip(heads, labels)):
|
||||||
if label != 'ROOT':
|
if label != 'ROOT':
|
||||||
if head > i:
|
if head > i:
|
||||||
|
@ -69,6 +69,13 @@ cdef class ArcEager(TransitionSystem):
|
||||||
if self.c[i].move == move and self.c[i].label == label:
|
if self.c[i].move == move and self.c[i].label == label:
|
||||||
return self.c[i]
|
return self.c[i]
|
||||||
|
|
||||||
|
def move_name(self, int move, int label):
|
||||||
|
label_str = self.strings[label]
|
||||||
|
if label_str:
|
||||||
|
return MOVE_NAMES[move] + '-' + label_str
|
||||||
|
else:
|
||||||
|
return MOVE_NAMES[move]
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
cdef Transition init_transition(self, int clas, int move, int label) except *:
|
||||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||||
# constructor with the function pointers
|
# constructor with the function pointers
|
||||||
|
@ -129,7 +136,7 @@ cdef int _do_right(const Transition* self, State* state) except -1:
|
||||||
|
|
||||||
|
|
||||||
cdef int _do_reduce(const Transition* self, State* state) except -1:
|
cdef int _do_reduce(const Transition* self, State* state) except -1:
|
||||||
# TODO: Huh? Is this some weirdness from the non-monotonic?
|
if NON_MONOTONIC and not has_head(get_s0(state)):
|
||||||
add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
|
add_dep(state, state.stack[-1], state.stack[0], get_s0(state).dep)
|
||||||
pop_stack(state)
|
pop_stack(state)
|
||||||
|
|
||||||
|
|
|
@ -13,6 +13,7 @@ cdef class GoldParse:
|
||||||
cdef readonly list tags
|
cdef readonly list tags
|
||||||
cdef readonly list heads
|
cdef readonly list heads
|
||||||
cdef readonly list labels
|
cdef readonly list labels
|
||||||
|
cdef readonly dict orths
|
||||||
cdef readonly list ner
|
cdef readonly list ner
|
||||||
cdef readonly list ents
|
cdef readonly list ents
|
||||||
|
|
||||||
|
|
|
@ -30,7 +30,7 @@ def read_docparse_file(loc):
|
||||||
iob_ents.append(iob_ent)
|
iob_ents.append(iob_ent)
|
||||||
tokenized = [s.replace('<SEP>', ' ').split(' ')
|
tokenized = [s.replace('<SEP>', ' ').split(' ')
|
||||||
for s in tok_text.split('<SENT>')]
|
for s in tok_text.split('<SENT>')]
|
||||||
sents.append((raw_text, tokenized, (ids, tags, heads, labels, iob_ents)))
|
sents.append((raw_text, tokenized, (ids, words, tags, heads, labels, iob_ents)))
|
||||||
return sents
|
return sents
|
||||||
|
|
||||||
def _parse_line(line):
|
def _parse_line(line):
|
||||||
|
@ -63,12 +63,14 @@ cdef class GoldParse:
|
||||||
self.heads = [-1] * len(tokens)
|
self.heads = [-1] * len(tokens)
|
||||||
self.labels = ['MISSING'] * len(tokens)
|
self.labels = ['MISSING'] * len(tokens)
|
||||||
self.ner = ['O'] * len(tokens)
|
self.ner = ['O'] * len(tokens)
|
||||||
|
self.orths = {}
|
||||||
|
|
||||||
idx_map = {token.idx: token.i for token in tokens}
|
idx_map = {token.idx: token.i for token in tokens}
|
||||||
self.ents = []
|
self.ents = []
|
||||||
ent_start = None
|
ent_start = None
|
||||||
ent_label = None
|
ent_label = None
|
||||||
for idx, tag, head, label, ner in zip(*annot_tuples):
|
for idx, orth, tag, head, label, ner in zip(*annot_tuples):
|
||||||
|
self.orths[idx] = orth
|
||||||
if idx < tokens[0].idx:
|
if idx < tokens[0].idx:
|
||||||
pass
|
pass
|
||||||
elif idx > tokens[-1].idx:
|
elif idx > tokens[-1].idx:
|
||||||
|
@ -133,5 +135,3 @@ def _map_indices_to_tokens(ids, heads):
|
||||||
else:
|
else:
|
||||||
mapped.append(ids.index(head))
|
mapped.append(ids.index(head))
|
||||||
return mapped
|
return mapped
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -74,7 +74,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
|
move_labels = {MISSING: {'': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
|
||||||
OUT: {'': True}}
|
OUT: {'': True}}
|
||||||
moves = ('M', 'B', 'I', 'L', 'U')
|
moves = ('M', 'B', 'I', 'L', 'U')
|
||||||
for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
|
for (raw_text, toks, (ids, words, tags, heads, labels, biluo)) in gold_tuples:
|
||||||
for i, ner_tag in enumerate(biluo):
|
for i, ner_tag in enumerate(biluo):
|
||||||
if ner_tag != 'O' and ner_tag != '-':
|
if ner_tag != 'O' and ner_tag != '-':
|
||||||
move_str, label = ner_tag.split('-')
|
move_str, label = ner_tag.split('-')
|
||||||
|
@ -87,8 +87,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
elif move == 'MISSING':
|
elif move == 'MISSING':
|
||||||
return 'M'
|
return 'M'
|
||||||
else:
|
else:
|
||||||
labels = {id_: name for name, id_ in self.label_ids.items()}
|
return MOVE_NAMES[move] + '-' + self.strings[label]
|
||||||
return MOVE_NAMES[move] + '-' + labels[label]
|
|
||||||
|
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||||
for i in range(gold.length):
|
for i in range(gold.length):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user