mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-13 18:56:36 +03:00
* Tmp commit
This commit is contained in:
parent
4f83c9b3d5
commit
10ed738df2
|
@ -1,4 +1,4 @@
|
||||||
from libc.string cimport memmove
|
from libc.string cimport memmove, memcpy
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
|
@ -120,7 +120,9 @@ cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except
|
||||||
s.stack[i] = -1
|
s.stack[i] = -1
|
||||||
s.stack += (PADDING - 1)
|
s.stack += (PADDING - 1)
|
||||||
assert s.stack[0] == -1
|
assert s.stack[0] == -1
|
||||||
s.sent = <TokenC*>mem.alloc(sent_len, sizeof(TokenC))
|
state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))
|
||||||
|
memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC))
|
||||||
|
s.sent = state_sent + PADDING
|
||||||
s.stack_len = 0
|
s.stack_len = 0
|
||||||
s.i = 0
|
s.i = 0
|
||||||
s.sent_len = sent_len
|
s.sent_len = sent_len
|
||||||
|
|
|
@ -35,7 +35,8 @@ cdef get_cost_func_t[N_MOVES] get_cost_funcs
|
||||||
cdef class ArcEager(TransitionSystem):
|
cdef class ArcEager(TransitionSystem):
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_labels(cls, gold_parses):
|
def get_labels(cls, gold_parses):
|
||||||
labels = {RIGHT: {}, LEFT: {}}
|
labels = {SHIFT: {0: True}, REDUCE: {0: True}, RIGHT: {0: True},
|
||||||
|
LEFT: {0: True}, BREAK: {0: True}}
|
||||||
for parse in gold_parses:
|
for parse in gold_parses:
|
||||||
for i, (head, label) in enumerate(zip(parse.heads, parse.labels)):
|
for i, (head, label) in enumerate(zip(parse.heads, parse.labels)):
|
||||||
if head > i:
|
if head > i:
|
||||||
|
@ -128,7 +129,7 @@ cdef int _shift_cost(const Transition* self, const State* s, GoldParse gold) exc
|
||||||
cost += head_in_stack(s, s.i, gold.c_heads)
|
cost += head_in_stack(s, s.i, gold.c_heads)
|
||||||
cost += children_in_stack(s, s.i, gold.c_heads)
|
cost += children_in_stack(s, s.i, gold.c_heads)
|
||||||
if NON_MONOTONIC:
|
if NON_MONOTONIC:
|
||||||
cost += gold[s.stack[0]] == s.i
|
cost += gold.c_heads[s.stack[0]] == s.i
|
||||||
# If we can break, and there's no cost to doing so, we should
|
# If we can break, and there's no cost to doing so, we should
|
||||||
if _can_break(s) and _break_cost(self, s, gold) == 0:
|
if _can_break(s) and _break_cost(self, s, gold) == 0:
|
||||||
cost += 1
|
cost += 1
|
||||||
|
@ -138,29 +139,29 @@ cdef int _shift_cost(const Transition* self, const State* s, GoldParse gold) exc
|
||||||
cdef int _right_cost(const Transition* self, const State* s, GoldParse gold) except -1:
|
cdef int _right_cost(const Transition* self, const State* s, GoldParse gold) except -1:
|
||||||
assert s.stack_len >= 1
|
assert s.stack_len >= 1
|
||||||
cost = 0
|
cost = 0
|
||||||
if gold[s.i] == s.stack[0]:
|
if gold.c_heads[s.i] == s.stack[0]:
|
||||||
cost += self.label != gold.c_labels[s.i]
|
cost += self.label != gold.c_labels[s.i]
|
||||||
return cost
|
return cost
|
||||||
cost += head_in_buffer(s, s.i, gold.c_heads)
|
cost += head_in_buffer(s, s.i, gold.c_heads)
|
||||||
cost += children_in_stack(s, s.i, gold.c_heads)
|
cost += children_in_stack(s, s.i, gold.c_heads)
|
||||||
cost += head_in_stack(s, s.i, gold.c_heads)
|
cost += head_in_stack(s, s.i, gold.c_heads)
|
||||||
if NON_MONOTONIC:
|
if NON_MONOTONIC:
|
||||||
cost += gold[s.stack[0]] == s.i
|
cost += gold.c_heads[s.stack[0]] == s.i
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
cdef int _left_cost(const Transition* self, const State* s, GoldParse gold) except -1:
|
cdef int _left_cost(const Transition* self, const State* s, GoldParse gold) except -1:
|
||||||
assert s.stack_len >= 1
|
assert s.stack_len >= 1
|
||||||
cost = 0
|
cost = 0
|
||||||
if gold[s.stack[0]] == s.i:
|
if gold.c_heads[s.stack[0]] == s.i:
|
||||||
cost += self.label != gold.c_labels[s.top]
|
cost += self.label != gold.c_labels[s.stack[0]]
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
cost += head_in_buffer(s, s.stack[0], gold.c_heads)
|
cost += head_in_buffer(s, s.stack[0], gold.c_heads)
|
||||||
cost += children_in_buffer(s, s.stack[0], gold.c_heads)
|
cost += children_in_buffer(s, s.stack[0], gold.c_heads)
|
||||||
if NON_MONOTONIC and s.stack_len >= 2:
|
if NON_MONOTONIC and s.stack_len >= 2:
|
||||||
cost += gold[s.stack[0]] == s.stack[-1]
|
cost += gold.c_heads[s.stack[0]] == s.stack[-1]
|
||||||
cost += gold[s.stack[0]] == s.stack[0]
|
cost += gold.c_heads[s.stack[0]] == s.stack[0]
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -11,12 +11,12 @@ cdef class GoldParse:
|
||||||
cdef int length
|
cdef int length
|
||||||
cdef int loss
|
cdef int loss
|
||||||
|
|
||||||
cdef unicode raw_text
|
cdef readonly unicode raw_text
|
||||||
cdef list words
|
cdef readonly list words
|
||||||
cdef list ids
|
cdef readonly list ids
|
||||||
cdef list tags
|
cdef readonly list tags
|
||||||
cdef list heads
|
cdef readonly list heads
|
||||||
cdef list labels
|
cdef readonly list labels
|
||||||
|
|
||||||
|
|
||||||
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
cdef int heads_correct(self, TokenC* tokens, bint score_punct=?) except -1
|
||||||
|
|
|
@ -13,7 +13,10 @@ cdef class GoldParse:
|
||||||
self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
|
self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
|
||||||
|
|
||||||
cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1:
|
cdef int heads_correct(self, TokenC* tokens, bint score_punct=False) except -1:
|
||||||
pass
|
n = 0
|
||||||
|
for i in range(self.length):
|
||||||
|
n += (i + tokens[i].head) == self.c_heads[i]
|
||||||
|
return n
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_conll(cls, unicode sent_str):
|
def from_conll(cls, unicode sent_str):
|
||||||
|
@ -57,7 +60,7 @@ cdef class GoldParse:
|
||||||
tags.append(pos_string)
|
tags.append(pos_string)
|
||||||
tokenized = [sent_str.replace('<SEP>', ' ').split(' ')
|
tokenized = [sent_str.replace('<SEP>', ' ').split(' ')
|
||||||
for sent_str in tok_text.split('<SENT>')]
|
for sent_str in tok_text.split('<SENT>')]
|
||||||
return cls(raw_text, tokenized, ids, words, tags, heads, labels)
|
return cls(raw_text, words, ids, tags, heads, labels)
|
||||||
|
|
||||||
def align_to_tokens(self, tokens, label_ids):
|
def align_to_tokens(self, tokens, label_ids):
|
||||||
orig_words = list(self.words)
|
orig_words = list(self.words)
|
||||||
|
@ -70,9 +73,7 @@ cdef class GoldParse:
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
while annot and token.idx > annot[0][0]:
|
while annot and token.idx > annot[0][0]:
|
||||||
miss_id, miss_tag, miss_head, miss_label = annot.pop(0)
|
miss_id, miss_tag, miss_head, miss_label = annot.pop(0)
|
||||||
miss_w = self.words.pop(0)
|
|
||||||
if not is_punct_label(miss_label):
|
if not is_punct_label(miss_label):
|
||||||
missed.append(miss_w)
|
|
||||||
self.loss += 1
|
self.loss += 1
|
||||||
if not annot:
|
if not annot:
|
||||||
self.tags.append(None)
|
self.tags.append(None)
|
||||||
|
@ -85,15 +86,22 @@ cdef class GoldParse:
|
||||||
self.heads.append(head)
|
self.heads.append(head)
|
||||||
self.labels.append(label)
|
self.labels.append(label)
|
||||||
annot.pop(0)
|
annot.pop(0)
|
||||||
self.words.pop(0)
|
|
||||||
elif token.idx < id_:
|
elif token.idx < id_:
|
||||||
self.tags.append(None)
|
self.tags.append(None)
|
||||||
self.heads.append(None)
|
self.heads.append(None)
|
||||||
self.labels.append(None)
|
self.labels.append(None)
|
||||||
else:
|
else:
|
||||||
raise StandardError
|
raise StandardError
|
||||||
|
self.length = len(tokens)
|
||||||
|
self.c_heads = <int*>self.mem.alloc(self.length, sizeof(int))
|
||||||
|
self.c_labels = <int*>self.mem.alloc(self.length, sizeof(int))
|
||||||
|
self.ids = [token.idx for token in tokens]
|
||||||
mapped_heads = _map_indices_to_tokens(self.ids, self.heads)
|
mapped_heads = _map_indices_to_tokens(self.ids, self.heads)
|
||||||
for i in range(self.length):
|
for i in range(self.length):
|
||||||
|
if mapped_heads[i] is None:
|
||||||
|
self.c_heads[i] = -1
|
||||||
|
self.c_labels[i] = -1
|
||||||
|
else:
|
||||||
self.c_heads[i] = mapped_heads[i]
|
self.c_heads[i] = mapped_heads[i]
|
||||||
self.c_labels[i] = label_ids[self.labels[i]]
|
self.c_labels[i] = label_ids[self.labels[i]]
|
||||||
return self.loss
|
return self.loss
|
||||||
|
@ -125,38 +133,3 @@ def _parse_line(line):
|
||||||
head_idx = int(pieces[6])
|
head_idx = int(pieces[6])
|
||||||
label = pieces[7]
|
label = pieces[7]
|
||||||
return id_, word, pos, head_idx, label
|
return id_, word, pos, head_idx, label
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
# TODO
|
|
||||||
def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
|
||||||
global loss
|
|
||||||
nlp = Language()
|
|
||||||
n_corr = 0
|
|
||||||
pos_corr = 0
|
|
||||||
n_tokens = 0
|
|
||||||
total = 0
|
|
||||||
skipped = 0
|
|
||||||
loss = 0
|
|
||||||
with codecs.open(dev_loc, 'r', 'utf8') as file_:
|
|
||||||
#paragraphs = read_tokenized_gold(file_)
|
|
||||||
paragraphs = read_docparse_gold(file_)
|
|
||||||
for tokens, tag_strs, heads, labels in iter_data(paragraphs, nlp.tokenizer,
|
|
||||||
gold_preproc=gold_preproc):
|
|
||||||
assert len(tokens) == len(labels)
|
|
||||||
nlp.tagger(tokens)
|
|
||||||
nlp.parser(tokens)
|
|
||||||
for i, token in enumerate(tokens):
|
|
||||||
pos_corr += token.tag_ == tag_strs[i]
|
|
||||||
n_tokens += 1
|
|
||||||
if heads[i] is None:
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
if is_punct_label(labels[i]):
|
|
||||||
continue
|
|
||||||
n_corr += token.head.i == heads[i]
|
|
||||||
total += 1
|
|
||||||
print loss, skipped, (loss+skipped + total)
|
|
||||||
print pos_corr / n_tokens
|
|
||||||
return float(n_corr) / (total + loss)
|
|
||||||
"""
|
|
||||||
|
|
|
@ -6,6 +6,6 @@ from ..tokens cimport Tokens, TokenC
|
||||||
|
|
||||||
|
|
||||||
cdef class GreedyParser:
|
cdef class GreedyParser:
|
||||||
cdef object cfg
|
cdef readonly object cfg
|
||||||
cdef readonly Model model
|
cdef readonly Model model
|
||||||
cdef TransitionSystem moves
|
cdef readonly TransitionSystem moves
|
||||||
|
|
|
@ -98,7 +98,6 @@ cdef class GreedyParser:
|
||||||
|
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
||||||
|
|
||||||
while not is_final(state):
|
while not is_final(state):
|
||||||
fill_context(context, state)
|
fill_context(context, state)
|
||||||
scores = self.model.score(context)
|
scores = self.model.score(context)
|
||||||
|
|
|
@ -27,6 +27,7 @@ cdef class TransitionSystem:
|
||||||
cdef readonly dict label_ids
|
cdef readonly dict label_ids
|
||||||
cdef Pool mem
|
cdef Pool mem
|
||||||
cdef const Transition* c
|
cdef const Transition* c
|
||||||
|
cdef readonly int n_moves
|
||||||
|
|
||||||
cdef Transition init_transition(self, int clas, int move, int label) except *
|
cdef Transition init_transition(self, int clas, int move, int label) except *
|
||||||
|
|
||||||
|
|
|
@ -14,14 +14,16 @@ class OracleError(Exception):
|
||||||
cdef class TransitionSystem:
|
cdef class TransitionSystem:
|
||||||
def __init__(self, dict labels_by_action):
|
def __init__(self, dict labels_by_action):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
self.n_moves = sum(len(labels) for labels in labels_by_action.items())
|
self.n_moves = sum(len(labels) for labels in labels_by_action.values())
|
||||||
moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
|
moves = <Transition*>self.mem.alloc(self.n_moves, sizeof(Transition))
|
||||||
cdef int i = 0
|
cdef int i = 0
|
||||||
self.label_ids = {}
|
cdef int label_id
|
||||||
|
self.label_ids = {'ROOT': 0, 'MISSING': -1}
|
||||||
for action, label_strs in sorted(labels_by_action.items()):
|
for action, label_strs in sorted(labels_by_action.items()):
|
||||||
|
for label_str in sorted(label_strs):
|
||||||
label_str = unicode(label_str)
|
label_str = unicode(label_str)
|
||||||
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
label_id = self.label_ids.setdefault(label_str, len(self.label_ids))
|
||||||
moves[i] = self.init_transition(i, action, label_id)
|
moves[i] = self.init_transition(i, int(action), label_id)
|
||||||
i += 1
|
i += 1
|
||||||
self.c = moves
|
self.c = moves
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue
Block a user