mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
* NER seems to be working, scoring 69 F. Still need to add decision-history features --- currently it only uses the current word and two words of context. Needs refactoring.
This commit is contained in:
parent
e99f19dd6c
commit
8057a95f20
|
@ -24,6 +24,8 @@ from spacy.syntax.util import Config
|
||||||
from spacy.syntax.conll import read_docparse_file
|
from spacy.syntax.conll import read_docparse_file
|
||||||
from spacy.syntax.conll import GoldParse
|
from spacy.syntax.conll import GoldParse
|
||||||
|
|
||||||
|
from spacy.scorer import Scorer
|
||||||
|
|
||||||
|
|
||||||
def is_punct_label(label):
    """Return True when `label` marks punctuation.

    A label counts as punctuation if it is exactly 'P', or if it equals
    'punct' once lower-cased.
    """
    if label == 'P':
        return True
    return label.lower() == 'punct'
|
||||||
|
@ -186,7 +188,6 @@ def get_labels(sents):
|
||||||
|
|
||||||
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||||
gold_preproc=False, force_gold=False, n_sents=0):
|
gold_preproc=False, force_gold=False, n_sents=0):
|
||||||
print "Setup model dir"
|
|
||||||
dep_model_dir = path.join(model_dir, 'deps')
|
dep_model_dir = path.join(model_dir, 'deps')
|
||||||
pos_model_dir = path.join(model_dir, 'pos')
|
pos_model_dir = path.join(model_dir, 'pos')
|
||||||
ner_model_dir = path.join(model_dir, 'ner')
|
ner_model_dir = path.join(model_dir, 'ner')
|
||||||
|
@ -209,13 +210,16 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||||
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
|
Config.write(ner_model_dir, 'config', features='ner', seed=seed,
|
||||||
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
labels=Language.EntityTransitionSystem.get_labels(gold_tuples))
|
||||||
|
|
||||||
|
if n_sents > 0:
|
||||||
|
gold_tuples = gold_tuples[:n_sents]
|
||||||
nlp = Language()
|
nlp = Language()
|
||||||
|
ent_strings = [None] * (max(nlp.entity.moves.label_ids.values()) + 1)
|
||||||
|
for label, i in nlp.entity.moves.label_ids.items():
|
||||||
|
ent_strings[i] = label
|
||||||
|
|
||||||
|
print "Itn.\tUAS\tNER F.\tTag %"
|
||||||
for itn in range(n_iter):
|
for itn in range(n_iter):
|
||||||
dep_corr = 0
|
scorer = Scorer()
|
||||||
pos_corr = 0
|
|
||||||
ent_corr = 0
|
|
||||||
n_tokens = 0
|
|
||||||
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
for raw_text, segmented_text, annot_tuples in gold_tuples:
|
||||||
if gold_preproc:
|
if gold_preproc:
|
||||||
sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
|
sents = [nlp.tokenizer.tokens_from_list(s) for s in segmented_text]
|
||||||
|
@ -224,51 +228,32 @@ def train(Language, train_loc, model_dir, n_iter=15, feat_set=u'basic', seed=0,
|
||||||
for tokens in sents:
|
for tokens in sents:
|
||||||
gold = GoldParse(tokens, annot_tuples)
|
gold = GoldParse(tokens, annot_tuples)
|
||||||
nlp.tagger(tokens)
|
nlp.tagger(tokens)
|
||||||
#ent_corr += nlp.entity.train(tokens, gold, force_gold=force_gold)
|
nlp.entity.train(tokens, gold, force_gold=force_gold)
|
||||||
dep_corr += nlp.parser.train(tokens, gold, force_gold=force_gold)
|
#nlp.parser.train(tokens, gold, force_gold=force_gold)
|
||||||
pos_corr += nlp.tagger.train(tokens, gold.tags)
|
nlp.tagger.train(tokens, gold.tags)
|
||||||
n_tokens += len(tokens)
|
|
||||||
acc = float(dep_corr) / n_tokens
|
nlp.entity(tokens)
|
||||||
pos_acc = float(pos_corr) / n_tokens
|
tokens._ent_strings = tuple(ent_strings)
|
||||||
print '%d: ' % itn, '%.3f' % acc, '%.3f' % pos_acc
|
nlp.parser(tokens)
|
||||||
|
scorer.score(tokens, gold, verbose=False)
|
||||||
|
print '%d:\t%.3f\t%.3f\t%.3f' % (itn, scorer.uas, scorer.ents_f, scorer.tags_acc)
|
||||||
random.shuffle(gold_tuples)
|
random.shuffle(gold_tuples)
|
||||||
nlp.parser.model.end_training()
|
nlp.parser.model.end_training()
|
||||||
|
nlp.entity.model.end_training()
|
||||||
nlp.tagger.model.end_training()
|
nlp.tagger.model.end_training()
|
||||||
return acc
|
|
||||||
|
|
||||||
|
|
||||||
def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
    """Run the pipeline over the documents in `dev_loc` and return a Scorer.

    Language: callable that builds the pipeline; it is called with no
        arguments.
    dev_loc: location handed to read_docparse_file to load the gold data.
    model_dir: kept for interface compatibility; this function does not
        read it.
    gold_preproc: must be falsy. Only evaluation from raw text is
        implemented, so a truthy value raises AssertionError.

    Returns the Scorer that accumulated one score() call per document;
    callers read its attributes (e.g. uas, tags_acc, ents_f).
    """
    assert not gold_preproc, "evaluate() only supports gold_preproc=False"
    nlp = Language()
    gold_tuples = read_docparse_file(dev_loc)
    scorer = Scorer()
    for raw_text, segmented_text, annot_tuples in gold_tuples:
        # Tokenize and annotate from the raw text, then align gold to the
        # predicted tokens before scoring.
        tokens = nlp(raw_text)
        gold = GoldParse(tokens, annot_tuples)
        scorer.score(tokens, gold, verbose=False)
    return scorer
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -281,7 +266,14 @@ def evaluate(Language, dev_loc, model_dir, gold_preproc=False):
|
||||||
def main(train_loc, dev_loc, model_dir, n_sents=0):
|
def main(train_loc, dev_loc, model_dir, n_sents=0):
|
||||||
train(English, train_loc, model_dir,
|
train(English, train_loc, model_dir,
|
||||||
gold_preproc=False, force_gold=False, n_sents=n_sents)
|
gold_preproc=False, force_gold=False, n_sents=n_sents)
|
||||||
print evaluate(English, dev_loc, model_dir, gold_preproc=False)
|
scorer = evaluate(English, dev_loc, model_dir, gold_preproc=False)
|
||||||
|
print 'POS', scorer.tags_acc
|
||||||
|
print 'UAS', scorer.uas
|
||||||
|
print 'LAS', scorer.las
|
||||||
|
|
||||||
|
print 'NER P', scorer.ents_p
|
||||||
|
print 'NER R', scorer.ents_r
|
||||||
|
print 'NER F', scorer.ents_f
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
|
@ -180,7 +180,12 @@ class English(object):
|
||||||
if parse and self.has_parser_model:
|
if parse and self.has_parser_model:
|
||||||
self.parser(tokens)
|
self.parser(tokens)
|
||||||
if entity and self.has_entity_model:
|
if entity and self.has_entity_model:
|
||||||
|
# TODO: Clean this up
|
||||||
self.entity(tokens)
|
self.entity(tokens)
|
||||||
|
ent_strings = [None] * (max(self.entity.moves.label_ids.values()) + 1)
|
||||||
|
for label, i in self.entity.moves.label_ids.items():
|
||||||
|
ent_strings[i] = label
|
||||||
|
tokens._ent_strings = tuple(ent_strings)
|
||||||
return tokens
|
return tokens
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
@ -45,14 +45,12 @@ cdef struct PosTag:
|
||||||
cdef struct Entity:
|
cdef struct Entity:
|
||||||
int start
|
int start
|
||||||
int end
|
int end
|
||||||
int tag
|
|
||||||
int label
|
int label
|
||||||
|
|
||||||
|
|
||||||
cdef struct TokenC:
|
cdef struct TokenC:
|
||||||
const LexemeC* lex
|
const LexemeC* lex
|
||||||
Morphology morph
|
Morphology morph
|
||||||
Entity ent
|
|
||||||
univ_pos_t pos
|
univ_pos_t pos
|
||||||
int tag
|
int tag
|
||||||
int idx
|
int idx
|
||||||
|
@ -64,6 +62,9 @@ cdef struct TokenC:
|
||||||
uint32_t l_kids
|
uint32_t l_kids
|
||||||
uint32_t r_kids
|
uint32_t r_kids
|
||||||
|
|
||||||
|
int ent_iob
|
||||||
|
int ent_type
|
||||||
|
|
||||||
|
|
||||||
cdef struct Utf8Str:
|
cdef struct Utf8Str:
|
||||||
id_t i
|
id_t i
|
||||||
|
|
|
@ -16,6 +16,7 @@ cdef int fill_context(atom_t* context, State* state) except -1
|
||||||
# S0w,
|
# S0w,
|
||||||
# S0r0w, S0r2w, S0rw,
|
# S0r0w, S0r2w, S0rw,
|
||||||
# N0l0w, N0l2w, N0lw,
|
# N0l0w, N0l2w, N0lw,
|
||||||
|
# P2w, P1w,
|
||||||
# N0w, N1w, N2w, N3w, 0
|
# N0w, N1w, N2w, N3w, 0
|
||||||
#]
|
#]
|
||||||
|
|
||||||
|
@ -28,6 +29,9 @@ cpdef enum:
|
||||||
S2c4
|
S2c4
|
||||||
S2c6
|
S2c6
|
||||||
S2L
|
S2L
|
||||||
|
S2_prefix
|
||||||
|
S2_suffix
|
||||||
|
S2_shape
|
||||||
|
|
||||||
S1w
|
S1w
|
||||||
S1W
|
S1W
|
||||||
|
@ -36,6 +40,9 @@ cpdef enum:
|
||||||
S1c4
|
S1c4
|
||||||
S1c6
|
S1c6
|
||||||
S1L
|
S1L
|
||||||
|
S1_prefix
|
||||||
|
S1_suffix
|
||||||
|
S1_shape
|
||||||
|
|
||||||
S1rw
|
S1rw
|
||||||
S1rW
|
S1rW
|
||||||
|
@ -44,6 +51,9 @@ cpdef enum:
|
||||||
S1rc4
|
S1rc4
|
||||||
S1rc6
|
S1rc6
|
||||||
S1rL
|
S1rL
|
||||||
|
S1r_prefix
|
||||||
|
S1r_suffix
|
||||||
|
S1r_shape
|
||||||
|
|
||||||
S0lw
|
S0lw
|
||||||
S0lW
|
S0lW
|
||||||
|
@ -52,6 +62,9 @@ cpdef enum:
|
||||||
S0lc4
|
S0lc4
|
||||||
S0lc6
|
S0lc6
|
||||||
S0lL
|
S0lL
|
||||||
|
S0l_prefix
|
||||||
|
S0l_suffix
|
||||||
|
S0l_shape
|
||||||
|
|
||||||
S0l2w
|
S0l2w
|
||||||
S0l2W
|
S0l2W
|
||||||
|
@ -60,6 +73,9 @@ cpdef enum:
|
||||||
S0l2c4
|
S0l2c4
|
||||||
S0l2c6
|
S0l2c6
|
||||||
S0l2L
|
S0l2L
|
||||||
|
S0l2_prefix
|
||||||
|
S0l2_suffix
|
||||||
|
S0l2_shape
|
||||||
|
|
||||||
S0w
|
S0w
|
||||||
S0W
|
S0W
|
||||||
|
@ -68,6 +84,9 @@ cpdef enum:
|
||||||
S0c4
|
S0c4
|
||||||
S0c6
|
S0c6
|
||||||
S0L
|
S0L
|
||||||
|
S0_prefix
|
||||||
|
S0_suffix
|
||||||
|
S0_shape
|
||||||
|
|
||||||
S0r2w
|
S0r2w
|
||||||
S0r2W
|
S0r2W
|
||||||
|
@ -76,6 +95,9 @@ cpdef enum:
|
||||||
S0r2c4
|
S0r2c4
|
||||||
S0r2c6
|
S0r2c6
|
||||||
S0r2L
|
S0r2L
|
||||||
|
S0r2_prefix
|
||||||
|
S0r2_suffix
|
||||||
|
S0r2_shape
|
||||||
|
|
||||||
S0rw
|
S0rw
|
||||||
S0rW
|
S0rW
|
||||||
|
@ -84,6 +106,9 @@ cpdef enum:
|
||||||
S0rc4
|
S0rc4
|
||||||
S0rc6
|
S0rc6
|
||||||
S0rL
|
S0rL
|
||||||
|
S0r_prefix
|
||||||
|
S0r_suffix
|
||||||
|
S0r_shape
|
||||||
|
|
||||||
N0l2w
|
N0l2w
|
||||||
N0l2W
|
N0l2W
|
||||||
|
@ -92,6 +117,9 @@ cpdef enum:
|
||||||
N0l2c4
|
N0l2c4
|
||||||
N0l2c6
|
N0l2c6
|
||||||
N0l2L
|
N0l2L
|
||||||
|
N0l2_prefix
|
||||||
|
N0l2_suffix
|
||||||
|
N0l2_shape
|
||||||
|
|
||||||
N0lw
|
N0lw
|
||||||
N0lW
|
N0lW
|
||||||
|
@ -100,6 +128,9 @@ cpdef enum:
|
||||||
N0lc4
|
N0lc4
|
||||||
N0lc6
|
N0lc6
|
||||||
N0lL
|
N0lL
|
||||||
|
N0l_prefix
|
||||||
|
N0l_suffix
|
||||||
|
N0l_shape
|
||||||
|
|
||||||
N0w
|
N0w
|
||||||
N0W
|
N0W
|
||||||
|
@ -108,6 +139,9 @@ cpdef enum:
|
||||||
N0c4
|
N0c4
|
||||||
N0c6
|
N0c6
|
||||||
N0L
|
N0L
|
||||||
|
N0_prefix
|
||||||
|
N0_suffix
|
||||||
|
N0_shape
|
||||||
|
|
||||||
N1w
|
N1w
|
||||||
N1W
|
N1W
|
||||||
|
@ -116,6 +150,9 @@ cpdef enum:
|
||||||
N1c4
|
N1c4
|
||||||
N1c6
|
N1c6
|
||||||
N1L
|
N1L
|
||||||
|
N1_prefix
|
||||||
|
N1_suffix
|
||||||
|
N1_shape
|
||||||
|
|
||||||
N2w
|
N2w
|
||||||
N2W
|
N2W
|
||||||
|
@ -124,6 +161,31 @@ cpdef enum:
|
||||||
N2c4
|
N2c4
|
||||||
N2c6
|
N2c6
|
||||||
N2L
|
N2L
|
||||||
|
N2_prefix
|
||||||
|
N2_suffix
|
||||||
|
N2_shape
|
||||||
|
|
||||||
|
P1w
|
||||||
|
P1W
|
||||||
|
P1p
|
||||||
|
P1c
|
||||||
|
P1c4
|
||||||
|
P1c6
|
||||||
|
P1L
|
||||||
|
P1_prefix
|
||||||
|
P1_suffix
|
||||||
|
P1_shape
|
||||||
|
|
||||||
|
P2w
|
||||||
|
P2W
|
||||||
|
P2p
|
||||||
|
P2c
|
||||||
|
P2c4
|
||||||
|
P2c6
|
||||||
|
P2L
|
||||||
|
P2_prefix
|
||||||
|
P2_suffix
|
||||||
|
P2_shape
|
||||||
|
|
||||||
# Misc features at the end
|
# Misc features at the end
|
||||||
dist
|
dist
|
||||||
|
|
|
@ -12,6 +12,7 @@ from itertools import combinations
|
||||||
from ..tokens cimport TokenC
|
from ..tokens cimport TokenC
|
||||||
from ._state cimport State
|
from ._state cimport State
|
||||||
from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
|
from ._state cimport get_s2, get_s1, get_s0, get_n0, get_n1, get_n2
|
||||||
|
from ._state cimport get_p2, get_p1
|
||||||
from ._state cimport has_head, get_left, get_right
|
from ._state cimport has_head, get_left, get_right
|
||||||
from ._state cimport count_left_kids, count_right_kids
|
from ._state cimport count_left_kids, count_right_kids
|
||||||
|
|
||||||
|
@ -45,6 +46,9 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
||||||
context[4] = token.lex.cluster & 63
|
context[4] = token.lex.cluster & 63
|
||||||
context[5] = token.lex.cluster & 15
|
context[5] = token.lex.cluster & 15
|
||||||
context[6] = token.dep if has_head(token) else 0
|
context[6] = token.dep if has_head(token) else 0
|
||||||
|
context[7] = token.lex.prefix
|
||||||
|
context[8] = token.lex.suffix
|
||||||
|
context[9] = token.lex.shape
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_context(atom_t* context, State* state) except -1:
|
cdef int fill_context(atom_t* context, State* state) except -1:
|
||||||
|
@ -62,7 +66,8 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|
||||||
fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))
|
fill_token(&context[N0l2w], get_left(state, get_n0(state), 2))
|
||||||
fill_token(&context[N0w], get_n0(state))
|
fill_token(&context[N0w], get_n0(state))
|
||||||
fill_token(&context[N1w], get_n1(state))
|
fill_token(&context[N1w], get_n1(state))
|
||||||
fill_token(&context[N2w], get_n2(state))
|
fill_token(&context[P1w], get_p1(state))
|
||||||
|
fill_token(&context[P2w], get_p2(state))
|
||||||
|
|
||||||
if state.stack_len >= 1:
|
if state.stack_len >= 1:
|
||||||
context[dist] = state.stack[0] - state.i
|
context[dist] = state.stack[0] - state.i
|
||||||
|
@ -84,6 +89,54 @@ cdef int fill_context(atom_t* context, State* state) except -1:
|
||||||
if state.stack_len >= 3:
|
if state.stack_len >= 3:
|
||||||
context[S2_has_head] = has_head(get_s2(state))
|
context[S2_has_head] = has_head(get_s2(state))
|
||||||
|
|
||||||
|
# Feature templates for the entity recogniser. Each tuple names the context
# atoms (slots filled by fill_context) that are combined into one feature.
# Prefixes: N0/N1/N2 are the current and following tokens, P1/P2 the
# preceding tokens (see get_p1/get_p2). Suffixes follow the enum above:
# w = word, p and c = the per-token p/c slots, plus prefix/suffix/shape.
ner = (
    # Word unigrams.
    (N0w,),
    (P1w,),
    (N1w,),
    (P2w,),
    (N2w,),

    # Word bigrams around the current token.
    (P1w, N0w,),
    (N0w, N1w),

    # Affixes of the current token.
    (N0_prefix,),
    (N0_suffix,),

    # Shape unigrams, bigrams and trigram.
    (P1_shape,),
    (N0_shape,),
    (N1_shape,),
    (P1_shape, N0_shape,),
    # Was (N0_shape, P1_shape,), which repeats the template above with its
    # atoms reversed. Every other bigram group pairs N0 with N1, so the
    # right-hand shape bigram is used here. Models trained with the old
    # template list must be retrained.
    (N0_shape, N1_shape,),
    (P1_shape, N0_shape, N1_shape),
    (N2_shape,),
    (P2_shape,),

    # TODO: norm trigrams are not enabled yet:
    #(P2_norm, P1_norm, W_norm),
    #(P1_norm, W_norm, N1_norm),
    #(W_norm, N1_norm, N2_norm)

    # "p" slot unigrams, bigrams and trigrams.
    (P2p,),
    (P1p,),
    (N0p,),
    (N1p,),
    (N2p,),

    (P1p, N0p),
    (N0p, N1p),
    (P2p, P1p, N0p),
    (P1p, N0p, N1p),
    (N0p, N1p, N2p),

    # "c" slot unigrams and bigrams.
    (P2c,),
    (P1c,),
    (N0c,),
    (N1c,),
    (N2c,),

    (P1c, N0c),
    (N0c, N1c),
)
|
||||||
|
|
||||||
|
|
||||||
unigrams = (
|
unigrams = (
|
||||||
(S2W, S2p),
|
(S2W, S2p),
|
||||||
|
|
|
@ -40,6 +40,21 @@ cdef inline TokenC* get_n1(const State* s) nogil:
|
||||||
return &s.sent[s.i+1]
|
return &s.sent[s.i+1]
|
||||||
|
|
||||||
|
|
||||||
|
cdef inline TokenC* get_p1(const State* s) nogil:
    # Token one position before the current index s.i, or NULL when
    # s.i < 1, i.e. when no such position exists.
    if s.i >= 1:
        return &s.sent[s.i-1]
    return NULL
|
||||||
|
|
||||||
|
|
||||||
|
cdef inline TokenC* get_p2(const State* s) nogil:
    # Token two positions before the current index s.i, or NULL when
    # s.i < 2, i.e. when fewer than two earlier positions exist.
    if s.i >= 2:
        return &s.sent[s.i-2]
    return NULL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
cdef inline TokenC* get_n2(const State* s) nogil:
|
cdef inline TokenC* get_n2(const State* s) nogil:
|
||||||
if (s.i + 2) >= s.sent_len:
|
if (s.i + 2) >= s.sent_len:
|
||||||
return NULL
|
return NULL
|
||||||
|
@ -77,7 +92,7 @@ cdef int head_in_buffer(const State *s, const int child, const int* gold) except
|
||||||
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
|
cdef int children_in_stack(const State *s, const int head, const int* gold) except -1
|
||||||
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
|
cdef int head_in_stack(const State *s, const int child, const int* gold) except -1
|
||||||
|
|
||||||
cdef State* init_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
cdef State* new_state(Pool mem, TokenC* sent, const int sent_length) except NULL
|
||||||
|
|
||||||
|
|
||||||
cdef int count_left_kids(const TokenC* head) nogil
|
cdef int count_left_kids(const TokenC* head) nogil
|
||||||
|
|
|
@ -2,7 +2,7 @@ from libc.string cimport memmove, memcpy
|
||||||
from cymem.cymem cimport Pool
|
from cymem.cymem cimport Pool
|
||||||
|
|
||||||
from ..lexeme cimport EMPTY_LEXEME
|
from ..lexeme cimport EMPTY_LEXEME
|
||||||
from ..structs cimport TokenC
|
from ..structs cimport TokenC, Entity
|
||||||
|
|
||||||
|
|
||||||
DEF PADDING = 5
|
DEF PADDING = 5
|
||||||
|
@ -112,13 +112,15 @@ cdef int count_right_kids(const TokenC* head) nogil:
|
||||||
return _popcount(head.r_kids)
|
return _popcount(head.r_kids)
|
||||||
|
|
||||||
|
|
||||||
cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
cdef State* new_state(Pool mem, const TokenC* sent, const int sent_len) except NULL:
|
||||||
cdef int padded_len = sent_len + PADDING + PADDING
|
cdef int padded_len = sent_len + PADDING + PADDING
|
||||||
cdef State* s = <State*>mem.alloc(1, sizeof(State))
|
cdef State* s = <State*>mem.alloc(1, sizeof(State))
|
||||||
|
s.ent = <Entity*>mem.alloc(padded_len, sizeof(Entity))
|
||||||
s.stack = <int*>mem.alloc(padded_len, sizeof(int))
|
s.stack = <int*>mem.alloc(padded_len, sizeof(int))
|
||||||
for i in range(PADDING):
|
for i in range(PADDING):
|
||||||
s.stack[i] = -1
|
s.stack[i] = -1
|
||||||
s.stack += (PADDING - 1)
|
s.stack += (PADDING - 1)
|
||||||
|
s.ent += (PADDING - 1)
|
||||||
assert s.stack[0] == -1
|
assert s.stack[0] == -1
|
||||||
state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))
|
state_sent = <TokenC*>mem.alloc(padded_len, sizeof(TokenC))
|
||||||
memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC))
|
memcpy(state_sent, sent - PADDING, padded_len * sizeof(TokenC))
|
||||||
|
@ -126,5 +128,4 @@ cdef State* init_state(Pool mem, const TokenC* sent, const int sent_len) except
|
||||||
s.stack_len = 0
|
s.stack_len = 0
|
||||||
s.i = 0
|
s.i = 0
|
||||||
s.sent_len = sent_len
|
s.sent_len = sent_len
|
||||||
push_stack(s)
|
|
||||||
return s
|
return s
|
||||||
|
|
|
@ -58,7 +58,6 @@ cdef class ArcEager(TransitionSystem):
|
||||||
gold.c_heads[i] = gold.heads[i]
|
gold.c_heads[i] = gold.heads[i]
|
||||||
gold.c_labels[i] = self.label_ids[gold.labels[i]]
|
gold.c_labels[i] = self.label_ids[gold.labels[i]]
|
||||||
|
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
if '-' in name:
|
if '-' in name:
|
||||||
move_str, label_str = name.split('-', 1)
|
move_str, label_str = name.split('-', 1)
|
||||||
|
@ -82,6 +81,9 @@ cdef class ArcEager(TransitionSystem):
|
||||||
t.get_cost = get_cost_funcs[move]
|
t.get_cost = get_cost_funcs[move]
|
||||||
return t
|
return t
|
||||||
|
|
||||||
|
cdef int first_state(self, State* state) except -1:
    # Set up a newly created state for this transition system: the only
    # step is a single push_stack call (new_state itself no longer pushes).
    push_stack(state)
    return 0
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
||||||
cdef bint[N_MOVES] is_valid
|
cdef bint[N_MOVES] is_valid
|
||||||
is_valid[SHIFT] = _can_shift(s)
|
is_valid[SHIFT] = _can_shift(s)
|
||||||
|
|
|
@ -14,6 +14,7 @@ cdef class GoldParse:
|
||||||
cdef readonly list heads
|
cdef readonly list heads
|
||||||
cdef readonly list labels
|
cdef readonly list labels
|
||||||
cdef readonly list ner
|
cdef readonly list ner
|
||||||
|
cdef readonly list ents
|
||||||
|
|
||||||
cdef int* c_tags
|
cdef int* c_tags
|
||||||
cdef int* c_heads
|
cdef int* c_heads
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import numpy
|
import numpy
|
||||||
import codecs
|
import codecs
|
||||||
from .ner_util import iob_to_biluo
|
|
||||||
|
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
|
|
||||||
|
@ -47,6 +46,7 @@ def _parse_line(line):
|
||||||
label = pieces[7]
|
label = pieces[7]
|
||||||
return id_, word, pos, head_idx, label, iob_ent
|
return id_, word, pos, head_idx, label, iob_ent
|
||||||
|
|
||||||
|
|
||||||
cdef class GoldParse:
|
cdef class GoldParse:
|
||||||
def __init__(self, tokens, annot_tuples):
|
def __init__(self, tokens, annot_tuples):
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
|
@ -62,9 +62,12 @@ cdef class GoldParse:
|
||||||
self.tags = [None] * len(tokens)
|
self.tags = [None] * len(tokens)
|
||||||
self.heads = [-1] * len(tokens)
|
self.heads = [-1] * len(tokens)
|
||||||
self.labels = ['MISSING'] * len(tokens)
|
self.labels = ['MISSING'] * len(tokens)
|
||||||
self.ner = [None] * len(tokens)
|
self.ner = ['O'] * len(tokens)
|
||||||
|
|
||||||
idx_map = {token.idx: token.i for token in tokens}
|
idx_map = {token.idx: token.i for token in tokens}
|
||||||
|
self.ents = []
|
||||||
|
ent_start = None
|
||||||
|
ent_label = None
|
||||||
for idx, tag, head, label, ner in zip(*annot_tuples):
|
for idx, tag, head, label, ner in zip(*annot_tuples):
|
||||||
if idx < tokens[0].idx:
|
if idx < tokens[0].idx:
|
||||||
pass
|
pass
|
||||||
|
@ -76,8 +79,29 @@ cdef class GoldParse:
|
||||||
self.heads[i] = idx_map.get(head, -1)
|
self.heads[i] = idx_map.get(head, -1)
|
||||||
self.labels[i] = label
|
self.labels[i] = label
|
||||||
self.tags[i] = tag
|
self.tags[i] = tag
|
||||||
self.labels[i] = label
|
if ner == '-':
|
||||||
self.ner[i] = ner
|
self.ner[i] = '-'
|
||||||
|
# Deal with inconsistencies in BILUO arising from tokenization
|
||||||
|
if ner[0] in ('B', 'U', 'O') and ent_start is not None:
|
||||||
|
self.ents.append((ent_start, i, ent_label))
|
||||||
|
ent_start = None
|
||||||
|
ent_label = None
|
||||||
|
if ner[0] in ('B', 'U'):
|
||||||
|
ent_start = i
|
||||||
|
ent_label = ner[2:]
|
||||||
|
if ent_start is not None:
|
||||||
|
self.ents.append((ent_start, self.length, ent_label))
|
||||||
|
for start, end, label in self.ents:
|
||||||
|
if start == (end - 1):
|
||||||
|
self.ner[start] = 'U-%s' % label
|
||||||
|
else:
|
||||||
|
self.ner[start] = 'B-%s' % label
|
||||||
|
for i in range(start+1, end-1):
|
||||||
|
self.ner[i] = 'I-%s' % label
|
||||||
|
self.ner[end-1] = 'L-%s' % label
|
||||||
|
|
||||||
|
def __len__(self):
|
||||||
|
return self.length
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def n_non_punct(self):
|
def n_non_punct(self):
|
||||||
|
|
|
@ -34,15 +34,14 @@ cdef do_func_t[N_MOVES] do_funcs
|
||||||
|
|
||||||
|
|
||||||
cdef bint entity_is_open(const State *s) except -1:
    # True when an entity has been started but not yet closed.
    if s.ents_len < 1:
        # No entity has been started on this state at all.
        return False
    # _do_last and _do_unit store s.i+1 in s.ent.end, so an end of 0 means
    # the current entity is still open.
    return s.ent.end == 0
|
||||||
|
|
||||||
|
|
||||||
cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
|
cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
|
||||||
if not entity_is_open(s):
|
if not entity_is_open(s):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
cdef const Entity* curr = &s.sent[s.i - 1].ent
|
cdef const Transition* gold = &golds[(s.i - 1) + s.ent.start]
|
||||||
cdef const Transition* gold = &golds[(s.i - 1) + curr.start]
|
|
||||||
if gold.move != BEGIN and gold.move != UNIT:
|
if gold.move != BEGIN and gold.move != UNIT:
|
||||||
return True
|
return True
|
||||||
elif gold.label != s.ent.label:
|
elif gold.label != s.ent.label:
|
||||||
|
@ -52,14 +51,16 @@ cdef bint _entity_is_sunk(const State *s, Transition* golds) except -1:
|
||||||
|
|
||||||
|
|
||||||
cdef int _is_valid(int act, int label, const State* s) except -1:
|
cdef int _is_valid(int act, int label, const State* s) except -1:
|
||||||
if act == BEGIN:
|
if act == MISSING:
|
||||||
return not entity_is_open(s)
|
return False
|
||||||
|
elif act == BEGIN:
|
||||||
|
return label != 0 and not entity_is_open(s)
|
||||||
elif act == IN:
|
elif act == IN:
|
||||||
return entity_is_open(s) and s.ent.label == label
|
return entity_is_open(s) and label != 0 and s.ent.label == label
|
||||||
elif act == LAST:
|
elif act == LAST:
|
||||||
return entity_is_open(s) and s.ent.label == label
|
return entity_is_open(s) and label != 0 and s.ent.label == label
|
||||||
elif act == UNIT:
|
elif act == UNIT:
|
||||||
return not entity_is_open(s)
|
return label != 0 and not entity_is_open(s)
|
||||||
elif act == OUT:
|
elif act == OUT:
|
||||||
return not entity_is_open(s)
|
return not entity_is_open(s)
|
||||||
else:
|
else:
|
||||||
|
@ -69,22 +70,34 @@ cdef int _is_valid(int act, int label, const State* s) except -1:
|
||||||
cdef class BiluoPushDown(TransitionSystem):
|
cdef class BiluoPushDown(TransitionSystem):
|
||||||
@classmethod
def get_labels(cls, gold_tuples):
    """Collect the entity labels seen with each move in the gold data.

    gold_tuples: iterable of (raw_text, toks, (ids, tags, heads, labels,
        biluo)) entries, where biluo is a sequence of tag strings such as
        'B-PERSON', 'O' or '-'.

    Returns a dict mapping each move id to a dict whose keys are the labels
    observed with that move. MISSING and OUT always carry 'ROOT'.
    Raises KeyError if a tag's move prefix is not one of M, B, I, L, U.
    """
    move_labels = {MISSING: {'ROOT': True}, BEGIN: {}, IN: {}, LAST: {}, UNIT: {},
                   OUT: {'ROOT': True}}
    # Map tag prefixes to move ids by name, so this does not depend on the
    # numeric order of the move enum.
    move_ids = {'M': MISSING, 'B': BEGIN, 'I': IN, 'L': LAST, 'U': UNIT}
    for (raw_text, toks, (ids, tags, heads, labels, biluo)) in gold_tuples:
        for ner_tag in biluo:
            if ner_tag == 'O' or ner_tag == '-':
                continue
            # Split on the first hyphen only, as lookup_transition does, so
            # labels that contain a hyphen stay intact.
            move_str, label = ner_tag.split('-', 1)
            move_labels[move_ids[move_str]][label] = True
    return move_labels
|
||||||
|
|
||||||
|
def move_name(self, int move, int label):
    """Return the tag string for a (move, label) pair.

    move: move id from the move enum.
    label: label id, as stored in self.label_ids.

    Returns 'O' for OUT, 'M' for MISSING, and otherwise
    '<move name>-<label name>', e.g. the MOVE_NAMES entry joined to the
    label string with a hyphen. Raises KeyError if `label` is not a value
    in self.label_ids.
    """
    if move == OUT:
        return 'O'
    # Compare with the enum value: `move` is a C int, so the previous
    # comparison with the string 'MISSING' could never be true.
    elif move == MISSING:
        return 'M'
    else:
        # Invert label_ids (name -> id) to look the label name up by id.
        labels = {id_: name for name, id_ in self.label_ids.items()}
        return MOVE_NAMES[move] + '-' + labels[label]
|
||||||
|
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
    # Fill gold.c_ner with one Transition per token, looked up from the tag
    # strings in gold.ner (used as-is; no IOB conversion happens here).
    cdef int i
    for i in range(gold.length):
        tag_name = gold.ner[i]
        gold.c_ner[i] = self.lookup_transition(tag_name)
    return 0
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *:
|
cdef Transition lookup_transition(self, object name) except *:
|
||||||
if '-' in name:
|
if name == '-':
|
||||||
|
move_str = 'M'
|
||||||
|
label = 0
|
||||||
|
elif '-' in name:
|
||||||
move_str, label_str = name.split('-', 1)
|
move_str, label_str = name.split('-', 1)
|
||||||
label = self.label_ids[label_str]
|
label = self.label_ids[label_str]
|
||||||
else:
|
else:
|
||||||
|
@ -107,6 +120,9 @@ cdef class BiluoPushDown(TransitionSystem):
|
||||||
t.get_cost = _get_cost
|
t.get_cost = _get_cost
|
||||||
return t
|
return t
|
||||||
|
|
||||||
|
cdef int first_state(self, State* state) except -1:
    # The entity transition system needs no extra setup on a new state;
    # this exists so the parser can call first_state on any system.
    return 0
|
||||||
|
|
||||||
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
cdef Transition best_valid(self, const weight_t* scores, const State* s) except *:
|
||||||
cdef int best = -1
|
cdef int best = -1
|
||||||
cdef weight_t score = -90000
|
cdef weight_t score = -90000
|
||||||
|
@ -128,8 +144,9 @@ cdef int _get_cost(const Transition* self, const State* s, GoldParse gold) excep
|
||||||
return 9000
|
return 9000
|
||||||
cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
|
cdef bint is_sunk = _entity_is_sunk(s, gold.c_ner)
|
||||||
cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT
|
cdef int next_act = gold.c_ner[s.i+1].move if s.i < s.sent_len else OUT
|
||||||
return not _is_gold(self.move, self.label, gold.c_ner[s.i].move, gold.c_ner[s.i].label,
|
cdef bint is_gold = _is_gold(self.move, self.label, gold.c_ner[s.i].move,
|
||||||
next_act, is_sunk)
|
gold.c_ner[s.i].label, next_act, is_sunk)
|
||||||
|
return not is_gold
|
||||||
|
|
||||||
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
cdef bint _is_gold(int act, int tag, int g_act, int g_tag,
|
||||||
int next_act, bint is_sunk):
|
int next_act, bint is_sunk):
|
||||||
|
@ -210,18 +227,21 @@ cdef int _do_begin(const Transition* self, State* s) except -1:
|
||||||
s.ents_len += 1
|
s.ents_len += 1
|
||||||
s.ent.start = s.i
|
s.ent.start = s.i
|
||||||
s.ent.label = self.label
|
s.ent.label = self.label
|
||||||
s.sent[s.i].ent.tag = self.clas
|
s.sent[s.i].ent_iob = 3
|
||||||
|
s.sent[s.i].ent_type = self.label
|
||||||
s.i += 1
|
s.i += 1
|
||||||
|
|
||||||
|
|
||||||
cdef int _do_in(const Transition* self, State* s) except -1:
    # Apply IN: tag the current token as continuing the open entity, then
    # advance to the next token.
    cdef int i = s.i
    s.sent[i].ent_type = self.label
    # ent_iob code 1 is also written by _do_last; presumably "inside" --
    # confirm against the consumer of ent_iob.
    s.sent[i].ent_iob = 1
    s.i = i + 1
|
||||||
|
|
||||||
|
|
||||||
cdef int _do_last(const Transition* self, State* s) except -1:
    # Apply LAST: tag the current token as part of the open entity, close
    # the entity just past this token, and advance.
    cdef int i = s.i
    s.sent[i].ent_type = self.label
    # Same ent_iob code as _do_in writes.
    s.sent[i].ent_iob = 1
    # The entity end is exclusive: one past the token just tagged.
    s.ent.end = i + 1
    s.i = i + 1
|
||||||
|
|
||||||
|
|
||||||
|
@ -231,12 +251,13 @@ cdef int _do_unit(const Transition* self, State* s) except -1:
|
||||||
s.ent.start = s.i
|
s.ent.start = s.i
|
||||||
s.ent.label = self.label
|
s.ent.label = self.label
|
||||||
s.ent.end = s.i+1
|
s.ent.end = s.i+1
|
||||||
s.sent[s.i].ent.tag = self.clas
|
s.sent[s.i].ent_iob = 3
|
||||||
|
s.sent[s.i].ent_type = self.label
|
||||||
s.i += 1
|
s.i += 1
|
||||||
|
|
||||||
|
|
||||||
cdef int _do_out(const Transition* self, State* s) except -1:
    # Apply OUT: mark the current token with ent_iob code 2 and advance.
    # ent_type is left untouched here.
    cdef int i = s.i
    s.sent[i].ent_iob = 2
    s.i = i + 1
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -28,14 +28,12 @@ from ..tokens cimport Tokens, TokenC
|
||||||
from .arc_eager cimport TransitionSystem, Transition
|
from .arc_eager cimport TransitionSystem, Transition
|
||||||
from .transition_system import OracleError
|
from .transition_system import OracleError
|
||||||
|
|
||||||
from ._state cimport init_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1
|
from ._state cimport new_state, State, is_final, get_idx, get_s0, get_s1, get_n0, get_n1
|
||||||
from .conll cimport GoldParse
|
from .conll cimport GoldParse
|
||||||
|
|
||||||
from . import _parse_features
|
from . import _parse_features
|
||||||
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
from ._parse_features cimport fill_context, CONTEXT_SIZE
|
||||||
|
|
||||||
from ._ner_features cimport _ner_features
|
|
||||||
|
|
||||||
|
|
||||||
DEBUG = False
|
DEBUG = False
|
||||||
def set_debug(val):
|
def set_debug(val):
|
||||||
|
@ -50,7 +48,11 @@ cdef unicode print_state(State* s, list words):
|
||||||
third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head
|
third = words[s.stack[-2]] + '_%d' % s.sent[s.stack[-2]].head
|
||||||
n0 = words[s.i]
|
n0 = words[s.i]
|
||||||
n1 = words[s.i + 1]
|
n1 = words[s.i + 1]
|
||||||
return ' '.join((str(s.stack_len), third, second, top, '|', n0, n1))
|
if s.ents_len:
|
||||||
|
ent = '%s %d-%d' % (s.ent.label, s.ent.start, s.ent.end)
|
||||||
|
else:
|
||||||
|
ent = '-'
|
||||||
|
return ' '.join((ent, str(s.stack_len), third, second, top, '|', n0, n1))
|
||||||
|
|
||||||
|
|
||||||
def get_templates(name):
|
def get_templates(name):
|
||||||
|
@ -58,7 +60,7 @@ def get_templates(name):
|
||||||
if name == 'zhang':
|
if name == 'zhang':
|
||||||
return pf.arc_eager
|
return pf.arc_eager
|
||||||
elif name == 'ner':
|
elif name == 'ner':
|
||||||
return _ner_features.basic
|
return pf.ner
|
||||||
else:
|
else:
|
||||||
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
return (pf.unigrams + pf.s0_n0 + pf.s1_n0 + pf.s0_n1 + pf.n0_n1 + \
|
||||||
pf.tree_shape + pf.trigrams)
|
pf.tree_shape + pf.trigrams)
|
||||||
|
@ -79,7 +81,8 @@ cdef class GreedyParser:
|
||||||
cdef atom_t[CONTEXT_SIZE] context
|
cdef atom_t[CONTEXT_SIZE] context
|
||||||
cdef int n_feats
|
cdef int n_feats
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
cdef State* state = new_state(mem, tokens.data, tokens.length)
|
||||||
|
self.moves.first_state(state)
|
||||||
cdef Transition guess
|
cdef Transition guess
|
||||||
while not is_final(state):
|
while not is_final(state):
|
||||||
fill_context(context, state)
|
fill_context(context, state)
|
||||||
|
@ -102,7 +105,9 @@ cdef class GreedyParser:
|
||||||
|
|
||||||
self.moves.preprocess_gold(gold)
|
self.moves.preprocess_gold(gold)
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
cdef State* state = init_state(mem, tokens.data, tokens.length)
|
cdef State* state = new_state(mem, tokens.data, tokens.length)
|
||||||
|
self.moves.first_state(state)
|
||||||
|
py_words = [t.orth_ for t in tokens]
|
||||||
while not is_final(state):
|
while not is_final(state):
|
||||||
fill_context(context, state)
|
fill_context(context, state)
|
||||||
scores = self.model.score(context)
|
scores = self.model.score(context)
|
||||||
|
@ -114,7 +119,3 @@ cdef class GreedyParser:
|
||||||
best.do(&best, state)
|
best.do(&best, state)
|
||||||
else:
|
else:
|
||||||
guess.do(&guess, state)
|
guess.do(&guess, state)
|
||||||
n_corr = gold.heads_correct(state.sent, score_punct=True)
|
|
||||||
if force_gold and n_corr != tokens.length:
|
|
||||||
raise OracleError
|
|
||||||
return n_corr
|
|
||||||
|
|
|
@ -29,6 +29,8 @@ cdef class TransitionSystem:
|
||||||
cdef const Transition* c
|
cdef const Transition* c
|
||||||
cdef readonly int n_moves
|
cdef readonly int n_moves
|
||||||
|
|
||||||
|
cdef int first_state(self, State* state) except -1
|
||||||
|
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1
|
cdef int preprocess_gold(self, GoldParse gold) except -1
|
||||||
|
|
||||||
cdef Transition lookup_transition(self, object name) except *
|
cdef Transition lookup_transition(self, object name) except *
|
||||||
|
|
|
@ -28,6 +28,9 @@ cdef class TransitionSystem:
|
||||||
self.label_ids['MISSING'] = -1
|
self.label_ids['MISSING'] = -1
|
||||||
self.c = moves
|
self.c = moves
|
||||||
|
|
||||||
|
cdef int first_state(self, State* state) except -1:
|
||||||
|
raise NotImplementedError
|
||||||
|
|
||||||
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
cdef int preprocess_gold(self, GoldParse gold) except -1:
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
|
@ -39,6 +39,7 @@ cdef class Tokens:
|
||||||
cdef unicode _string
|
cdef unicode _string
|
||||||
cdef tuple _tag_strings
|
cdef tuple _tag_strings
|
||||||
cdef tuple _dep_strings
|
cdef tuple _dep_strings
|
||||||
|
cdef public tuple _ent_strings
|
||||||
|
|
||||||
cdef public bint is_tagged
|
cdef public bint is_tagged
|
||||||
cdef public bint is_parsed
|
cdef public bint is_parsed
|
||||||
|
|
|
@ -94,6 +94,7 @@ cdef class Tokens:
|
||||||
self._py_tokens = []
|
self._py_tokens = []
|
||||||
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
self._tag_strings = tuple() # These will be set by the POS tagger and parser
|
||||||
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
self._dep_strings = tuple() # The strings are arbitrary and model-specific.
|
||||||
|
self._ent_strings = tuple() # TODO: Clean this up
|
||||||
|
|
||||||
def __getitem__(self, object i):
|
def __getitem__(self, object i):
|
||||||
"""Retrieve a token.
|
"""Retrieve a token.
|
||||||
|
@ -129,6 +130,28 @@ cdef class Tokens:
|
||||||
cdef const TokenC* last = &self.data[self.length - 1]
|
cdef const TokenC* last = &self.data[self.length - 1]
|
||||||
return self._string[:last.idx + last.lex.length]
|
return self._string[:last.idx + last.lex.length]
|
||||||
|
|
||||||
|
property ents:
|
||||||
|
def __get__(self):
|
||||||
|
cdef int i
|
||||||
|
cdef const TokenC* token
|
||||||
|
cdef int start = -1
|
||||||
|
cdef object label = None
|
||||||
|
for i in range(self.length):
|
||||||
|
token = &self.data[i]
|
||||||
|
if token.ent_iob == 1:
|
||||||
|
assert start != -1
|
||||||
|
pass
|
||||||
|
elif token.ent_iob == 2:
|
||||||
|
if start != -1:
|
||||||
|
yield (start, i, label)
|
||||||
|
start = -1
|
||||||
|
label = None
|
||||||
|
elif token.ent_iob == 3:
|
||||||
|
start = i
|
||||||
|
label = self._ent_strings[token.ent_type]
|
||||||
|
if start != -1:
|
||||||
|
yield (start, self.length, label)
|
||||||
|
|
||||||
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
cdef int push_back(self, int idx, LexemeOrToken lex_or_tok) except -1:
|
||||||
if self.length == self.max_length:
|
if self.length == self.max_length:
|
||||||
self._realloc(self.length * 2)
|
self._realloc(self.length * 2)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user