mirror of
https://github.com/explosion/spaCy.git
synced 2025-02-14 02:20:34 +03:00
* Work on sense tagger
This commit is contained in:
parent
2fbcdd0ea8
commit
fb68df91b8
|
@ -69,6 +69,7 @@ cdef class Model:
|
||||||
assert self.n_classes == eg.c.nr_class
|
assert self.n_classes == eg.c.nr_class
|
||||||
memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
|
memset(eg.c.scores, 0, sizeof(weight_t) * eg.c.nr_class)
|
||||||
self.set_scores(eg.c.scores, eg.c.atoms)
|
self.set_scores(eg.c.scores, eg.c.atoms)
|
||||||
|
|
||||||
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
|
eg.c.guess = arg_max_if_true(eg.c.scores, eg.c.is_valid, self.n_classes)
|
||||||
if eg.c.guess == -1:
|
if eg.c.guess == -1:
|
||||||
raise ValidationError("No valid classes during prediction")
|
raise ValidationError("No valid classes during prediction")
|
||||||
|
|
|
@ -155,7 +155,8 @@ def read_json_file(loc, docs_filter=None):
|
||||||
if labels[-1].lower() == 'root':
|
if labels[-1].lower() == 'root':
|
||||||
labels[-1] = 'ROOT'
|
labels[-1] = 'ROOT'
|
||||||
ner.append(token.get('ner', '-'))
|
ner.append(token.get('ner', '-'))
|
||||||
t_wsd = [s.replace('.', '_') for s in token.get('ssenses', [])]
|
t_wsd = [s.replace('noun.', 'N_').replace('verb.', 'V_')
|
||||||
|
for s in token.get('ssenses', [])]
|
||||||
wsd.append(t_wsd)
|
wsd.append(t_wsd)
|
||||||
sents.append((
|
sents.append((
|
||||||
(ids, words, tags, heads, labels, ner, wsd),
|
(ids, words, tags, heads, labels, ner, wsd),
|
||||||
|
|
|
@ -133,6 +133,5 @@ class Scorer(object):
|
||||||
if gold_senses and gold.gold_to_cand[i] is not None:
|
if gold_senses and gold.gold_to_cand[i] is not None:
|
||||||
cand_i = gold.gold_to_cand[i]
|
cand_i = gold.gold_to_cand[i]
|
||||||
sense_str = tokens[cand_i].sense_
|
sense_str = tokens[cand_i].sense_
|
||||||
sense_str = sense_str.replace('N_', 'noun.').replace('V_', 'verb.')
|
|
||||||
self.wsd.tp += sense_str in gold_senses
|
self.wsd.tp += sense_str in gold_senses
|
||||||
self.wsd.fn += sense_str not in gold_senses
|
self.wsd.fn += sense_str not in gold_senses
|
||||||
|
|
|
@ -2,9 +2,12 @@ from .typedefs cimport flags_t
|
||||||
from .structs cimport TokenC
|
from .structs cimport TokenC
|
||||||
from .strings cimport StringStore
|
from .strings cimport StringStore
|
||||||
from .tokens cimport Tokens
|
from .tokens cimport Tokens
|
||||||
from .senses cimport POS_SENSES, N_SENSES, encode_sense_strs
|
from .senses cimport N_SENSES, encode_sense_strs
|
||||||
|
from .senses cimport N_Tops, J_ppl, V_body
|
||||||
from .gold cimport GoldParse
|
from .gold cimport GoldParse
|
||||||
from .parts_of_speech cimport NOUN, VERB
|
from .parts_of_speech cimport NOUN, VERB, N_UNIV_TAGS
|
||||||
|
|
||||||
|
from . cimport parts_of_speech
|
||||||
|
|
||||||
from thinc.learner cimport LinearModel
|
from thinc.learner cimport LinearModel
|
||||||
from thinc.features cimport Extractor
|
from thinc.features cimport Extractor
|
||||||
|
@ -21,40 +24,30 @@ cdef enum:
|
||||||
P2c
|
P2c
|
||||||
P2c6
|
P2c6
|
||||||
P2c4
|
P2c4
|
||||||
P2ss
|
|
||||||
P2s
|
|
||||||
|
|
||||||
P1W
|
P1W
|
||||||
P1p
|
P1p
|
||||||
P1c
|
P1c
|
||||||
P1c6
|
P1c6
|
||||||
P1c4
|
P1c4
|
||||||
P1ss
|
|
||||||
P1s
|
|
||||||
|
|
||||||
N0W
|
N0W
|
||||||
N0p
|
N0p
|
||||||
N0c
|
N0c
|
||||||
N0c6
|
N0c6
|
||||||
N0c4
|
N0c4
|
||||||
N0ss
|
|
||||||
N0s
|
|
||||||
|
|
||||||
N1W
|
N1W
|
||||||
N1p
|
N1p
|
||||||
N1c
|
N1c
|
||||||
N1c6
|
N1c6
|
||||||
N1c4
|
N1c4
|
||||||
N1ss
|
|
||||||
N1s
|
|
||||||
|
|
||||||
N2W
|
N2W
|
||||||
N2p
|
N2p
|
||||||
N2c
|
N2c
|
||||||
N2c6
|
N2c6
|
||||||
N2c4
|
N2c4
|
||||||
N2ss
|
|
||||||
N2s
|
|
||||||
|
|
||||||
CONTEXT_SIZE
|
CONTEXT_SIZE
|
||||||
|
|
||||||
|
@ -67,8 +60,6 @@ unigrams = (
|
||||||
(P2c6, P2p),
|
(P2c6, P2p),
|
||||||
(P2c4, P2p),
|
(P2c4, P2p),
|
||||||
(P2c,),
|
(P2c,),
|
||||||
(P2ss,),
|
|
||||||
(P1s,),
|
|
||||||
|
|
||||||
(P1W,),
|
(P1W,),
|
||||||
(P1p,),
|
(P1p,),
|
||||||
|
@ -84,8 +75,6 @@ unigrams = (
|
||||||
(P1c6, P1p),
|
(P1c6, P1p),
|
||||||
(P1c4, P1p),
|
(P1c4, P1p),
|
||||||
(P1c,),
|
(P1c,),
|
||||||
(P1ss,),
|
|
||||||
(P1s,),
|
|
||||||
|
|
||||||
(N0p,),
|
(N0p,),
|
||||||
(N0W, N0p),
|
(N0W, N0p),
|
||||||
|
@ -100,7 +89,6 @@ unigrams = (
|
||||||
(N0c6, N0p),
|
(N0c6, N0p),
|
||||||
(N0c4, N0p),
|
(N0c4, N0p),
|
||||||
(N0c,),
|
(N0c,),
|
||||||
(N0ss,),
|
|
||||||
|
|
||||||
(N1p,),
|
(N1p,),
|
||||||
(N1W, N1p),
|
(N1W, N1p),
|
||||||
|
@ -115,7 +103,6 @@ unigrams = (
|
||||||
(N1c6, N1p),
|
(N1c6, N1p),
|
||||||
(N1c4, N1p),
|
(N1c4, N1p),
|
||||||
(N1c,),
|
(N1c,),
|
||||||
(N1ss,),
|
|
||||||
|
|
||||||
(N2p,),
|
(N2p,),
|
||||||
(N2W, N2p),
|
(N2W, N2p),
|
||||||
|
@ -130,7 +117,6 @@ unigrams = (
|
||||||
(N2c6, N2p),
|
(N2c6, N2p),
|
||||||
(N2c4, N2p),
|
(N2c4, N2p),
|
||||||
(N2c,),
|
(N2c,),
|
||||||
(N2ss,),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -141,27 +127,21 @@ bigrams = (
|
||||||
(P1c, N0p),
|
(P1c, N0p),
|
||||||
(P1c6, N0p),
|
(P1c6, N0p),
|
||||||
|
|
||||||
(P2s, P1s),
|
|
||||||
(P2ss, P1s,),
|
|
||||||
(P2ss, P1ss,),
|
|
||||||
|
|
||||||
(P1ss, N0ss),
|
|
||||||
|
|
||||||
(N0p, N1p,),
|
(N0p, N1p,),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
trigrams = (
|
trigrams = (
|
||||||
(P1p, N0p, N1p),
|
(P1p, N0p, N1p),
|
||||||
(P2p, P1p, N0ss),
|
(P2p, P1p,),
|
||||||
(P2c4, P1c4, N0c4),
|
(P2c4, P1c4, N0c4),
|
||||||
|
|
||||||
(P1p, N0p, N1p),
|
(P1p, N0p, N1p),
|
||||||
(P1p, N0p, N1ss),
|
(P1p, N0p,),
|
||||||
(P1c4, N0c4, N1c4),
|
(P1c4, N0c4, N1c4),
|
||||||
|
|
||||||
(N0p, N1p, N2p),
|
(N0p, N1p, N2p),
|
||||||
(N0p, N1p, N2ss),
|
(N0p, N1p,),
|
||||||
(N0c4, N1c4, N2c4),
|
(N0c4, N1c4, N2c4),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -172,8 +152,6 @@ cdef int fill_token(atom_t* ctxt, const TokenC* token) except -1:
|
||||||
ctxt[2] = token.lex.cluster
|
ctxt[2] = token.lex.cluster
|
||||||
ctxt[3] = token.lex.cluster & 15
|
ctxt[3] = token.lex.cluster & 15
|
||||||
ctxt[4] = token.lex.cluster & 63
|
ctxt[4] = token.lex.cluster & 63
|
||||||
ctxt[5] = token.lex.senses & POS_SENSES[<int>token.pos]
|
|
||||||
ctxt[6] = token.sense
|
|
||||||
|
|
||||||
|
|
||||||
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
|
cdef int fill_context(atom_t* ctxt, const TokenC* token) except -1:
|
||||||
|
@ -194,6 +172,7 @@ cdef class SenseTagger:
|
||||||
cdef readonly LinearModel model
|
cdef readonly LinearModel model
|
||||||
cdef readonly Extractor extractor
|
cdef readonly Extractor extractor
|
||||||
cdef readonly model_dir
|
cdef readonly model_dir
|
||||||
|
cdef readonly flags_t[<int>N_UNIV_TAGS] pos_senses
|
||||||
|
|
||||||
def __init__(self, StringStore strings, model_dir):
|
def __init__(self, StringStore strings, model_dir):
|
||||||
if model_dir is not None and path.isdir(model_dir):
|
if model_dir is not None and path.isdir(model_dir):
|
||||||
|
@ -207,25 +186,50 @@ cdef class SenseTagger:
|
||||||
self.model.load(self.model_dir, freq_thresh=0)
|
self.model.load(self.model_dir, freq_thresh=0)
|
||||||
self.strings = strings
|
self.strings = strings
|
||||||
|
|
||||||
|
self.pos_senses[<int>parts_of_speech.NO_TAG] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADJ] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADV] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.ADP] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.CONJ] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.DET] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.NOUN] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.NUM] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.PRON] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.PRT] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.VERB] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.X] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.PUNCT] = 0
|
||||||
|
self.pos_senses[<int>parts_of_speech.EOL] = 0
|
||||||
|
|
||||||
|
|
||||||
|
cdef flags_t sense = 0
|
||||||
|
for _sense in range(N_Tops, V_body):
|
||||||
|
self.pos_senses[<int>parts_of_speech.NOUN] |= 1 << sense
|
||||||
|
|
||||||
|
for _sense in range(V_body, J_ppl):
|
||||||
|
self.pos_senses[<int>parts_of_speech.VERB] |= 1 << sense
|
||||||
|
|
||||||
def __call__(self, Tokens tokens):
|
def __call__(self, Tokens tokens):
|
||||||
cdef atom_t[CONTEXT_SIZE] context
|
cdef atom_t[CONTEXT_SIZE] context
|
||||||
cdef int i, guess, n_feats
|
cdef int i, guess, n_feats
|
||||||
cdef const TokenC* token
|
cdef TokenC* token
|
||||||
for i in range(tokens.length):
|
for i in range(tokens.length):
|
||||||
token = &tokens.data[i]
|
token = &tokens.data[i]
|
||||||
if token.pos in (NOUN, VERB):
|
if token.pos in (NOUN, VERB):
|
||||||
fill_context(context, token)
|
fill_context(context, token)
|
||||||
feats = self.extractor.get_feats(context, &n_feats)
|
feats = self.extractor.get_feats(context, &n_feats)
|
||||||
scores = self.model.get_scores(feats, n_feats)
|
scores = self.model.get_scores(feats, n_feats)
|
||||||
tokens.data[i].sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
|
tokens.data[i].sense = self.best_in_set(scores, self.pos_senses[<int>token.pos])
|
||||||
|
|
||||||
def train(self, Tokens tokens, GoldParse gold):
|
def train(self, Tokens tokens, GoldParse gold):
|
||||||
cdef int i, j
|
cdef int i, j
|
||||||
|
cdef TokenC* token
|
||||||
for i, ssenses in enumerate(gold.ssenses):
|
for i, ssenses in enumerate(gold.ssenses):
|
||||||
|
token = &tokens.data[i]
|
||||||
if ssenses:
|
if ssenses:
|
||||||
gold.c.ssenses[i] = encode_sense_strs(ssenses)
|
gold.c.ssenses[i] = encode_sense_strs(ssenses)
|
||||||
else:
|
else:
|
||||||
gold.c.ssenses[i] = pos_senses(&tokens.data[i])
|
gold.c.ssenses[i] = token.lex.senses & self.pos_senses[<int>token.pos]
|
||||||
|
|
||||||
cdef atom_t[CONTEXT_SIZE] context
|
cdef atom_t[CONTEXT_SIZE] context
|
||||||
cdef int n_feats
|
cdef int n_feats
|
||||||
|
@ -240,7 +244,7 @@ cdef class SenseTagger:
|
||||||
fill_context(context, token)
|
fill_context(context, token)
|
||||||
feats = self.extractor.get_feats(context, &n_feats)
|
feats = self.extractor.get_feats(context, &n_feats)
|
||||||
scores = self.model.get_scores(feats, n_feats)
|
scores = self.model.get_scores(feats, n_feats)
|
||||||
token.sense = self.best_in_set(scores, POS_SENSES[<int>token.pos])
|
token.sense = self.best_in_set(scores, token.lex.senses)
|
||||||
best = self.best_in_set(scores, gold.c.ssenses[i])
|
best = self.best_in_set(scores, gold.c.ssenses[i])
|
||||||
guess_counts = {}
|
guess_counts = {}
|
||||||
gold_counts = {}
|
gold_counts = {}
|
||||||
|
@ -251,7 +255,7 @@ cdef class SenseTagger:
|
||||||
feat = (f_i, f_key)
|
feat = (f_i, f_key)
|
||||||
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
|
gold_counts[feat] = gold_counts.get(feat, 0) + 1.0
|
||||||
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
guess_counts[feat] = guess_counts.get(feat, 0) - 1.0
|
||||||
#self.model.update({token.sense: guess_counts, best: gold_counts})
|
self.model.update({token.sense: guess_counts, best: gold_counts})
|
||||||
return cost
|
return cost
|
||||||
|
|
||||||
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
cdef int best_in_set(self, const weight_t* scores, flags_t senses) except -1:
|
||||||
|
@ -266,10 +270,6 @@ cdef class SenseTagger:
|
||||||
return argmax
|
return argmax
|
||||||
|
|
||||||
|
|
||||||
cdef flags_t pos_senses(const TokenC* token) nogil:
|
|
||||||
return token.lex.senses & POS_SENSES[<int>token.pos]
|
|
||||||
|
|
||||||
|
|
||||||
cdef list _set_bits(flags_t flags):
|
cdef list _set_bits(flags_t flags):
|
||||||
bits = []
|
bits = []
|
||||||
cdef flags_t bit
|
cdef flags_t bit
|
||||||
|
|
|
@ -4,13 +4,17 @@ from .typedefs cimport flags_t
|
||||||
|
|
||||||
cpdef enum:
|
cpdef enum:
|
||||||
NO_SENSE
|
NO_SENSE
|
||||||
|
J_all
|
||||||
|
J_pert
|
||||||
|
A_all
|
||||||
|
N_Tops
|
||||||
N_act
|
N_act
|
||||||
N_animal
|
N_animal
|
||||||
N_artifact
|
N_artifact
|
||||||
N_attribute
|
N_attribute
|
||||||
N_body
|
N_body
|
||||||
N_cognition
|
N_cognition
|
||||||
N_communication
|
N_communication
|
||||||
N_event
|
N_event
|
||||||
N_feeling
|
N_feeling
|
||||||
N_food
|
N_food
|
||||||
|
@ -44,10 +48,8 @@ cpdef enum:
|
||||||
V_social
|
V_social
|
||||||
V_stative
|
V_stative
|
||||||
V_weather
|
V_weather
|
||||||
|
J_ppl
|
||||||
N_SENSES
|
N_SENSES
|
||||||
|
|
||||||
|
|
||||||
cdef flags_t[<int>parts_of_speech.N_UNIV_TAGS] POS_SENSES
|
|
||||||
|
|
||||||
|
|
||||||
cdef flags_t encode_sense_strs(sense_names) except 0
|
cdef flags_t encode_sense_strs(sense_names) except 0
|
||||||
|
|
120
spacy/senses.pyx
120
spacy/senses.pyx
|
@ -2,74 +2,56 @@ from __future__ import unicode_literals
|
||||||
cimport parts_of_speech
|
cimport parts_of_speech
|
||||||
|
|
||||||
|
|
||||||
POS_SENSES[<int>parts_of_speech.NO_TAG] = 0
|
lexnames_str = """
|
||||||
POS_SENSES[<int>parts_of_speech.ADJ] = 0
|
-1 NO_SENSE -1
|
||||||
POS_SENSES[<int>parts_of_speech.ADV] = 0
|
00 J_all 3
|
||||||
POS_SENSES[<int>parts_of_speech.ADP] = 0
|
01 A_pert 3
|
||||||
POS_SENSES[<int>parts_of_speech.CONJ] = 0
|
02 A_all 4
|
||||||
POS_SENSES[<int>parts_of_speech.DET] = 0
|
03 N_Tops 1
|
||||||
POS_SENSES[<int>parts_of_speech.NOUN] = 0
|
04 N_act 1
|
||||||
POS_SENSES[<int>parts_of_speech.NUM] = 0
|
05 N_animal 1
|
||||||
POS_SENSES[<int>parts_of_speech.PRON] = 0
|
06 N_artifact 1
|
||||||
POS_SENSES[<int>parts_of_speech.PRT] = 0
|
07 N_attribute 1
|
||||||
POS_SENSES[<int>parts_of_speech.VERB] = 0
|
08 N_body 1
|
||||||
POS_SENSES[<int>parts_of_speech.X] = 0
|
09 N_cognition 1
|
||||||
POS_SENSES[<int>parts_of_speech.PUNCT] = 0
|
10 N_communication 1
|
||||||
POS_SENSES[<int>parts_of_speech.EOL] = 0
|
11 N_event 1
|
||||||
|
12 N_feeling 1
|
||||||
|
13 N_food 1
|
||||||
|
14 N_group 1
|
||||||
|
15 N_location 1
|
||||||
|
16 N_motive 1
|
||||||
|
17 N_object 1
|
||||||
|
18 N_person 1
|
||||||
|
19 N_phenomenon 1
|
||||||
|
20 N_plant 1
|
||||||
|
21 N_possession 1
|
||||||
|
22 N_process 1
|
||||||
|
23 N_quantity 1
|
||||||
|
24 N_relation 1
|
||||||
|
25 N_shape 1
|
||||||
|
26 N_state 1
|
||||||
|
27 N_substance 1
|
||||||
|
28 N_time 1
|
||||||
|
29 V_body 2
|
||||||
|
30 V_change 2
|
||||||
|
31 V_cognition 2
|
||||||
|
32 V_communication 2
|
||||||
|
33 V_competition 2
|
||||||
|
34 V_consumption 2
|
||||||
|
35 V_contact 2
|
||||||
|
36 V_creation 2
|
||||||
|
37 V_emotion 2
|
||||||
|
38 V_motion 2
|
||||||
|
39 V_perception 2
|
||||||
|
40 V_possession 2
|
||||||
|
41 V_social 2
|
||||||
|
42 V_stative 2
|
||||||
|
43 V_weather 2
|
||||||
|
44 A_ppl 3
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
STRINGS = tuple(line.split()[1] for line in lexnames_str.split('\n'))
|
||||||
cdef int _sense = 0
|
|
||||||
|
|
||||||
for _sense in range(N_act, V_body):
|
|
||||||
POS_SENSES[<int>parts_of_speech.NOUN] |= 1 << _sense
|
|
||||||
|
|
||||||
for _sense in range(V_body, V_weather+1):
|
|
||||||
POS_SENSES[<int>parts_of_speech.VERB] |= 1 << _sense
|
|
||||||
|
|
||||||
|
|
||||||
STRINGS = (
|
|
||||||
'-NO_SENSE-',
|
|
||||||
'N_act',
|
|
||||||
'N_animal',
|
|
||||||
'N_artifact',
|
|
||||||
'N_attribute',
|
|
||||||
'N_body',
|
|
||||||
'N_cognition',
|
|
||||||
'N_communication',
|
|
||||||
'N_event',
|
|
||||||
'N_feeling',
|
|
||||||
'N_food',
|
|
||||||
'N_group',
|
|
||||||
'N_location',
|
|
||||||
'N_motive',
|
|
||||||
'N_object',
|
|
||||||
'N_person',
|
|
||||||
'N_phenomenon',
|
|
||||||
'N_plant',
|
|
||||||
'N_possession',
|
|
||||||
'N_process',
|
|
||||||
'N_quantity',
|
|
||||||
'N_relation',
|
|
||||||
'N_shape',
|
|
||||||
'N_state',
|
|
||||||
'N_substance',
|
|
||||||
'N_time',
|
|
||||||
'V_body',
|
|
||||||
'V_change',
|
|
||||||
'V_cognition',
|
|
||||||
'V_communication',
|
|
||||||
'V_competition',
|
|
||||||
'V_consumption',
|
|
||||||
'V_contact',
|
|
||||||
'V_creation',
|
|
||||||
'V_emotion',
|
|
||||||
'V_motion',
|
|
||||||
'V_perception',
|
|
||||||
'V_possession',
|
|
||||||
'V_social',
|
|
||||||
'V_stative',
|
|
||||||
'V_weather'
|
|
||||||
)
|
|
||||||
|
|
||||||
IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS))
|
IDS = dict((sense_str, i) for i, sense_str in enumerate(STRINGS))
|
||||||
|
|
||||||
|
@ -80,8 +62,8 @@ cdef flags_t encode_sense_strs(sense_names) except 0:
|
||||||
return sense_bits | (1 << NO_SENSE)
|
return sense_bits | (1 << NO_SENSE)
|
||||||
cdef flags_t sense_id = 0
|
cdef flags_t sense_id = 0
|
||||||
for sense_str in sense_names:
|
for sense_str in sense_names:
|
||||||
if '.' in sense_str:
|
sense_str = sense_str.replace('noun', 'N').replace('verb', 'V')
|
||||||
sense_str = sense_str[0].upper() + '_' + sense_str.split('.')[1]
|
sense_str = sense_str.replace('adj', 'J').replace('adv', 'A')
|
||||||
sense_id = IDS[sense_str]
|
sense_id = IDS[sense_str]
|
||||||
sense_bits |= (1 << sense_id)
|
sense_bits |= (1 << sense_id)
|
||||||
return sense_bits
|
return sense_bits
|
||||||
|
|
|
@ -61,7 +61,7 @@ cdef inline void fill_token(atom_t* context, const TokenC* token) nogil:
|
||||||
context[9] = token.lex.shape
|
context[9] = token.lex.shape
|
||||||
context[10] = token.ent_iob
|
context[10] = token.ent_iob
|
||||||
context[11] = token.ent_type
|
context[11] = token.ent_type
|
||||||
context[12] = token.lex.senses & senses.POS_SENSES[<int>token.pos]
|
context[12] = 0 # token.lex.senses & senses.POS_SENSES[<int>token.pos]
|
||||||
|
|
||||||
cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
|
cdef int fill_context(atom_t* ctxt, StateClass st) nogil:
|
||||||
# Take care to fill every element of context!
|
# Take care to fill every element of context!
|
||||||
|
|
|
@ -93,7 +93,7 @@ cdef class Tokens:
|
||||||
else:
|
else:
|
||||||
size = 5
|
size = 5
|
||||||
self.mem = Pool()
|
self.mem = Pool()
|
||||||
# Guarantee self.lex[i-x], for any i >= 0 and x < padding is in bounds
|
# Guarantee self.data[i-x], for any i >= 0 and x < padding is in bounds
|
||||||
# However, we need to remember the true starting places, so that we can
|
# However, we need to remember the true starting places, so that we can
|
||||||
# realloc.
|
# realloc.
|
||||||
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
data_start = <TokenC*>self.mem.alloc(size + (PADDING*2), sizeof(TokenC))
|
||||||
|
|
Loading…
Reference in New Issue
Block a user