* Complete refactor of Tagger features, to use a generic list of context names.

Matthew Honnibal 2014-11-05 20:45:29 +11:00
parent 0a8c84625d
commit 4ecbe8c893
14 changed files with 166 additions and 450 deletions
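The shape of the refactor: the per-tagger cpdef enum of context indices (P2i ... P1t, CONTEXT_SIZE) is replaced by generic Token/Slots classes whose fields are numbered once at import time; feature templates then name fields as N0.w, P1.pos, and so on. Below is a minimal Python sketch of that pattern, with simplified stand-ins for the committed Cython (field list abridged, all names illustrative):

    FIELDS = ['i', 'c', 'w', 'shape', 'pref', 'suff', 'pos', 'ner']   # abridged

    class Token:
        pass                      # fields are attached dynamically below

    def number_token(tok, start):
        # Python analogue of _number_token: give every field of this slot
        # a distinct index into the flat context array.
        i = start
        for name in FIELDS:
            setattr(tok, name, i)
            i += 1
        return i

    class Slots:
        def __init__(self):
            self.P2, self.P1, self.N0, self.N1, self.N2 = (Token() for _ in range(5))

    FIELD_IDS = Slots()
    N_FIELDS = 0
    for slot in (FIELD_IDS.P2, FIELD_IDS.P1, FIELD_IDS.N0, FIELD_IDS.N1, FIELD_IDS.N2):
        N_FIELDS = number_token(slot, N_FIELDS)

    # A template is now just a tuple of flat indices, written with generic names:
    TEMPLATES = ((FIELD_IDS.P1.pos, FIELD_IDS.N0.w),)   # "previous tag + word"
    print(N_FIELDS, TEMPLATES)                          # -> 40 ((14, 18),)

With the numbering generic, adding a tagger only means writing a new TEMPLATES list, which is what the new spacy/ner_feats.pyx below does.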

View File

@@ -4,40 +4,42 @@ from .tokens cimport Tokens
 from .lexeme cimport Lexeme

-cdef struct Token:
-    atom_t i
-    atom_t c
-    atom_t w
-    atom_t shape
-    atom_t pref
-    atom_t suff
-    atom_t oft_title
-    atom_t oft_upper
-    atom_t is_alpha
-    atom_t is_digit
-    atom_t is_title
-    atom_t is_upper
-    atom_t url
-    atom_t num
-    atom_t postype
-    atom_t pos
-    atom_t ner
+cdef class Token:
+    cdef readonly atom_t i
+    cdef readonly atom_t c
+    cdef readonly atom_t w
+    cdef readonly atom_t shape
+    cdef readonly atom_t pref
+    cdef readonly atom_t suff
+    cdef readonly atom_t oft_title
+    cdef readonly atom_t oft_upper
+    cdef readonly atom_t is_alpha
+    cdef readonly atom_t is_digit
+    cdef readonly atom_t is_title
+    cdef readonly atom_t is_upper
+    cdef readonly atom_t url
+    cdef readonly atom_t num
+    cdef readonly atom_t postype
+    cdef readonly atom_t pos
+    cdef readonly atom_t ner

-cdef struct Slots:
-    Token P2
-    Token P1
-    Token N0
-    Token N1
-    Token N2
+cdef class Slots:
+    cdef readonly Token P2
+    cdef readonly Token P1
+    cdef readonly Token N0
+    cdef readonly Token N1
+    cdef readonly Token N2

-cdef Slots FIELD_IDS
 cdef int N_FIELDS

-cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0
-cdef int fill_flat(atom_t* context, Slots* s) except -1
+cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0
+cdef int fill_flat(atom_t* context, Slots s) except -1
+
+cpdef Slots FIELD_IDS

View File

@@ -2,7 +2,16 @@ from murmurhash.mrmr cimport hash64
 from .lexeme cimport *

-cdef void _number_token(Token* t, int* n_fields):
+cdef class Slots:
+    def __init__(self):
+        self.P2 = Token()
+        self.P1 = Token()
+        self.N0 = Token()
+        self.N1 = Token()
+        self.N2 = Token()
+
+cdef void _number_token(Token t, int* n_fields):
     cdef int i = n_fields[0]
     t.i = i; i += 1
     t.c = i; i += 1
@@ -27,7 +36,7 @@ cdef void _number_token(Token* t, int* n_fields):
     n_fields[0] = i

-cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
+cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
     t.i = lex.sic
     t.c = lex.cluster
     t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
@@ -48,7 +57,7 @@ cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
     t.ner = ner

-cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
+cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
     context[ids.i] = vals.i
     context[ids.c] = vals.c
     context[ids.w] = vals.w
@@ -68,26 +77,27 @@ cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
     context[ids.ner] = vals.ner

-cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0:
-    fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
-    fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
-    fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
-    fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
-    fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
-    return hash64(s, sizeof(Slots), 0)
+cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0:
+    fill_token(s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
+    fill_token(s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
+    fill_token(s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
+    fill_token(s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
+    fill_token(s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
+    return 1

-cdef int fill_flat(atom_t* context, Slots* s) except -1:
-    _flatten_token(context, &FIELD_IDS.P2, &s.P2)
-    _flatten_token(context, &FIELD_IDS.P1, &s.P1)
-    _flatten_token(context, &FIELD_IDS.N0, &s.N0)
-    _flatten_token(context, &FIELD_IDS.N1, &s.N1)
-    _flatten_token(context, &FIELD_IDS.N2, &s.N2)
+cdef int fill_flat(atom_t* context, Slots s) except -1:
+    _flatten_token(context, FIELD_IDS.P2, s.P2)
+    _flatten_token(context, FIELD_IDS.P1, s.P1)
+    _flatten_token(context, FIELD_IDS.N0, s.N0)
+    _flatten_token(context, FIELD_IDS.N1, s.N1)
+    _flatten_token(context, FIELD_IDS.N2, s.N2)

 N_FIELDS = 0
-_number_token(&FIELD_IDS.P2, &N_FIELDS)
-_number_token(&FIELD_IDS.P1, &N_FIELDS)
-_number_token(&FIELD_IDS.N0, &N_FIELDS)
-_number_token(&FIELD_IDS.N1, &N_FIELDS)
-_number_token(&FIELD_IDS.N2, &N_FIELDS)
+FIELD_IDS = Slots()
+_number_token(FIELD_IDS.P2, &N_FIELDS)
+_number_token(FIELD_IDS.P1, &N_FIELDS)
+_number_token(FIELD_IDS.N0, &N_FIELDS)
+_number_token(FIELD_IDS.N1, &N_FIELDS)
+_number_token(FIELD_IDS.N2, &N_FIELDS)
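At runtime, fill_slots copies the five-token window into the Slots values and fill_flat scatters them into the flat context array at the positions FIELD_IDS assigned. Continuing the Python sketch above (same simplified stand-ins, illustrative only):

    def fill_flat(context, ids, vals):
        # Analogue of fill_flat/_flatten_token: context[ids.field] = vals.field
        # for every field of every slot.
        for slot in ('P2', 'P1', 'N0', 'N1', 'N2'):
            id_tok, val_tok = getattr(ids, slot), getattr(vals, slot)
            for name in FIELDS:
                context[getattr(id_tok, name)] = getattr(val_tok, name)

    vals = Slots()
    for tok in (vals.P2, vals.P1, vals.N0, vals.N1, vals.N2):
        for name in FIELDS:
            setattr(tok, name, 0)        # stand-ins for real lexeme attributes
    vals.N0.w = 1234                     # pretend word id for the focus token

    context = [0] * N_FIELDS
    fill_flat(context, FIELD_IDS, vals)
    assert context[FIELD_IDS.N0.w] == 1234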

View File

@@ -42,6 +42,7 @@ cdef class Language:
     cpdef readonly Lexicon lexicon
     cpdef readonly Tagger pos_tagger
+    cpdef readonly Tagger ner_tagger

     cdef object _prefix_re
     cdef object _suffix_re

View File

@@ -45,6 +45,8 @@ cdef class Language:
             self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
         else:
             self.pos_tagger = None
+        if path.exists(path.join(util.DATA_DIR, name, 'ner')):
+            self.ner_tagger = Tagger(path.join(util.DATA_DIR, name, 'ner'))

     cpdef Tokens tokenize(self, unicode string):
         """Tokenize a string.

spacy/ner_feats.pxd Normal file
View File

spacy/ner_feats.pyx Normal file
View File

@@ -0,0 +1,35 @@
+from spacy.context cimport FIELD_IDS, Token
+
+cdef Token P2 = FIELD_IDS.P2
+cdef Token P1 = FIELD_IDS.P1
+cdef Token N0 = FIELD_IDS.N0
+cdef Token N1 = FIELD_IDS.N1
+cdef Token N2 = FIELD_IDS.N2
+
+TEMPLATES = (
+    (N0.i,),
+    (N0.c,),
+    (P1.pos,),
+    (P1.i,),
+    (N1.w,),
+    (N1.pos,),
+    (P1.ner,),
+    (P2.ner,),
+    (N0.c,),
+    (P1.c,),
+    (N1.c,),
+    (N0.is_alpha,),
+    (N0.is_digit,),
+    (N0.is_title,),
+    (N0.is_upper,),
+    (N0.is_title, N0.oft_title),
+    (N0.is_upper, N0.oft_upper),
+)
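Each template conjoins a few context fields; extraction reads those positions out of the flat array and hashes the tuple of values into a feature id. A rough, self-contained Python analogue of what the Extractor does with ConjFeat-style templates (not thinc's actual API):

    def extract(context, templates):
        # One hashed feature per template, keyed on the tuple of context
        # values the template conjoins; all-zero conjunctions are skipped.
        feats = []
        for template in templates:
            values = tuple(context[idx] for idx in template)
            if any(values):
                feats.append(hash((template, values)))
        return feats

    print(extract([5, 0, 7], [(0,), (1,), (0, 2)]))   # (1,) is skipped: value is 0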

View File

@@ -1,229 +0,0 @@
-# cython: profile=True
-from os import path
-import os
-import shutil
-import ujson
-import random
-import codecs
-import gzip
-
-import cython
-
-from libc.stdint cimport uint32_t
-
-from thinc.weights cimport arg_max
-from thinc.features import NonZeroConjFeat
-from thinc.features import ConjFeat
-
-from .lexeme cimport *
-from .lang cimport Lexicon
-
-
-NULL_TAG = 0
-
-
-cdef class Tagger:
-    tags = {'NULL': NULL_TAG}
-    def __init__(self, model_dir):
-        self.mem = Pool()
-        tags_loc = path.join(model_dir, 'postags.json')
-        if path.exists(tags_loc):
-            with open(tags_loc) as file_:
-                Tagger.tags.update(ujson.load(file_))
-        self.model = LinearModel(len(self.tags))
-        if path.exists(path.join(model_dir, 'model')):
-            self.model.load(path.join(model_dir, 'model'))
-        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
-        self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
-        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
-        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
-        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
-        self._guess = NULL_TAG
-
-    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
-        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
-                  tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
-        self.extractor.extract(self._feats, self._values, self._atoms, NULL)
-        self._guess = self.model.score(self._scores, self._feats, self._values)
-        return self._guess
-
-    cpdef bint tell_answer(self, class_t gold) except *:
-        cdef class_t guess = self._guess
-        if gold == guess or gold == NULL_TAG:
-            self.model.update({})
-            return 0
-        counts = {guess: {}, gold: {}}
-        self.extractor.count(counts[gold], self._feats, 1)
-        self.extractor.count(counts[guess], self._feats, -1)
-        self.model.update(counts)
-
-    @classmethod
-    def encode_pos(cls, tag):
-        if tag not in cls.tags:
-            cls.tags[tag] = len(cls.tags)
-        return cls.tags[tag]
-
-
-@cython.boundscheck(False)
-def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
-    cdef class_t prev_prev, prev, tag
-    prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
-    cdef int i
-    cdef id_t token
-    for i in range(tokens.length):
-        tag = tagger.predict(i, tokens, prev, prev_prev)
-        prev_prev = prev
-        prev = tag
-        token = tokens.lex[i].id
-        if token < tag_counts.shape[0]:
-            tag_counts[token, tag] += 1
-
-
-cpdef enum:
-    P2i
-    P2c
-    P2w
-    P2shape
-    P2pref
-    P2suff
-    P2title
-    P2upper
-    P2oft_title
-    P2oft_upper
-    P2pos
-    P2url
-    P2num
-    P1i
-    P1c
-    P1w
-    P1shape
-    P1pre
-    P1suff
-    P1title
-    P1upper
-    P1oft_title
-    P1oft_upper
-    P1pos
-    P1url
-    P1num
-    N0i
-    N0c
-    N0w
-    N0shape
-    N0pref
-    N0suff
-    N0title
-    N0upper
-    N0oft_title
-    N0oft_upper
-    N0pos
-    N0url
-    N0num
-    N1i
-    N1c
-    N1w
-    N1shape
-    N1pref
-    N1suff
-    N1title
-    N1upper
-    N1oft_title
-    N1oft_upper
-    N1pos
-    N1url
-    N1num
-    N2i
-    N2c
-    N2w
-    N2shape
-    N2pref
-    N2suff
-    N2title
-    N2upper
-    N2oft_title
-    N2oft_upper
-    N2pos
-    N2url
-    N2num
-    P2t
-    P1t
-    CONTEXT_SIZE
-
-
-cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
-                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
-    _fill_token(&atoms[P2i], p2)
-    _fill_token(&atoms[P1i], p1)
-    _fill_token(&atoms[N0i], n0)
-    _fill_token(&atoms[N1i], n1)
-    _fill_token(&atoms[N2i], n2)
-    atoms[P1t] = prev_tag
-    atoms[P2t] = prev_prev_tag
-
-
-cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
-    atoms[0] = lex.sic
-    atoms[1] = lex.cluster
-    atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
-    atoms[3] = lex.shape
-    atoms[4] = lex.prefix
-    atoms[5] = lex.suffix
-    atoms[6] = lex.flags & (1 << IS_TITLE)
-    atoms[7] = lex.flags & (1 << IS_UPPER)
-    atoms[8] = lex.flags & (1 << OFT_TITLE)
-    atoms[9] = lex.flags & (1 << OFT_UPPER)
-    atoms[10] = lex.postype
-    atoms[11] = lex.flags & (1 << LIKE_URL)
-    atoms[12] = lex.flags & (1 << LIKE_NUMBER)
-
-
-TEMPLATES = (
-    (N0i,),
-    (N0w,),
-    (N0suff,),
-    (N0pref,),
-    (P1t,),
-    (P2t,),
-    (P1t, P2t),
-    (P1t, N0w),
-    (P1w,),
-    (P1suff,),
-    (P2w,),
-    (N1w,),
-    (N1suff,),
-    (N2w,),
-    (N0shape,),
-    (N0c,),
-    (N1c,),
-    (N2c,),
-    (P1c,),
-    (P2c,),
-    (P1c, N0c),
-    (N0c, N1c),
-    (P1c, P1t),
-    (P1c, P1t, N0c),
-    (P1t, N0c),
-    (N0oft_upper,),
-    (N0oft_title,),
-    (P1w, N0w),
-    (N0w, N1w),
-    (N0pos,),
-    (P1t, N0pos, N1pos),
-    (P1t, N1pos),
-    (N0url,),
-    (N0num,),
-    (P1url,),
-    (P1url,),
-    (N1num,),
-    (N1url,),
-)

View File

@@ -1,83 +0,0 @@
-from .tokens cimport Tokens
-from thinc.typedefs cimport atom_t
-
-
-cpdef enum:
-    P2i
-    P2c
-    P2w
-    P2shape
-    P2pref
-    P2suff
-    P2title
-    P2upper
-    P2oft_title
-    P2oft_upper
-    P2pos
-    P2url
-    P2num
-    P1i
-    P1c
-    P1w
-    P1shape
-    P1pre
-    P1suff
-    P1title
-    P1upper
-    P1oft_title
-    P1oft_upper
-    P1pos
-    P1url
-    P1num
-    N0i
-    N0c
-    N0w
-    N0shape
-    N0pref
-    N0suff
-    N0title
-    N0upper
-    N0oft_title
-    N0oft_upper
-    N0pos
-    N0url
-    N0num
-    N1i
-    N1c
-    N1w
-    N1shape
-    N1pref
-    N1suff
-    N1title
-    N1upper
-    N1oft_title
-    N1oft_upper
-    N1pos
-    N1url
-    N1num
-    N2i
-    N2c
-    N2w
-    N2shape
-    N2pref
-    N2suff
-    N2title
-    N2upper
-    N2oft_title
-    N2oft_upper
-    N2pos
-    N2url
-    N2num
-    P2t
-    P1t
-    CONTEXT_SIZE
-
-
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1

View File

@@ -1,77 +1,41 @@
-from .lexeme cimport *
-from thinc.typedefs cimport atom_t
+from spacy.context cimport FIELD_IDS, Token
+
+cpdef Token P2 = FIELD_IDS.P2
+cpdef Token P1 = FIELD_IDS.P1
+cpdef Token N0 = FIELD_IDS.N0
+cpdef Token N1 = FIELD_IDS.N1
+cpdef Token N2 = FIELD_IDS.N2

 TEMPLATES = (
-    (N0i,),
-    (N0w,),
-    (N0suff,),
-    (N0pref,),
-    (P1t,),
-    (P2t,),
-    (P1t, P2t),
-    (P1t, N0w),
-    (P1w,),
-    (P1suff,),
-    (P2w,),
-    (N1w,),
-    (N1suff,),
-    (N2w,),
-    (N0shape,),
-    (N0c,),
-    (N1c,),
-    (N2c,),
-    (P1c,),
-    (P2c,),
-    (P1c, N0c),
-    (N0c, N1c),
-    (P1c, P1t),
-    (P1c, P1t, N0c),
-    (P1t, N0c),
-    (N0oft_upper,),
-    (N0oft_title,),
-    (P1w, N0w),
-    (N0w, N1w),
-    (N0pos,),
-    (P1t, N0pos, N1pos),
-    (P1t, N1pos),
-    (N0url,),
-    (N0num,),
-    (P1url,),
-    (P1url,),
-    (N1num,),
-    (N1url,),
+    (N0.i,),
+    (N0.w,),
+    (N0.suff,),
+    (N0.pref,),
+    (P1.pos,),
+    (P2.pos,),
+    (P1.pos, P2.pos),
+    (P1.pos, N0.w),
+    (P1.w,),
+    (P1.suff,),
+    (P2.w,),
+    (N1.w,),
+    (N1.suff,),
+    (N2.w,),
+    (N0.shape,),
+    (N0.c,),
+    (N1.c,),
+    (N2.c,),
+    (P1.c,),
+    (P2.c,),
+    (N0.oft_upper,),
+    (N0.oft_title,),
+    (N0.postype,),
+    (P1.url,),
+    (N1.num,),
+    (N1.url,),
 )
-
-
-cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
-    _fill_token(&context[P2i], tokens.lex[i-2])
-    _fill_token(&context[P1i], tokens.lex[i-1])
-    _fill_token(&context[N0i], tokens.lex[i])
-    _fill_token(&context[N1i], tokens.lex[i+1])
-    _fill_token(&context[N2i], tokens.lex[i+2])
-    context[P1t] = tokens.pos[i-1]
-    context[P2t] = tokens.pos[i-2]
-
-
-cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
-    atoms[0] = lex.sic
-    atoms[1] = lex.cluster
-    atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
-    atoms[3] = lex.shape
-    atoms[4] = lex.prefix
-    atoms[5] = lex.suffix
-    atoms[6] = lex.flags & (1 << IS_TITLE)
-    atoms[7] = lex.flags & (1 << IS_UPPER)
-    atoms[8] = lex.flags & (1 << OFT_TITLE)
-    atoms[9] = lex.flags & (1 << OFT_UPPER)
-    atoms[10] = lex.postype
-    atoms[11] = lex.flags & (1 << LIKE_URL)
-    atoms[12] = lex.flags & (1 << LIKE_NUMBER)

View File

@@ -3,10 +3,8 @@ from . import util
 from . import tokens
 from .en import EN
-from .pos import Tagger


-def read_gold(file_, tag_list):
+def read_gold(file_, tag_list, col):
     paras = file_.read().strip().split('\n\n')
     golds = []
     tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
@@ -21,7 +19,7 @@ def read_gold(file_, tag_list):
     conll_toks = []
     for line in lines:
         pieces = line.split()
-        conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3]))
+        conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
     for i, token in enumerate(tokens):
         if not conll_toks:
             tags.append('NULL')

View File

@@ -4,6 +4,8 @@ from thinc.learner cimport LinearModel
 from thinc.features cimport Extractor
 from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t

+from .typedefs cimport hash_t
+from .context cimport Slots
 from .tokens cimport Tokens
@@ -26,7 +28,8 @@ cdef class Tagger:
     cpdef readonly list tag_names
     cdef class_t _guess
-    cdef atom_t* _context
+    cdef atom_t* _context_flat
+    cdef Slots _context_slots
     cdef feat_t* _feats
     cdef weight_t* _values
     cdef weight_t* _scores

View File

@@ -10,8 +10,9 @@ import random
 import json

 import cython

-from .pos_feats cimport fill_context as pos_fill_context
-from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
+from .context cimport fill_slots
+from .context cimport fill_flat
+from .context cimport N_FIELDS

 from thinc.features cimport ConjFeat
@@ -46,6 +47,7 @@ def train(train_sents, model_dir, nr_iter=5):
                 if gold != NULL_TAG:
                     total += 1
                     n_corr += guess == gold
+                    #print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
         print('%.4f' % ((n_corr / total) * 100))
         random.shuffle(train_sents)
     tagger.model.end_training()
@@ -76,15 +78,12 @@ cdef class Tagger:
         self.tag_names = cfg['tag_names']
         self.tag_type = cfg['tag_type']
         self.extractor = Extractor(templates, [ConjFeat] * len(templates))
-        self.model = LinearModel(len(self.tag_names), self.extractor.n)
-        print("Load tagger model")
+        self.model = LinearModel(len(self.tag_names))
         if path.exists(path.join(model_dir, 'model')):
             self.model.load(path.join(model_dir, 'model'))
-        print("Done")
-        if self.tag_type == POS:
-            n_context = POS_CONTEXT_SIZE
-        self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
+        self._context_flat = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
+        self._context_slots = Slots()
         self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
         self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
         self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
@@ -110,11 +109,9 @@ cdef class Tagger:
         >>> tag = EN.pos_tagger.predict(0, tokens)
         >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
         """
-        if self.tag_type == POS:
-            pos_fill_context(self._context, i, tokens)
-        else:
-            raise StandardError
-        self.extractor.extract(self._feats, self._values, self._context, NULL)
+        cdef hash_t hashed = fill_slots(self._context_slots, i, tokens)
+        fill_flat(self._context_flat, self._context_slots)
+        self.extractor.extract(self._feats, self._values, self._context_flat, NULL)
         self._guess = self.model.score(self._scores, self._feats, self._values)
         return self._guess
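The predict path is now uniform for any tag type: fill the slots, flatten, extract, score. A self-contained Python toy of that data flow (all names hypothetical; the real model is thinc's LinearModel):

    def predict(weights, context, templates, n_classes):
        # 1. Features: hash each template's value-tuple out of the flat context.
        feats = [hash((t, tuple(context[i] for i in t))) for t in templates]
        # 2. Score: sum per-class weights of the active features, take the argmax.
        scores = [sum(weights.get((f, c), 0.0) for f in feats)
                  for c in range(n_classes)]
        return max(range(n_classes), key=scores.__getitem__)

    # Toy usage: two classes, one template over context positions 0 and 1.
    f = hash(((0, 1), (7, 9)))
    print(predict({(f, 1): 2.0}, [7, 9], [(0, 1)], 2))   # -> 1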

View File

@@ -15,9 +15,11 @@ cdef class Tokens:
     cdef Lexeme** _lex_ptr
     cdef int* _idx_ptr
     cdef int* _pos_ptr
+    cdef int* _ner_ptr

     cdef Lexeme** lex
     cdef int* idx
     cdef int* pos
+    cdef int* ner
     cdef int length
     cdef int max_length
@@ -32,6 +34,7 @@ cdef class Token:
     cdef public int i
     cdef public int idx
     cdef public int pos
+    cdef public int ner
     cdef public atom_t id
     cdef public atom_t cluster

View File

@@ -1,6 +1,7 @@
 # cython: profile=True
 from .lexeme cimport *
 cimport cython
+from .tagger cimport POS, ENTITY

 DEF PADDING = 5
@@ -44,21 +45,25 @@ cdef class Tokens:
         self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
         self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
         self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
+        self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
         self.lex = self._lex_ptr
         self.idx = self._idx_ptr
         self.pos = self._pos_ptr
+        self.ner = self._ner_ptr
         cdef int i
         for i in range(size + (PADDING*2)):
             self.lex[i] = &EMPTY_LEXEME
         self.lex += PADDING
         self.idx += PADDING
         self.pos += PADDING
+        self.ner += PADDING
         self.max_length = size
         self.length = 0

     def __getitem__(self, i):
         bounds_check(i, self.length, PADDING)
-        return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
+        return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
+                     self.lex[i][0])

     def __iter__(self):
         for i in range(self.length):
@@ -73,6 +78,7 @@ cdef class Tokens:
         self.lex[self.length] = lexeme
         self.idx[self.length] = idx
         self.pos[self.length] = 0
+        self.ner[self.length] = 0
         self.length += 1
         return idx + lexeme.length
@@ -91,7 +97,10 @@ cdef class Tokens:
         return idx

     cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
+        if tag_type == POS:
             self.pos[i] = tag
+        elif tag_type == ENTITY:
+            self.ner[i] = tag

     def _realloc(self, new_size):
         self.max_length = new_size
@@ -99,19 +108,23 @@ cdef class Tokens:
         self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
         self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
         self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
+        self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
         self.lex = self._lex_ptr + PADDING
         self.idx = self._idx_ptr + PADDING
         self.pos = self._pos_ptr + PADDING
+        self.ner = self._ner_ptr + PADDING
         for i in range(self.length, self.max_length + PADDING):
             self.lex[i] = &EMPTY_LEXEME


 @cython.freelist(64)
 cdef class Token:
-    def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
+    def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
+                 dict lex):
         self._string_store = string_store
         self.idx = idx
         self.pos = pos
+        self.ner = ner
         self.i = i
         self.id = lex['id']
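Tokens now keeps a ner array parallel to pos, and set_tag dispatches on the tag type. A quick Python sketch of that dispatch; the POS/ENTITY values here are assumptions, the real ones come from .tagger:

    POS, ENTITY = 0, 1     # assumed TagType values, for illustration only

    class Tokens:
        def __init__(self, size):
            self.pos = [0] * size    # parallel per-token tag arrays
            self.ner = [0] * size

        def set_tag(self, i, tag_type, tag):
            # Route the tag to the array matching its type.
            if tag_type == POS:
                self.pos[i] = tag
            elif tag_type == ENTITY:
                self.ner[i] = tag

    toks = Tokens(4)
    toks.set_tag(2, ENTITY, 7)
    assert toks.ner == [0, 0, 7, 0] and toks.pos == [0, 0, 0, 0]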