* Complete refactor of Tagger features, to use a generic list of context names.

Matthew Honnibal 2014-11-05 20:45:29 +11:00
parent 0a8c84625d
commit 4ecbe8c893
14 changed files with 166 additions and 450 deletions
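
In short: the per-tagger cpdef enums of context indices (P2i, P1c, and so on, visible in the deleted files below) are replaced by generic Slots/Token classes whose fields are numbered once at import time, so feature templates can name attributes like N0.w instead of enum members. A minimal pure-Python sketch of that scheme, with field names taken from the diff and everything else simplified:

    FIELDS = ('i', 'c', 'w', 'shape', 'pref', 'suff', 'oft_title', 'oft_upper',
              'is_alpha', 'is_digit', 'is_title', 'is_upper', 'url', 'num',
              'postype', 'pos', 'ner')

    class Token:
        # One slot of the window; holds one atom value per field. The same
        # class doubles as a record of field positions once numbered below.
        def __init__(self):
            for name in FIELDS:
                setattr(self, name, 0)

    class Slots:
        # The five-token window P2, P1, N0, N1, N2 shared by the taggers.
        def __init__(self):
            self.P2, self.P1, self.N0, self.N1, self.N2 = (
                Token() for _ in range(5))

    def number_token(t, n):
        # Give every field of this slot the next free index in the flat context.
        for name in FIELDS:
            setattr(t, name, n)
            n += 1
        return n

    FIELD_IDS = Slots()
    N_FIELDS = 0
    for slot in (FIELD_IDS.P2, FIELD_IDS.P1, FIELD_IDS.N0,
                 FIELD_IDS.N1, FIELD_IDS.N2):
        N_FIELDS = number_token(slot, N_FIELDS)

    # Templates can now name attributes instead of per-tagger enum members:
    TEMPLATES = ((FIELD_IDS.N0.w,), (FIELD_IDS.P1.pos, FIELD_IDS.N0.w))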

View File

@@ -4,40 +4,42 @@ from .tokens cimport Tokens
from .lexeme cimport Lexeme
cdef struct Token:
atom_t i
atom_t c
atom_t w
atom_t shape
atom_t pref
atom_t suff
atom_t oft_title
atom_t oft_upper
atom_t is_alpha
atom_t is_digit
atom_t is_title
atom_t is_upper
cdef class Token:
cdef readonly atom_t i
cdef readonly atom_t c
cdef readonly atom_t w
cdef readonly atom_t shape
cdef readonly atom_t pref
cdef readonly atom_t suff
cdef readonly atom_t oft_title
cdef readonly atom_t oft_upper
cdef readonly atom_t is_alpha
cdef readonly atom_t is_digit
cdef readonly atom_t is_title
cdef readonly atom_t is_upper
atom_t url
atom_t num
cdef readonly atom_t url
cdef readonly atom_t num
atom_t postype
atom_t pos
atom_t ner
cdef readonly atom_t postype
cdef readonly atom_t pos
cdef readonly atom_t ner
cdef struct Slots:
Token P2
Token P1
Token N0
Token N1
Token N2
cdef class Slots:
cdef readonly Token P2
cdef readonly Token P1
cdef readonly Token N0
cdef readonly Token N1
cdef readonly Token N2
cdef Slots FIELD_IDS
cdef int N_FIELDS
cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0
cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0
cdef int fill_flat(atom_t* context, Slots* s) except -1
cdef int fill_flat(atom_t* context, Slots s) except -1
cpdef Slots FIELD_IDS

View File

@@ -2,7 +2,16 @@ from murmurhash.mrmr cimport hash64
from .lexeme cimport *
cdef void _number_token(Token* t, int* n_fields):
cdef class Slots:
def __init__(self):
self.P2 = Token()
self.P1 = Token()
self.N0 = Token()
self.N1 = Token()
self.N2 = Token()
cdef void _number_token(Token t, int* n_fields):
cdef int i = n_fields[0]
t.i = i; i += 1
t.c = i; i += 1
@@ -27,7 +36,7 @@ cdef void _number_token(Token* t, int* n_fields):
n_fields[0] = i
cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
cdef int fill_token(Token t, Lexeme* lex, atom_t pos, atom_t ner):
t.i = lex.sic
t.c = lex.cluster
t.w = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
@@ -48,7 +57,7 @@ cdef int fill_token(Token* t, Lexeme* lex, atom_t pos, atom_t ner):
t.ner = ner
cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
cdef int _flatten_token(atom_t* context, Token ids, Token vals) except -1:
context[ids.i] = vals.i
context[ids.c] = vals.c
context[ids.w] = vals.w
@@ -68,26 +77,27 @@ cdef int _flatten_token(atom_t* context, Token* ids, Token* vals) except -1:
context[ids.ner] = vals.ner
cdef hash_t fill_slots(Slots* s, int i, Tokens tokens) except 0:
fill_token(&s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
fill_token(&s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
fill_token(&s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
fill_token(&s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
fill_token(&s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
return hash64(s, sizeof(Slots), 0)
cdef hash_t fill_slots(Slots s, int i, Tokens tokens) except 0:
fill_token(s.P2, tokens.lex[i-2], tokens.pos[i-2], tokens.ner[i-2])
fill_token(s.P1, tokens.lex[i-1], tokens.pos[i-1], tokens.ner[i-1])
fill_token(s.N0, tokens.lex[i], tokens.pos[i], tokens.ner[i])
fill_token(s.N1, tokens.lex[i+1], tokens.pos[i+1], tokens.ner[i+1])
fill_token(s.N2, tokens.lex[i+2], tokens.pos[i+2], tokens.ner[i+2])
return 1
cdef int fill_flat(atom_t* context, Slots* s) except -1:
_flatten_token(context, &FIELD_IDS.P2, &s.P2)
_flatten_token(context, &FIELD_IDS.P1, &s.P1)
_flatten_token(context, &FIELD_IDS.N0, &s.N0)
_flatten_token(context, &FIELD_IDS.N1, &s.N1)
_flatten_token(context, &FIELD_IDS.N2, &s.N2)
cdef int fill_flat(atom_t* context, Slots s) except -1:
_flatten_token(context, FIELD_IDS.P2, s.P2)
_flatten_token(context, FIELD_IDS.P1, s.P1)
_flatten_token(context, FIELD_IDS.N0, s.N0)
_flatten_token(context, FIELD_IDS.N1, s.N1)
_flatten_token(context, FIELD_IDS.N2, s.N2)
N_FIELDS = 0
_number_token(&FIELD_IDS.P2, &N_FIELDS)
_number_token(&FIELD_IDS.P1, &N_FIELDS)
_number_token(&FIELD_IDS.N0, &N_FIELDS)
_number_token(&FIELD_IDS.N1, &N_FIELDS)
_number_token(&FIELD_IDS.N2, &N_FIELDS)
FIELD_IDS = Slots()
_number_token(FIELD_IDS.P2, &N_FIELDS)
_number_token(FIELD_IDS.P1, &N_FIELDS)
_number_token(FIELD_IDS.N0, &N_FIELDS)
_number_token(FIELD_IDS.N1, &N_FIELDS)
_number_token(FIELD_IDS.N2, &N_FIELDS)
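
fill_flat then scatters the filled window into the flat atom array at those pre-assigned positions. Continuing the pure-Python sketch above (flatten_token is a simplified stand-in for _flatten_token):

    def flatten_token(context, ids, vals):
        # Write one slot's values at the positions numbering assigned to it.
        for name in FIELDS:
            context[getattr(ids, name)] = getattr(vals, name)

    def fill_flat(context, s):
        for ids, vals in ((FIELD_IDS.P2, s.P2), (FIELD_IDS.P1, s.P1),
                          (FIELD_IDS.N0, s.N0), (FIELD_IDS.N1, s.N1),
                          (FIELD_IDS.N2, s.N2)):
            flatten_token(context, ids, vals)

    context = [0] * N_FIELDS
    fill_flat(context, Slots())   # a fresh window: every atom is still zero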

View File

@@ -42,6 +42,7 @@ cdef class Language:
cpdef readonly Lexicon lexicon
cpdef readonly Tagger pos_tagger
cpdef readonly Tagger ner_tagger
cdef object _prefix_re
cdef object _suffix_re

View File

@@ -45,6 +45,8 @@ cdef class Language:
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
else:
self.pos_tagger = None
if path.exists(path.join(util.DATA_DIR, name, 'ner')):
self.ner_tagger = Tagger(path.join(util.DATA_DIR, name, 'ner'))
cpdef Tokens tokenize(self, unicode string):
"""Tokenize a string.

spacy/ner_feats.pxd (new file, 0 lines)
View File

spacy/ner_feats.pyx (new file, 35 lines)
View File

@@ -0,0 +1,35 @@
from spacy.context cimport FIELD_IDS, Token
cdef Token P2 = FIELD_IDS.P2
cdef Token P1 = FIELD_IDS.P1
cdef Token N0 = FIELD_IDS.N0
cdef Token N1 = FIELD_IDS.N1
cdef Token N2 = FIELD_IDS.N2
TEMPLATES = (
(N0.i,),
(N0.c,),
(P1.pos,),
(P1.i,),
(N1.w,),
(N1.pos,),
(P1.ner,),
(P2.ner,),
(N0.c,),
(P1.c,),
(N1.c,),
(N0.is_alpha,),
(N0.is_digit,),
(N0.is_title,),
(N0.is_upper,),
(N0.is_title, N0.oft_title),
(N0.is_upper, N0.oft_upper),
)
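
Each template is a tuple of flat-context indices. In the diff the actual extraction is done by thinc's Extractor with ConjFeat (and NonZeroConjFeat), which conjoins the atom values at those indices into one feature. A hypothetical stand-in for that step, only to show the shape of the data:

    def extract(context, templates):
        feats = []
        for tmpl_id, tmpl in enumerate(templates):
            values = tuple(context[idx] for idx in tmpl)
            if any(values):                       # skip all-zero conjunctions
                feats.append(hash((tmpl_id,) + values))
        return feats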

View File

@@ -1,229 +0,0 @@
# cython: profile=True
from os import path
import os
import shutil
import ujson
import random
import codecs
import gzip
import cython
from libc.stdint cimport uint32_t
from thinc.weights cimport arg_max
from thinc.features import NonZeroConjFeat
from thinc.features import ConjFeat
from .lexeme cimport *
from .lang cimport Lexicon
NULL_TAG = 0
cdef class Tagger:
tags = {'NULL': NULL_TAG}
def __init__(self, model_dir):
self.mem = Pool()
tags_loc = path.join(model_dir, 'postags.json')
if path.exists(tags_loc):
with open(tags_loc) as file_:
Tagger.tags.update(ujson.load(file_))
self.model = LinearModel(len(self.tags))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
self._guess = NULL_TAG
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
self.extractor.extract(self._feats, self._values, self._atoms, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess
cpdef bint tell_answer(self, class_t gold) except *:
cdef class_t guess = self._guess
if gold == guess or gold == NULL_TAG:
self.model.update({})
return 0
counts = {guess: {}, gold: {}}
self.extractor.count(counts[gold], self._feats, 1)
self.extractor.count(counts[guess], self._feats, -1)
self.model.update(counts)
@classmethod
def encode_pos(cls, tag):
if tag not in cls.tags:
cls.tags[tag] = len(cls.tags)
return cls.tags[tag]
@cython.boundscheck(False)
def count_tags(Tagger tagger, Tokens tokens, uint32_t[:, :] tag_counts):
cdef class_t prev_prev, prev, tag
prev = tagger.tags['EOL']; prev_prev = tagger.tags['EOL']
cdef int i
cdef id_t token
for i in range(tokens.length):
tag = tagger.predict(i, tokens, prev, prev_prev)
prev_prev = prev
prev = tag
token = tokens.lex[i].id
if token < tag_counts.shape[0]:
tag_counts[token, tag] += 1
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
P2t
P1t
CONTEXT_SIZE
cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
_fill_token(&atoms[P2i], p2)
_fill_token(&atoms[P1i], p1)
_fill_token(&atoms[N0i], n0)
_fill_token(&atoms[N1i], n1)
_fill_token(&atoms[N2i], n2)
atoms[P1t] = prev_tag
atoms[P2t] = prev_prev_tag
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.sic
atoms[1] = lex.cluster
atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.flags & (1 << IS_TITLE)
atoms[7] = lex.flags & (1 << IS_UPPER)
atoms[8] = lex.flags & (1 << OFT_TITLE)
atoms[9] = lex.flags & (1 << OFT_UPPER)
atoms[10] = lex.postype
atoms[11] = lex.flags & (1 << LIKE_URL)
atoms[12] = lex.flags & (1 << LIKE_NUMBER)
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
(P1url,),
(N1num,),
(N1url,),
)
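
The deleted pos.pyx also records the learning loop's shape: on an error, tell_answer counts the extracted features once with weight +1 for the gold class and once with -1 for the guess, and hands both to the model. A schematic perceptron-style update under an assumed dict-of-dicts weight store (the diff delegates this to thinc's LinearModel):

    def tell_answer(weights, feats, guess, gold):
        if gold == guess:
            return                               # no update on a correct guess
        for f in feats:
            weights.setdefault(gold, {}).setdefault(f, 0.0)
            weights.setdefault(guess, {}).setdefault(f, 0.0)
            weights[gold][f] += 1.0              # promote the gold class
            weights[guess][f] -= 1.0             # demote the wrong guess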

View File

@@ -1,83 +0,0 @@
from .tokens cimport Tokens
from thinc.typedefs cimport atom_t
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
P2t
P1t
CONTEXT_SIZE
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1

View File

@@ -1,77 +1,41 @@
from .lexeme cimport *
from spacy.context cimport FIELD_IDS, Token
from thinc.typedefs cimport atom_t
cpdef Token P2 = FIELD_IDS.P2
cpdef Token P1 = FIELD_IDS.P1
cpdef Token N0 = FIELD_IDS.N0
cpdef Token N1 = FIELD_IDS.N1
cpdef Token N2 = FIELD_IDS.N2
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0.i,),
(N0.w,),
(N0.suff,),
(N0.pref,),
(P1.pos,),
(P2.pos,),
(P1.pos, P2.pos),
(P1.pos, N0.w),
(P1.w,),
(P1.suff,),
(P2.w,),
(N1.w,),
(N1.suff,),
(N2.w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(N0.shape,),
(N0.c,),
(N1.c,),
(N2.c,),
(P1.c,),
(P2.c,),
(N0.oft_upper,),
(N0.oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0.postype,),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
(P1url,),
(N1num,),
(N1url,),
(P1.url,),
(N1.num,),
(N1.url,),
)
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
_fill_token(&context[P2i], tokens.lex[i-2])
_fill_token(&context[P1i], tokens.lex[i-1])
_fill_token(&context[N0i], tokens.lex[i])
_fill_token(&context[N1i], tokens.lex[i+1])
_fill_token(&context[N2i], tokens.lex[i+2])
context[P1t] = tokens.pos[i-1]
context[P2t] = tokens.pos[i-2]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.sic
atoms[1] = lex.cluster
atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.flags & (1 << IS_TITLE)
atoms[7] = lex.flags & (1 << IS_UPPER)
atoms[8] = lex.flags & (1 << OFT_TITLE)
atoms[9] = lex.flags & (1 << OFT_UPPER)
atoms[10] = lex.postype
atoms[11] = lex.flags & (1 << LIKE_URL)
atoms[12] = lex.flags & (1 << LIKE_NUMBER)
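
_fill_token reads the boolean lexeme properties out of a packed flags integer with single-bit masks, as in lex.flags & (1 << IS_TITLE). The same pattern in plain Python, with made-up bit positions for illustration:

    IS_TITLE, IS_UPPER, OFT_TITLE, OFT_UPPER, LIKE_URL, LIKE_NUMBER = range(6)

    def set_flag(flags, bit):
        return flags | (1 << bit)

    flags = set_flag(set_flag(0, IS_TITLE), LIKE_URL)
    assert flags & (1 << IS_TITLE)        # non-zero: the flag is set
    assert not flags & (1 << IS_UPPER)    # zero: the flag is clear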

View File

@@ -3,10 +3,8 @@ from . import util
from . import tokens
from .en import EN
from .pos import Tagger
def read_gold(file_, tag_list):
def read_gold(file_, tag_list, col):
paras = file_.read().strip().split('\n\n')
golds = []
tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
@@ -21,7 +19,7 @@ def read_gold(file_, tag_list):
conll_toks = []
for line in lines:
pieces = line.split()
conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[3]))
conll_toks.append((int(pieces[0]), len(pieces[1]), pieces[col]))
for i, token in enumerate(tokens):
if not conll_toks:
tags.append('NULL')

View File

@@ -4,6 +4,8 @@ from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .typedefs cimport hash_t
from .context cimport Slots
from .tokens cimport Tokens
@@ -26,7 +28,8 @@ cdef class Tagger:
cpdef readonly list tag_names
cdef class_t _guess
cdef atom_t* _context
cdef atom_t* _context_flat
cdef Slots _context_slots
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores

View File

@@ -10,8 +10,9 @@ import random
import json
import cython
from .pos_feats cimport fill_context as pos_fill_context
from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
from .context cimport fill_slots
from .context cimport fill_flat
from .context cimport N_FIELDS
from thinc.features cimport ConjFeat
@@ -46,6 +47,7 @@ def train(train_sents, model_dir, nr_iter=5):
if gold != NULL_TAG:
total += 1
n_corr += guess == gold
#print('%s\t%d\t%d' % (tokens[i].string, guess, gold))
print('%.4f' % ((n_corr / total) * 100))
random.shuffle(train_sents)
tagger.model.end_training()
@@ -76,15 +78,12 @@ cdef class Tagger:
self.tag_names = cfg['tag_names']
self.tag_type = cfg['tag_type']
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.model = LinearModel(len(self.tag_names), self.extractor.n)
print("Load tagger model")
self.model = LinearModel(len(self.tag_names))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
print("Done")
if self.tag_type == POS:
n_context = POS_CONTEXT_SIZE
self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
self._context_flat = <atom_t*>self.mem.alloc(N_FIELDS, sizeof(atom_t))
self._context_slots = Slots()
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
@@ -110,11 +109,9 @@ cdef class Tagger:
>>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
"""
if self.tag_type == POS:
pos_fill_context(self._context, i, tokens)
else:
raise StandardError
self.extractor.extract(self._feats, self._values, self._context, NULL)
cdef hash_t hashed = fill_slots(self._context_slots, i, tokens)
fill_flat(self._context_flat, self._context_slots)
self.extractor.extract(self._feats, self._values, self._context_flat, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess
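
With the refactor, predict no longer branches on tag type: it fills the Slots window, flattens it into the atom array, extracts template features, and scores. Roughly, continuing the earlier sketches (tokens here is assumed to be a padded list of per-token dicts, and model_score stands in for LinearModel.score):

    def fill_slots(s, i, tokens):
        # tokens is assumed padded so that i-2 .. i+2 are always valid
        # (see the padding in the tokens.pyx diff below).
        for slot, j in ((s.P2, i - 2), (s.P1, i - 1), (s.N0, i),
                        (s.N1, i + 1), (s.N2, i + 2)):
            for name in FIELDS:
                setattr(slot, name, tokens[j].get(name, 0))

    def predict(i, tokens, slots, context, templates, model_score):
        fill_slots(slots, i, tokens)          # window around token i -> Slots
        fill_flat(context, slots)             # Slots -> flat atom array
        feats = extract(context, templates)   # templates -> hashed features
        return model_score(feats)             # stand-in for LinearModel.score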

View File

@@ -15,9 +15,11 @@ cdef class Tokens:
cdef Lexeme** _lex_ptr
cdef int* _idx_ptr
cdef int* _pos_ptr
cdef int* _ner_ptr
cdef Lexeme** lex
cdef int* idx
cdef int* pos
cdef int* ner
cdef int length
cdef int max_length
@@ -32,6 +34,7 @@ cdef class Token:
cdef public int i
cdef public int idx
cdef public int pos
cdef public int ner
cdef public atom_t id
cdef public atom_t cluster

View File

@@ -1,6 +1,7 @@
# cython: profile=True
from .lexeme cimport *
cimport cython
from .tagger cimport POS, ENTITY
DEF PADDING = 5
@@ -44,21 +45,25 @@ cdef class Tokens:
self._lex_ptr = <Lexeme**>self.mem.alloc(size + (PADDING*2), sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._pos_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self._ner_ptr = <int*>self.mem.alloc(size + (PADDING*2), sizeof(int))
self.lex = self._lex_ptr
self.idx = self._idx_ptr
self.pos = self._pos_ptr
self.ner = self._ner_ptr
cdef int i
for i in range(size + (PADDING*2)):
self.lex[i] = &EMPTY_LEXEME
self.lex += PADDING
self.idx += PADDING
self.pos += PADDING
self.ner += PADDING
self.max_length = size
self.length = 0
def __getitem__(self, i):
bounds_check(i, self.length, PADDING)
return Token(self._string_store, i, self.idx[i], self.pos[i], self.lex[i][0])
return Token(self._string_store, i, self.idx[i], self.pos[i], self.ner[i],
self.lex[i][0])
def __iter__(self):
for i in range(self.length):
@@ -73,6 +78,7 @@ cdef class Tokens:
self.lex[self.length] = lexeme
self.idx[self.length] = idx
self.pos[self.length] = 0
self.ner[self.length] = 0
self.length += 1
return idx + lexeme.length
@@ -91,7 +97,10 @@ cdef class Tokens:
return idx
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
self.pos[i] = tag
if tag_type == POS:
self.pos[i] = tag
elif tag_type == ENTITY:
self.ner[i] = tag
def _realloc(self, new_size):
self.max_length = new_size
@@ -99,19 +108,23 @@ cdef class Tokens:
self._lex_ptr = <Lexeme**>self.mem.realloc(self._lex_ptr, n * sizeof(Lexeme*))
self._idx_ptr = <int*>self.mem.realloc(self._idx_ptr, n * sizeof(int))
self._pos_ptr = <int*>self.mem.realloc(self._pos_ptr, n * sizeof(int))
self._ner_ptr = <int*>self.mem.realloc(self._ner_ptr, n * sizeof(int))
self.lex = self._lex_ptr + PADDING
self.idx = self._idx_ptr + PADDING
self.pos = self._pos_ptr + PADDING
self.ner = self._ner_ptr + PADDING
for i in range(self.length, self.max_length + PADDING):
self.lex[i] = &EMPTY_LEXEME
@cython.freelist(64)
cdef class Token:
def __init__(self, StringStore string_store, int i, int idx, int pos, dict lex):
def __init__(self, StringStore string_store, int i, int idx, int pos, int ner,
dict lex):
self._string_store = string_store
self.idx = idx
self.pos = pos
self.ner = ner
self.i = i
self.id = lex['id']
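
Throughout tokens.pyx, the parallel arrays (lex, idx, pos, and now ner) are allocated with PADDING extra slots on each side, and the public pointers are offset by PADDING, so windows like lex[i-2]..lex[i+2] stay in bounds at sentence edges. The same trick in plain Python:

    PADDING = 5
    EMPTY = 0                  # stands in for &EMPTY_LEXEME

    def make_padded(values):
        arr = [EMPTY] * (len(values) + 2 * PADDING)
        arr[PADDING:PADDING + len(values)] = values
        return arr, PADDING    # real data starts at this offset

    arr, off = make_padded([10, 11, 12])
    i = 0                                       # first real token
    window = arr[off + i - 2 : off + i + 3]     # i-2 .. i+2, safely in bounds
    assert window == [EMPTY, EMPTY, 10, 11, 12]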