* Generalize tagger code, in preparation for NER and supersense tagging.

This commit is contained in:
Matthew Honnibal 2014-11-05 03:42:14 +11:00
parent 81da61f3cf
commit 3733444101
12 changed files with 247 additions and 52 deletions

View File

@@ -6,7 +6,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t
from .tokens cimport Tokens
from .lexeme cimport Lexeme
from .pos cimport Tagger as PosTagger
from .tagger cimport Tagger
from .utf8string cimport StringStore
@@ -41,14 +41,13 @@ cdef class Language:
cdef PreshMap _specials
cpdef readonly Lexicon lexicon
cpdef readonly PosTagger pos_tagger
cpdef readonly Tagger pos_tagger
cdef object _prefix_re
cdef object _suffix_re
cdef object _infix_re
cpdef Tokens tokenize(self, unicode text)
cpdef Tokens pos_tag(self, Tokens t)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,

View File

@@ -23,7 +23,7 @@ from . import util
from .util import read_lang_data
from .tokens import Tokens
from .pos cimport Tagger as PosTagger
from .tagger cimport Tagger
cdef class Language:
@@ -42,7 +42,7 @@ cdef class Language:
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
self._load_special_tokenization(rules)
if path.exists(path.join(util.DATA_DIR, name, 'pos')):
self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos'))
self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
else:
self.pos_tagger = None
@@ -93,16 +93,6 @@ cdef class Language:
self._tokenize(tokens, &span, start, i)
return tokens
cpdef Tokens pos_tag(self, Tokens t):
if self.pos_tagger is None:
return t
cdef int i
t.pos[-1] = self.pos_tagger.encode_pos('EOL')
t.pos[-2] = self.pos_tagger.encode_pos('EOL')
for i in range(t.length):
t.pos[i] = self.pos_tagger.predict(i, t, t.pos[i-1], t.pos[i-2])
return t
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes

View File

@@ -1,22 +0,0 @@
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .tokens cimport Tokens
cdef class Tagger:
cpdef readonly Extractor extractor
cpdef readonly LinearModel model
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0
cpdef bint tell_answer(self, class_t gold_tag) except *
cdef Pool mem
cdef class_t _guess
cdef atom_t* _atoms
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores

View File

@@ -30,7 +30,7 @@ cdef class Tagger:
if path.exists(tags_loc):
with open(tags_loc) as file_:
Tagger.tags.update(ujson.load(file_))
self.model = LinearModel(len(self.tags), self.extractor.n)
self.model = LinearModel(len(self.tags))
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])

83
spacy/pos_feats.pxd Normal file
View File

@@ -0,0 +1,83 @@
from .tokens cimport Tokens
from thinc.typedefs cimport atom_t
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
P2t
P1t
CONTEXT_SIZE
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1

77
spacy/pos_feats.pyx Normal file
View File

@@ -0,0 +1,77 @@
from .lexeme cimport *
from thinc.typedefs cimport atom_t
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
(P1url,),
(N1num,),
(N1url,),
)
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
_fill_token(&context[P2i], tokens.lex[i-2])
_fill_token(&context[P1i], tokens.lex[i-1])
_fill_token(&context[N0i], tokens.lex[i])
_fill_token(&context[N1i], tokens.lex[i+1])
_fill_token(&context[N2i], tokens.lex[i+2])
context[P1t] = tokens.pos[i-1]
context[P2t] = tokens.pos[i-2]
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
atoms[0] = lex.sic
atoms[1] = lex.cluster
atoms[2] = lex.norm if (lex.prob != 0 and lex.prob >= -10) else lex.shape
atoms[3] = lex.shape
atoms[4] = lex.prefix
atoms[5] = lex.suffix
atoms[6] = lex.flags & (1 << IS_TITLE)
atoms[7] = lex.flags & (1 << IS_UPPER)
atoms[8] = lex.flags & (1 << OFT_TITLE)
atoms[9] = lex.flags & (1 << OFT_UPPER)
atoms[10] = lex.postype
atoms[11] = lex.flags & (1 << LIKE_URL)
atoms[12] = lex.flags & (1 << LIKE_NUMBER)

View File

@@ -6,9 +6,10 @@ from .en import EN
from .pos import Tagger
def read_gold(file_):
def read_gold(file_, tag_list):
paras = file_.read().strip().split('\n\n')
golds = []
tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
for para in paras:
if not para.strip():
continue
@@ -32,10 +33,16 @@ def read_gold(file_):
else:
conll_toks.pop(0)
assert len(tags) == len(tokens)
tags = [Tagger.encode_pos(t) for t in tags]
tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
golds.append((tokens, tags))
return golds
def _encode_pos(tag, tag_ids, tag_list):
if tag not in tag_ids:
tag_ids[tag] = len(tag_list)
tag_list.append(tag)
return tag_ids[tag]
def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """

View File

@@ -7,7 +7,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .tokens cimport Tokens
cdef enum TagType:
cpdef enum TagType:
POS
ENTITY
SENSE

View File

@@ -1,37 +1,93 @@
# cython: profile=True
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from os import path
import os
import shutil
import random
import codecs
import gzip
import json
import cython
from .pos_feats cimport fill_context as pos_fill_context
from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
from thinc.features cimport ConjFeat
NULL_TAG = 0
def setup_model_dir(tag_type, tag_names, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
'tag_type': tag_type,
'templates': templates,
'tag_names': tag_names,
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
def train(train_sents, model_dir, nr_iter=5):
tagger = Tagger(model_dir)
for _ in range(nr_iter):
n_corr = 0
total = 0
for tokens, golds in train_sents:
assert len(tokens) == len(golds), [t.string for t in tokens]
for i, gold in enumerate(golds):
guess = tagger.predict(i, tokens)
tokens.set_tag(i, tagger.tag_type, guess)
tagger.tell_answer(gold)
if gold != NULL_TAG:
total += 1
n_corr += guess == gold
print('%.4f' % ((n_corr / total) * 100))
random.shuffle(train_sents)
tagger.model.end_training()
tagger.model.dump(path.join(model_dir, 'model'), freq_thresh=10)
def evaluate(tagger, sents):
n_corr = 0
total = 0
for tokens, golds in sents:
for i, gold in enumerate(golds):
guess = tagger.predict(i, tokens)
tokens.set_tag(i, tagger.tag_type, guess)
if gold != NULL_TAG:
total += 1
n_corr += guess == gold
return n_corr / total
cdef class Tagger:
"""Assign part-of-speech, named entity or supersense tags, using greedy
decoding. The tagger reads its model and configuration from disk.
"""
def __init__(self, model_dir):
self.mem = Pool()
cfg = json.load(path.join(model_dir, 'config.json'))
cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates']
self.tag_names = cfg['tag_names']
self.tag_type = cfg['tag_type']
self.model = LinearModel(len(self.tag_names))
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.model = LinearModel(len(self.tag_names), self.extractor.n)
print("Load tagger model")
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(templates, [ConjFeat] * len(templates))
print("Done")
if self.tag_type == POS:
n_context = POS_CONTEXT_SIZE
self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(len(self.cfg.tags), sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
self._guess = NULL_TAG
cpdef int set_tags(self, Tokens tokens) except -1:
@@ -54,8 +110,10 @@ cdef class Tagger:
>>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5
"""
#if self.tag_type == POS:
# _pos_feats.fill_context(self._context, i, tokens)
if self.tag_type == POS:
pos_fill_context(self._context, i, tokens)
else:
raise StandardError
self.extractor.extract(self._feats, self._values, self._context, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess

View File

@@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from .lexeme cimport Lexeme
from .typedefs cimport flag_t
from .utf8string cimport StringStore
from .tagger cimport TagType
from thinc.typedefs cimport atom_t
@@ -23,6 +24,7 @@ cdef class Tokens:
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
cdef int push_back(self, int i, Lexeme* lexeme) except -1
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
cdef class Token:

View File

@@ -4,6 +4,7 @@ cimport cython
DEF PADDING = 5
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError
@@ -89,6 +90,9 @@ cdef class Tokens:
idx = self.push_back(idx, lexemes[i])
return idx
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
self.pos[i] = tag
def _realloc(self, new_size):
self.max_length = new_size
n = new_size + (PADDING * 2)
@@ -130,4 +134,3 @@ cdef class Token:
return ''
cdef bytes utf8string = self._string_store[self.sic]
return utf8string.decode('utf8')

View File

@@ -6,5 +6,3 @@ ctypedef uint64_t flag_t
ctypedef uint32_t id_t
ctypedef uint16_t len_t
ctypedef uint16_t tag_t