* Generalize tagger code, in preparation for NER and supersense tagging.

This commit is contained in:
Matthew Honnibal 2014-11-05 03:42:14 +11:00
parent 81da61f3cf
commit 3733444101
12 changed files with 247 additions and 52 deletions

View File

@ -6,7 +6,7 @@ from cymem.cymem cimport Pool
from .typedefs cimport hash_t from .typedefs cimport hash_t
from .tokens cimport Tokens from .tokens cimport Tokens
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .pos cimport Tagger as PosTagger from .tagger cimport Tagger
from .utf8string cimport StringStore from .utf8string cimport StringStore
@ -41,14 +41,13 @@ cdef class Language:
cdef PreshMap _specials cdef PreshMap _specials
cpdef readonly Lexicon lexicon cpdef readonly Lexicon lexicon
cpdef readonly PosTagger pos_tagger cpdef readonly Tagger pos_tagger
cdef object _prefix_re cdef object _prefix_re
cdef object _suffix_re cdef object _suffix_re
cdef object _infix_re cdef object _infix_re
cpdef Tokens tokenize(self, unicode text) cpdef Tokens tokenize(self, unicode text)
cpdef Tokens pos_tag(self, Tokens t)
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1 cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1
cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes, cdef String* _split_affixes(self, String* string, vector[Lexeme*] *prefixes,

View File

@ -23,7 +23,7 @@ from . import util
from .util import read_lang_data from .util import read_lang_data
from .tokens import Tokens from .tokens import Tokens
from .pos cimport Tagger as PosTagger from .tagger cimport Tagger
cdef class Language: cdef class Language:
@ -42,7 +42,7 @@ cdef class Language:
self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings')) self.lexicon.strings.load(path.join(util.DATA_DIR, name, 'strings'))
self._load_special_tokenization(rules) self._load_special_tokenization(rules)
if path.exists(path.join(util.DATA_DIR, name, 'pos')): if path.exists(path.join(util.DATA_DIR, name, 'pos')):
self.pos_tagger = PosTagger(path.join(util.DATA_DIR, name, 'pos')) self.pos_tagger = Tagger(path.join(util.DATA_DIR, name, 'pos'))
else: else:
self.pos_tagger = None self.pos_tagger = None
@ -93,16 +93,6 @@ cdef class Language:
self._tokenize(tokens, &span, start, i) self._tokenize(tokens, &span, start, i)
return tokens return tokens
# Assign a part-of-speech tag to every token of `t`, left to right, writing
# the result into `t.pos`, and return `t`. When no tagger was loaded
# (`self.pos_tagger is None`) the tokens are returned untouched.
cpdef Tokens pos_tag(self, Tokens t):
if self.pos_tagger is None:
return t
cdef int i
# Seed the two slots before the first token with the 'EOL' tag so that the
# first predictions have a tag history to read.
# NOTE(review): indices -1 and -2 presumably fall into padding in front of
# the `pos` array rather than wrapping around -- confirm against Tokens.
t.pos[-1] = self.pos_tagger.encode_pos('EOL')
t.pos[-2] = self.pos_tagger.encode_pos('EOL')
for i in range(t.length):
# Greedy decoding: each prediction is conditioned on the two tags just written.
t.pos[i] = self.pos_tagger.predict(i, t, t.pos[i-1], t.pos[i-2])
return t
cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1: cdef int _tokenize(self, Tokens tokens, String* span, int start, int end) except -1:
cdef vector[Lexeme*] prefixes cdef vector[Lexeme*] prefixes
cdef vector[Lexeme*] suffixes cdef vector[Lexeme*] suffixes

View File

@ -1,22 +0,0 @@
from cymem.cymem cimport Pool
from thinc.learner cimport LinearModel
from thinc.features cimport Extractor
from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .tokens cimport Tokens
# Declaration of the Tagger extension type: two attributes readable from
# Python, the predict/update entry points, and C-level scratch state.
cdef class Tagger:
# Exposed read-only to Python code.
cpdef readonly Extractor extractor
cpdef readonly LinearModel model
# Predict a tag for token `i` given the two previous tags.
# `except 0` reserves the return value 0 as the error indicator.
cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0
# Takes the gold tag for a prediction; `except *` makes every call check
# for a raised exception.
cpdef bint tell_answer(self, class_t gold_tag) except *
# C-level state, not visible from Python.
# NOTE(review): the buffers below are presumably allocated from `mem` and
# reused between calls -- confirm in the implementation file.
cdef Pool mem
cdef class_t _guess
cdef atom_t* _atoms
cdef feat_t* _feats
cdef weight_t* _values
cdef weight_t* _scores

View File

@ -30,7 +30,7 @@ cdef class Tagger:
if path.exists(tags_loc): if path.exists(tags_loc):
with open(tags_loc) as file_: with open(tags_loc) as file_:
Tagger.tags.update(ujson.load(file_)) Tagger.tags.update(ujson.load(file_))
self.model = LinearModel(len(self.tags), self.extractor.n) self.model = LinearModel(len(self.tags))
if path.exists(path.join(model_dir, 'model')): if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model')) self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES]) self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])

83
spacy/pos_feats.pxd Normal file
View File

@ -0,0 +1,83 @@
from .tokens cimport Tokens
from thinc.typedefs cimport atom_t
# Slot indices into the context array that fill_context populates.
# Layout: five token positions (P2, P1, N0, N1, N2 = offsets -2, -1, 0, +1, +2
# from the token being tagged), each owning 13 consecutive slots in the order
#   i, c, w, shape, pref, suff, title, upper, oft_title, oft_upper, pos, url, num
# followed by two tag-history slots (P2t, P1t).
# CONTEXT_SIZE is the last member, so its value is the total number of slots.
# The per-position order must stay in step with the numeric offsets used when
# the slots are filled.
cpdef enum:
P2i
P2c
P2w
P2shape
P2pref
P2suff
P2title
P2upper
P2oft_title
P2oft_upper
P2pos
P2url
P2num
P1i
P1c
P1w
P1shape
# NOTE(review): spelled `P1pre` here while every other position uses the
# `*pref` spelling (P2pref, N0pref, N1pref, N2pref) -- likely a typo; renaming
# would change the public enum, so confirm there are no users first.
P1pre
P1suff
P1title
P1upper
P1oft_title
P1oft_upper
P1pos
P1url
P1num
N0i
N0c
N0w
N0shape
N0pref
N0suff
N0title
N0upper
N0oft_title
N0oft_upper
N0pos
N0url
N0num
N1i
N1c
N1w
N1shape
N1pref
N1suff
N1title
N1upper
N1oft_title
N1oft_upper
N1pos
N1url
N1num
N2i
N2c
N2w
N2shape
N2pref
N2suff
N2title
N2upper
N2oft_title
N2oft_upper
N2pos
N2url
N2num
# Tags already assigned to the two preceding tokens.
P2t
P1t
CONTEXT_SIZE
# Fill `context` with the feature atoms for token `i` of `tokens`.
# `except -1` reserves the return value -1 as the error indicator.
# NOTE(review): `context` presumably needs room for CONTEXT_SIZE entries -- confirm at the call site.
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1

77
spacy/pos_feats.pyx Normal file
View File

@ -0,0 +1,77 @@
from .lexeme cimport *
from thinc.typedefs cimport atom_t
# Feature templates for the POS tagger. Each entry is a tuple of context-slot
# indices; a tuple with several slots describes a conjunction of those atoms.
# NOTE(review): presumably handed to the feature extractor in this order, so
# reordering or removing entries would change feature identities -- confirm
# before editing the list.
TEMPLATES = (
(N0i,),
(N0w,),
(N0suff,),
(N0pref,),
(P1t,),
(P2t,),
(P1t, P2t),
(P1t, N0w),
(P1w,),
(P1suff,),
(P2w,),
(N1w,),
(N1suff,),
(N2w,),
(N0shape,),
(N0c,),
(N1c,),
(N2c,),
(P1c,),
(P2c,),
(P1c, N0c),
(N0c, N1c),
(P1c, P1t),
(P1c, P1t, N0c),
(P1t, N0c),
(N0oft_upper,),
(N0oft_title,),
(P1w, N0w),
(N0w, N1w),
(N0pos,),
(P1t, N0pos, N1pos),
(P1t, N1pos),
(N0url,),
(N0num,),
(P1url,),
# NOTE(review): exact duplicate of the previous template. By analogy with the
# N0 and N1 url/num pairs this may have been meant to be (P1num,) -- confirm.
(P1url,),
(N1num,),
(N1url,),
)
cdef int fill_context(atom_t* context, int i, Tokens tokens) except -1:
    # Populate `context` for token `i`: the tag history first, then the lexical
    # atoms of the five-token window centred on `i`.
    # NOTE(review): reads positions i-2 .. i+2 without a bounds check, so it
    # presumably relies on the Tokens arrays being padded at both ends -- confirm.

    # Tags already assigned to the two preceding tokens.
    context[P2t] = tokens.pos[i-2]
    context[P1t] = tokens.pos[i-1]

    # Each call writes the consecutive slots that start at the given *i index.
    _fill_token(&context[N0i], tokens.lex[i])
    _fill_token(&context[P1i], tokens.lex[i-1])
    _fill_token(&context[P2i], tokens.lex[i-2])
    _fill_token(&context[N1i], tokens.lex[i+1])
    _fill_token(&context[N2i], tokens.lex[i+2])
    return 0
cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    # Write the 13 atoms describing one lexeme into `atoms[0..12]`.
    # The numeric offsets are positional: they have to match the per-token
    # order of the slot names declared for the context array.
    atoms[0] = lex.sic
    atoms[1] = lex.cluster
    # Word slot: use the normalised form only when the probability is set
    # (non-zero) and at least -10; otherwise fall back to the shape.
    if lex.prob != 0 and lex.prob >= -10:
        atoms[2] = lex.norm
    else:
        atoms[2] = lex.shape
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix
    # Flag slots hold the masked bit itself (zero or the bit's value).
    atoms[6] = lex.flags & (1 << IS_TITLE)
    atoms[7] = lex.flags & (1 << IS_UPPER)
    atoms[8] = lex.flags & (1 << OFT_TITLE)
    atoms[9] = lex.flags & (1 << OFT_UPPER)
    atoms[10] = lex.postype
    atoms[11] = lex.flags & (1 << LIKE_URL)
    atoms[12] = lex.flags & (1 << LIKE_NUMBER)

View File

@ -6,9 +6,10 @@ from .en import EN
from .pos import Tagger from .pos import Tagger
def read_gold(file_): def read_gold(file_, tag_list):
paras = file_.read().strip().split('\n\n') paras = file_.read().strip().split('\n\n')
golds = [] golds = []
tag_ids = dict((tag, i) for i, tag in enumerate(tag_list))
for para in paras: for para in paras:
if not para.strip(): if not para.strip():
continue continue
@ -32,10 +33,16 @@ def read_gold(file_):
else: else:
conll_toks.pop(0) conll_toks.pop(0)
assert len(tags) == len(tokens) assert len(tags) == len(tokens)
tags = [Tagger.encode_pos(t) for t in tags] tags = [_encode_pos(t, tag_ids, tag_list) for t in tags]
golds.append((tokens, tags)) golds.append((tokens, tags))
return golds return golds
def _encode_pos(tag, tag_ids, tag_list):
if tag not in tag_ids:
tag_ids[tag] = len(tag_list)
tag_list.append(tag)
return tag_ids[tag]
def ptb_to_univ(tag): def ptb_to_univ(tag):
mapping = dict(tuple(line.split()) for line in """ mapping = dict(tuple(line.split()) for line in """

View File

@ -7,7 +7,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from .tokens cimport Tokens from .tokens cimport Tokens
cdef enum TagType: cpdef enum TagType:
POS POS
ENTITY ENTITY
SENSE SENSE

View File

@ -1,37 +1,93 @@
# cython: profile=True # cython: profile=True
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from os import path from os import path
import os import os
import shutil import shutil
import random import random
import codecs
import gzip
import json import json
import cython import cython
from .pos_feats cimport fill_context as pos_fill_context
from .pos_feats cimport CONTEXT_SIZE as POS_CONTEXT_SIZE
from thinc.features cimport ConjFeat from thinc.features cimport ConjFeat
NULL_TAG = 0 NULL_TAG = 0
def setup_model_dir(tag_type, tag_names, templates, model_dir):
    """Create a fresh model directory containing the tagger's ``config.json``.

    Any existing directory at ``model_dir`` is deleted first, so previously
    stored files there are lost.

    Args:
        tag_type: Identifier of the kind of tags the model assigns.
        tag_names: Tag names, stored under the ``tag_names`` key.
        templates: Feature templates, stored under the ``templates`` key.
        model_dir: Directory to (re)create. Missing parent directories are
            created as well.
    """
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    # makedirs rather than mkdir: a nested model path should not fail just
    # because its parent directories have not been created yet.
    os.makedirs(model_dir)
    config = {
        'tag_type': tag_type,
        'templates': templates,
        'tag_names': tag_names,
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)
def train(train_sents, model_dir, nr_iter=5):
    """Train a tagger on ``train_sents`` and save the model in ``model_dir``.

    The tagger is constructed from ``model_dir``. On each pass every token is
    predicted, the guess is written back into the tokens so later predictions
    in the sentence can see it, and the tagger is told the gold tag. After
    each pass the accuracy over tokens whose gold tag is not ``NULL_TAG`` is
    printed as a percentage.

    Args:
        train_sents: List of ``(tokens, golds)`` pairs with one gold tag per
            token. The list is shuffled in place after every pass.
        model_dir: Directory the tagger is loaded from; the trained model is
            written to ``<model_dir>/model``.
        nr_iter: Number of passes over the training data.
    """
    tagger = Tagger(model_dir)
    for _ in range(nr_iter):
        n_corr = 0
        total = 0
        for tokens, golds in train_sents:
            assert len(tokens) == len(golds), [t.string for t in tokens]
            for i, gold in enumerate(golds):
                guess = tagger.predict(i, tokens)
                tokens.set_tag(i, tagger.tag_type, guess)
                tagger.tell_answer(gold)
                if gold != NULL_TAG:
                    total += 1
                    n_corr += guess == gold
        # With no scorable tokens (empty data, or only NULL_TAG golds) report
        # zero instead of failing on a division by zero.
        accuracy = (n_corr / total) * 100 if total else 0.0
        print('%.4f' % accuracy)
        random.shuffle(train_sents)
    tagger.model.end_training()
    tagger.model.dump(path.join(model_dir, 'model'), freq_thresh=10)
def evaluate(tagger, sents):
    """Return the tagger's accuracy on ``sents`` as a fraction.

    Every token is predicted and the guess is written back into the tokens,
    so later predictions in the sentence can see it. Only tokens whose gold
    tag is not ``NULL_TAG`` count towards the score.

    Args:
        tagger: Object providing ``predict(i, tokens)`` and ``tag_type``.
        sents: Iterable of ``(tokens, golds)`` pairs; ``tokens`` must provide
            ``set_tag(i, tag_type, tag)``.

    Returns:
        Correct predictions divided by scored tokens, or ``0.0`` when no
        token could be scored.
    """
    n_corr = 0
    total = 0
    for tokens, golds in sents:
        for i, gold in enumerate(golds):
            guess = tagger.predict(i, tokens)
            tokens.set_tag(i, tagger.tag_type, guess)
            if gold != NULL_TAG:
                total += 1
                n_corr += guess == gold
    if not total:
        # Nothing to score: avoid a ZeroDivisionError.
        return 0.0
    return n_corr / total
cdef class Tagger: cdef class Tagger:
"""Assign part-of-speech, named entity or supersense tags, using greedy """Assign part-of-speech, named entity or supersense tags, using greedy
decoding. The tagger reads its model and configuration from disk. decoding. The tagger reads its model and configuration from disk.
""" """
def __init__(self, model_dir): def __init__(self, model_dir):
self.mem = Pool() self.mem = Pool()
cfg = json.load(path.join(model_dir, 'config.json')) cfg = json.load(open(path.join(model_dir, 'config.json')))
templates = cfg['templates'] templates = cfg['templates']
self.tag_names = cfg['tag_names'] self.tag_names = cfg['tag_names']
self.tag_type = cfg['tag_type'] self.tag_type = cfg['tag_type']
self.model = LinearModel(len(self.tag_names)) self.extractor = Extractor(templates, [ConjFeat] * len(templates))
self.model = LinearModel(len(self.tag_names), self.extractor.n)
print("Load tagger model")
if path.exists(path.join(model_dir, 'model')): if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model')) self.model.load(path.join(model_dir, 'model'))
self.extractor = Extractor(templates, [ConjFeat] * len(templates)) print("Done")
if self.tag_type == POS:
n_context = POS_CONTEXT_SIZE
self._context = <atom_t*>self.mem.alloc(n_context, sizeof(atom_t))
self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t)) self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t)) self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
self._scores = <weight_t*>self.mem.alloc(len(self.cfg.tags), sizeof(weight_t)) self._scores = <weight_t*>self.mem.alloc(self.model.nr_class, sizeof(weight_t))
self._guess = NULL_TAG self._guess = NULL_TAG
cpdef int set_tags(self, Tokens tokens) except -1: cpdef int set_tags(self, Tokens tokens) except -1:
@ -54,8 +110,10 @@ cdef class Tagger:
>>> tag = EN.pos_tagger.predict(0, tokens) >>> tag = EN.pos_tagger.predict(0, tokens)
>>> assert tag == EN.pos_tagger.tag_id('DT') == 5 >>> assert tag == EN.pos_tagger.tag_id('DT') == 5
""" """
#if self.tag_type == POS: if self.tag_type == POS:
# _pos_feats.fill_context(self._context, i, tokens) pos_fill_context(self._context, i, tokens)
else:
raise StandardError
self.extractor.extract(self._feats, self._values, self._context, NULL) self.extractor.extract(self._feats, self._values, self._context, NULL)
self._guess = self.model.score(self._scores, self._feats, self._values) self._guess = self.model.score(self._scores, self._feats, self._values)
return self._guess return self._guess

View File

@ -3,6 +3,7 @@ from cymem.cymem cimport Pool
from .lexeme cimport Lexeme from .lexeme cimport Lexeme
from .typedefs cimport flag_t from .typedefs cimport flag_t
from .utf8string cimport StringStore from .utf8string cimport StringStore
from .tagger cimport TagType
from thinc.typedefs cimport atom_t from thinc.typedefs cimport atom_t
@ -23,6 +24,7 @@ cdef class Tokens:
cdef int extend(self, int i, Lexeme** lexemes, int n) except -1 cdef int extend(self, int i, Lexeme** lexemes, int n) except -1
cdef int push_back(self, int i, Lexeme* lexeme) except -1 cdef int push_back(self, int i, Lexeme* lexeme) except -1
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1
cdef class Token: cdef class Token:

View File

@ -4,6 +4,7 @@ cimport cython
DEF PADDING = 5 DEF PADDING = 5
cdef int bounds_check(int i, int length, int padding) except -1: cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0: if (i + padding) < 0:
raise IndexError raise IndexError
@ -89,6 +90,9 @@ cdef class Tokens:
idx = self.push_back(idx, lexemes[i]) idx = self.push_back(idx, lexemes[i])
return idx return idx
# Record `tag` as the tag of token `i`.
# NOTE(review): `tag_type` is accepted but never consulted -- every tag is
# written to the `pos` array regardless of type. No bounds check on `i` is
# performed here.
cpdef int set_tag(self, int i, TagType tag_type, int tag) except -1:
self.pos[i] = tag
def _realloc(self, new_size): def _realloc(self, new_size):
self.max_length = new_size self.max_length = new_size
n = new_size + (PADDING * 2) n = new_size + (PADDING * 2)
@ -130,4 +134,3 @@ cdef class Token:
return '' return ''
cdef bytes utf8string = self._string_store[self.sic] cdef bytes utf8string = self._string_store[self.sic]
return utf8string.decode('utf8') return utf8string.decode('utf8')

View File

@ -6,5 +6,3 @@ ctypedef uint64_t flag_t
ctypedef uint32_t id_t ctypedef uint32_t id_t
ctypedef uint16_t len_t ctypedef uint16_t len_t
ctypedef uint16_t tag_t ctypedef uint16_t tag_t