* Refactor language-independent tagger class

Matthew Honnibal 2015-08-26 19:19:21 +02:00
parent a3d5e6c0dd
commit b4faf551f5
2 changed files with 151 additions and 83 deletions
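In outline: the `Tagger(StringStore, data_dir)` constructor that read `pos/config.json` itself is replaced by `Tagger(Vocab, templates)` plus a `from_dir()` loader, tag and morphology bookkeeping moves onto `vocab.morphology`, and the previously abstract `predict()`/`update()` gain a concrete feature-window implementation. A minimal sketch of the resulting call pattern, assuming the module path and a ready-made `vocab` and `doc` (both elided by the diff):

    # Sketch only; inferred from this diff, not taken from spaCy's docs.
    from spacy.tagger import Tagger              # module path assumed

    tagger = Tagger.from_dir(data_dir, vocab)    # falls back to default_templates()
    tagger(doc)                                  # predict tags, write them in place
    tagger.tag_from_strings(doc, tag_strs)       # or assign known tag strings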

View File

@@ -4,24 +4,23 @@ from cymem.cymem cimport Pool
 from ._ml cimport Model
 from .strings cimport StringStore
-from .structs cimport TokenC, LexemeC, Morphology, PosTag
+from .structs cimport TokenC, LexemeC
 from .parts_of_speech cimport univ_pos_t
+from .vocab cimport Vocab


 cdef class Tagger:
     cdef readonly Pool mem
     cdef readonly StringStore strings
     cdef readonly Model model
+    cdef readonly Vocab vocab
     cdef public object lemmatizer
     cdef PreshMapArray _morph_cache
     cdef public dict freqs

-    cdef PosTag* tags
-    cdef readonly object tag_names
-    cdef readonly object tag_map
     cdef readonly int n_tags

     cdef int predict(self, int i, const TokenC* tokens) except -1
     cdef int update(self, int i, const TokenC* tokens, int gold) except -1
-    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
-    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
+    #cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1
+    #cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1
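The declarations drop the tagger-owned `PosTag*` table, `tag_names`, and `tag_map` in favour of a single `Vocab` reference: anything tag- or morphology-related is now looked up through `vocab.morphology`, which is why `set_morph` and `lemmatize` are commented out rather than reimplemented here. The .pyx file below touches exactly three members of that object, so the interface the tagger now assumes looks roughly like this (a Python stand-in for illustration; only the attribute names are taken from the diff):

    class MorphologySketch:
        # Assumed shape, reconstructed from the three call sites below.
        def __init__(self, tag_map):
            self.tag_map = tag_map        # tag name -> properties; keys() feeds tag_names
            self.n_tags = len(tag_map)    # sizes the tagger's Model

        def assign_tag(self, token, tag):
            # The real Cython method writes tag/pos/lemma/morph onto a TokenC*;
            # note the diff passes it either a class id (predict) or a tag
            # string (tag_from_strings).
            token.tag = tag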

View File

@@ -6,50 +6,129 @@ from thinc.typedefs cimport atom_t, weight_t
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
-from .morphology cimport set_morph_from_dict
 from .attrs cimport TAG
 from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
+from .attrs cimport *
+from ._ml cimport arg_max


-cdef struct _CachedMorph:
-    Morphology morph
-    int lemma
+cpdef enum:
+    P2_orth
+    P2_cluster
+    P2_shape
+    P2_prefix
+    P2_suffix
+    P2_pos
+    P2_lemma
+    P2_flags
+    P1_orth
+    P1_cluster
+    P1_shape
+    P1_prefix
+    P1_suffix
+    P1_pos
+    P1_lemma
+    P1_flags
+    W_orth
+    W_cluster
+    W_shape
+    W_prefix
+    W_suffix
+    W_pos
+    W_lemma
+    W_flags
+    N1_orth
+    N1_cluster
+    N1_shape
+    N1_prefix
+    N1_suffix
+    N1_pos
+    N1_lemma
+    N1_flags
+    N2_orth
+    N2_cluster
+    N2_shape
+    N2_prefix
+    N2_suffix
+    N2_pos
+    N2_lemma
+    N2_flags
+    N_CONTEXT_FIELDS

 cdef class Tagger:
     """A part-of-speech tagger for English"""
+    @classmethod
+    def read_config(cls, data_dir):
+        return json.load(open(path.join(data_dir, 'pos', 'config.json')))
+
+    @classmethod
+    def default_templates(cls):
+        return (
+            (W_orth,),
+            (P1_lemma, P1_pos),
+            (P2_lemma, P2_pos),
+            (N1_orth,),
+            (N2_orth,),
+            (W_suffix,),
+            (W_prefix,),
+            (P1_pos,),
+            (P2_pos,),
+            (P1_pos, P2_pos),
+            (P1_pos, W_orth),
+            (P1_suffix,),
+            (N1_suffix,),
+            (W_shape,),
+            (W_cluster,),
+            (N1_cluster,),
+            (N2_cluster,),
+            (P1_cluster,),
+            (P2_cluster,),
+            (W_flags,),
+            (N1_flags,),
+            (N2_flags,),
+            (P1_flags,),
+            (P2_flags,),
+        )
+
     def make_lemmatizer(self):
         return None

-    def __init__(self, StringStore strings, data_dir):
+    def __init__(self, Vocab vocab, templates):
         self.mem = Pool()
-        model_dir = path.join(data_dir, 'pos')
-        self.strings = strings
-        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
-        self.tag_names = sorted(cfg['tag_names'])
-        assert self.tag_names
-        self.n_tags = len(self.tag_names)
-        self.tag_map = cfg['tag_map']
-        cdef int n_tags = len(self.tag_names) + 1
-        self.model = Model(n_tags, cfg['templates'], model_dir)
-        self._morph_cache = PreshMapArray(n_tags)
-        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for i, tag in enumerate(sorted(self.tag_names)):
-            pos, props = self.tag_map[tag]
-            self.tags[i].id = i
-            self.tags[i].pos = pos
-            set_morph_from_dict(&self.tags[i].morph, props)
-        if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):
-            self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',
-                                       'morphs.json'))))
-        self.lemmatizer = self.make_lemmatizer(data_dir)
+        self.vocab = vocab
+        cdef int n_tags = self.vocab.morphology.n_tags + 1
+        self.model = Model(n_tags, templates)
         self.freqs = {TAG: defaultdict(int)}
         for tag in self.tag_names:
-            self.freqs[TAG][self.strings[tag]] = 1
+            self.freqs[TAG][self.vocab.strings[tag]] = 1
         self.freqs[TAG][0] = 1
+
+    @property
+    def tag_names(self):
+        return tuple(sorted(self.vocab.morphology.tag_map.keys()))
+
+    @classmethod
+    def from_dir(cls, data_dir, vocab):
+        if path.exists(path.join(data_dir, 'templates.json')):
+            templates = json.loads(open(path.join(data_dir, 'templates.json')))
+        else:
+            templates = cls.default_templates()
+        return cls(vocab, templates)

     def __call__(self, Doc tokens):
         """Apply the tagger, setting the POS tags onto the Doc object.
@@ -63,18 +142,14 @@ cdef class Tagger:
         for i in range(tokens.length):
             if tokens.data[i].pos == 0:
                 guess = self.predict(i, tokens.data)
-                tokens.data[i].tag = self.strings[self.tag_names[guess]]
-                self.set_morph(i, &self.tags[guess], tokens.data)
+                self.vocab.morphology.assign_tag(&tokens.data[i], guess)
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length

     def tag_from_strings(self, Doc tokens, object tag_strs):
         cdef int i
         for i in range(tokens.length):
-            tokens.data[i].tag = self.strings[tag_strs[i]]
-            self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
-                           tokens.data)
+            self.vocab.morphology.assign_tag(&tokens.data[i], tag_strs[i])
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
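The next hunk edits the body of the training method; its signature sits in the elided context, so the method name and gold format here are assumptions read off the loop body (a gold of -1 appears to mean "no supervision for this token"):

    # Hypothetical driver for the training loop shown below.
    for doc, golds in training_data:         # golds: one tag id per token, -1 to skip
        n_correct = tagger.train(doc, golds)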
@@ -88,57 +163,51 @@ cdef class Tagger:
         for i in range(tokens.length):
             guess = self.update(i, tokens.data, golds[i])
             loss = golds[i] != -1 and guess != golds[i]
-            tokens.data[i].tag = self.strings[self.tag_names[guess]]
-            self.set_morph(i, &self.tags[guess], tokens.data)
+            self.vocab.morphology.assign_tag(&tokens.data[i], guess)
             correct += loss == 0
             self.freqs[TAG][tokens.data[i].tag] += 1
         return correct

     cdef int predict(self, int i, const TokenC* tokens) except -1:
-        raise NotImplementedError
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        scores = self.model.score(context)
+        return arg_max(scores, self.model.n_classes)

     cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
-        raise NotImplementedError
+        cdef atom_t[N_CONTEXT_FIELDS] context
+        _fill_from_token(&context[P2_orth], &tokens[i-2])
+        _fill_from_token(&context[P1_orth], &tokens[i-1])
+        _fill_from_token(&context[W_orth], &tokens[i])
+        _fill_from_token(&context[N1_orth], &tokens[i+1])
+        _fill_from_token(&context[N2_orth], &tokens[i+2])
+        scores = self.model.score(context)
+        guess = arg_max(scores, self.model.n_classes)
+        loss = guess != gold if gold != -1 else 0
+        self.model.update(context, guess, gold, loss)
+        return guess

-    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
-        tokens[i].pos = tag.pos
-        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
-        if cached is NULL:
-            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
-            cached.morph = tag.morph
-            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
-        tokens[i].lemma = cached.lemma
-        tokens[i].morph = cached.morph
-
-    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
-        if self.lemmatizer is None:
-            return lex.orth
-        cdef unicode py_string = self.strings[lex.orth]
-        if pos != NOUN and pos != VERB and pos != ADJ:
-            return lex.orth
-        cdef set lemma_strings
-        cdef unicode lemma_string
-        lemma_strings = self.lemmatizer(py_string, pos)
-        lemma_string = sorted(lemma_strings)[0]
-        lemma = self.strings[lemma_string]
-        return lemma
-
-    def load_morph_exceptions(self, dict exc):
-        cdef unicode pos_str
-        cdef unicode form_str
-        cdef unicode lemma_str
-        cdef dict entries
-        cdef dict props
-        cdef int lemma
-        cdef attr_t orth
-        cdef int pos
-        for pos_str, entries in exc.items():
-            pos = self.tag_names.index(pos_str)
-            for form_str, props in entries.items():
-                lemma_str = props.get('L', form_str)
-                orth = self.strings[form_str]
-                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
-                cached.lemma = self.strings[lemma_str]
-                set_morph_from_dict(&cached.morph, props)
-                self._morph_cache.set(pos, orth, <void*>cached)
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.lower
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.tag
+    context[6] = t.lemma
+    if t.lex.flags & (1 << IS_ALPHA):
+        context[7] = 1
+    elif t.lex.flags & (1 << IS_PUNCT):
+        context[7] = 2
+    elif t.lex.flags & (1 << LIKE_URL):
+        context[7] = 3
+    elif t.lex.flags & (1 << LIKE_NUM):
+        context[7] = 4
+    else:
+        context[7] = 0
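`predict` and `update` build the same context: five eight-atom blocks filled from a two-token window on each side. Indexing `tokens[i-2]` and `tokens[i+2]` at the sentence edges is only safe because the caller's `TokenC` array is padded with blank tokens on both ends, which spaCy's `Doc` appears to allocate for exactly this reason. `update` is then a standard greedy perceptron step, roughly (illustrative Python; `model` stands in for thinc's averaged perceptron, whose internals are not part of this diff):

    # Sketch of the update step, mirrored from Tagger.update above.
    def update_step(model, context, gold):
        scores = model.score(context)
        guess = max(range(len(scores)), key=scores.__getitem__)
        loss = int(gold != -1 and guess != gold)
        model.update(context, guess, gold, loss)   # assumed no-op on weights when loss == 0
        return guess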