* Improve efficiency of tagger, and improve morphological processing

This commit is contained in:
Matthew Honnibal 2014-12-10 01:02:04 +11:00
parent 6b34a2f34b
commit 42973c4b37
7 changed files with 83 additions and 61 deletions

View File

@ -125,23 +125,5 @@ cpdef enum:
N_CONTEXT_FIELDS N_CONTEXT_FIELDS
# Populate the POS-tagger feature context for the token at index i.
# Copies the features of a five-token window (i-2 through i+2) into the
# context slots that begin at P2_sic, P1_sic, W_sic, N1_sic and N2_sic.
# NOTE(review): there is no bounds check here — presumably the tokens array is
# padded on both sides so that i-2 and i+2 are always readable; confirm.
cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
_fill_from_token(&context[P2_sic], &tokens[i-2])
_fill_from_token(&context[P1_sic], &tokens[i-1])
_fill_from_token(&context[W_sic], &tokens[i])
_fill_from_token(&context[N1_sic], &tokens[i+1])
_fill_from_token(&context[N2_sic], &tokens[i+2])
# Write one token's seven features into consecutive context slots.
# `context` points at the first slot of this token's group. Slots 0-4 are read
# from the token's lexeme (sic, cluster, shape, prefix, suffix); slots 5 and 6
# are the token's own pos and sense fields.
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
context[0] = t.lex.sic
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
context[5] = t.pos
context[6] = t.sense
cdef class English(Language): cdef class English(Language):
pass pass

View File

@ -151,10 +151,14 @@ cdef class English(Language):
cdef int i cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context cdef atom_t[N_CONTEXT_FIELDS] context
cdef TokenC* t = tokens.data cdef TokenC* t = tokens.data
assert self.morphologizer is not None
cdef dict tagdict = self.pos_tagger.tagdict
for i in range(tokens.length): for i in range(tokens.length):
if t[i].lex.sic in tagdict:
t[i].pos = tagdict[t[i].lex.sic]
else:
fill_pos_context(context, i, t) fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context) t[i].pos = self.pos_tagger.predict(context)
if self.morphologizer:
self.morphologizer.set_morph(i, t) self.morphologizer.set_morph(i, t)
def train_pos(self, Tokens tokens, golds): def train_pos(self, Tokens tokens, golds):
@ -165,27 +169,27 @@ cdef class English(Language):
for i in range(tokens.length): for i in range(tokens.length):
fill_pos_context(context, i, t) fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context, [golds[i]]) t[i].pos = self.pos_tagger.predict(context, [golds[i]])
if self.morphologizer:
self.morphologizer.set_morph(i, t) self.morphologizer.set_morph(i, t)
c += t[i].pos == golds[i] c += t[i].pos == golds[i]
return c return c
cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1: cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
if tok_morph.number == 0: _fill_from_token(&context[P2_sic], &tokens[i-2])
tok_morph.number = pos_morph.number _fill_from_token(&context[P1_sic], &tokens[i-1])
if tok_morph.tenspect == 0: _fill_from_token(&context[W_sic], &tokens[i])
tok_morph.tenspect = pos_morph.tenspect _fill_from_token(&context[N1_sic], &tokens[i+1])
if tok_morph.mood == 0: _fill_from_token(&context[N2_sic], &tokens[i+2])
tok_morph.mood = pos_morph.mood
if tok_morph.gender == 0:
tok_morph.gender = pos_morph.gender cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
if tok_morph.person == 0: context[0] = t.lex.sic
tok_morph.person = pos_morph.person context[1] = t.lex.cluster
if tok_morph.case == 0: context[2] = t.lex.shape
tok_morph.case = pos_morph.case context[3] = t.lex.prefix
if tok_morph.misc == 0: context[4] = t.lex.suffix
tok_morph.misc = pos_morph.misc context[5] = t.pos
context[6] = t.sense
EN = English('en') EN = English('en')

View File

@ -35,8 +35,8 @@ cdef class Morphologizer:
cdef StringStore strings cdef StringStore strings
cdef object lemmatizer cdef object lemmatizer
cdef PosTag* tags cdef PosTag* tags
cdef readonly list tag_names
cdef PreshMapArray _morph cdef PreshMapArray _cache
cdef PreshMapArray _lemmas
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1 cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cdef int set_morph(self, const int i, TokenC* tokens) except -1 cdef int set_morph(self, const int i, TokenC* tokens) except -1

View File

@ -1,8 +1,10 @@
# cython: profile=True
# cython: embedsignature=True
from os import path from os import path
import json import json
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .typedefs cimport id_t
UNIV_TAGS = { UNIV_TAGS = {
'NULL': NO_TAG, 'NULL': NO_TAG,
@ -22,6 +24,11 @@ UNIV_TAGS = {
} }
# One cache entry of Morphologizer._cache, stored per (tag id, word id) pair.
cdef struct _Cached:
# Morphological features assigned to the word under this tag.
Morphology morph
# Lemma id: the result of Morphologizer.lemmatize, or a string-store lookup
# when the entry comes from load_exceptions.
int lemma
cdef class Morphologizer: cdef class Morphologizer:
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis. """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
""" """
@ -30,12 +37,11 @@ cdef class Morphologizer:
self.strings = strings self.strings = strings
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json'))) cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
tag_map = cfg['tag_map'] tag_map = cfg['tag_map']
tag_names = cfg['tag_names'] self.tag_names = cfg['tag_names']
self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet')) self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
self._lemmas = PreshMapArray(N_UNIV_TAGS) self._cache = PreshMapArray(len(self.tag_names))
self._morph = PreshMapArray(len(tag_names)) self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag)) for i, tag in enumerate(self.tag_names):
for i, tag in enumerate(tag_names):
pos, props = tag_map[tag] pos, props = tag_map[tag]
self.tags[i].id = i self.tags[i].id = i
self.tags[i].pos = pos self.tags[i].pos = pos
@ -46,15 +52,15 @@ cdef class Morphologizer:
self.tags[i].morph.person = props.get('person', 0) self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0) self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0) self.tags[i].morph.misc = props.get('misc', 0)
# Optional hand-written morphology exceptions; a missing file means "none".
morph_path = path.join(data_dir, 'morph.json')
if path.exists(morph_path):
    with open(morph_path) as file_:
        # json.load parses from an open file. json.loads only accepts a
        # string and would raise TypeError when handed the file object.
        self.load_exceptions(json.load(file_))
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None: if self.lemmatizer is None:
return lex.sic return lex.sic
if pos != NOUN and pos != VERB and pos != ADJ: if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic return lex.sic
cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
if lemma != 0:
return lemma
cdef bytes py_string = self.strings[lex.sic] cdef bytes py_string = self.strings[lex.sic]
cdef set lemma_strings cdef set lemma_strings
cdef bytes lemma_string cdef bytes lemma_string
@ -67,15 +73,45 @@ cdef class Morphologizer:
lemma_strings = self.lemmatizer.adj(py_string) lemma_strings = self.lemmatizer.adj(py_string)
lemma_string = sorted(lemma_strings)[0] lemma_string = sorted(lemma_strings)[0]
lemma = self.strings.intern(lemma_string, len(lemma_string)).i lemma = self.strings.intern(lemma_string, len(lemma_string)).i
self._lemmas.set(pos, lex.sic, <void*>lemma)
return lemma return lemma
cdef int set_morph(self, const int i, TokenC* tokens) except -1: cdef int set_morph(self, const int i, TokenC* tokens) except -1:
cdef const PosTag* tag = &self.tags[tokens[i].pos] cdef const PosTag* tag = &self.tags[tokens[i].pos]
tokens[i].lemma = self.lemmatize(tag.pos, tokens[i].lex) cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
morph = <Morphology*>self._morph.get(tag.id, tokens[i].lemma) if cached is NULL:
if morph is NULL: cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
self._morph.set(tag.id, tokens[i].lemma, <void*>&tag.morph) cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
tokens[i].morph = tag.morph cached.morph = tag.morph
else: self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
tokens[i].morph = morph[0]
tokens[i].lemma = cached.lemma
tokens[i].morph = cached.morph
def load_exceptions(self, dict exc):
    """Pre-seed the (tag, word) cache with explicitly specified analyses.

    `exc` maps a tag name (an entry of `self.tag_names`) to a dict of
    `{word form: properties}`. Each properties dict may give the lemma
    under the key 'L' (defaulting to the word form itself), plus any of
    the morphology fields read by `set_morph_from_dict`.

    Raises ValueError if a tag name is not present in `self.tag_names`.
    """
    cdef unicode pos_str
    cdef unicode form_str
    cdef unicode lemma_str
    cdef dict entries
    cdef dict props
    cdef id_t sic
    # Index into self.tag_names. This is the same key set_morph uses
    # (tag.id), not a universal POS value, so it is typed as a plain int.
    cdef int tag_id
    cdef _Cached* cached
    for pos_str, entries in exc.items():
        tag_id = self.tag_names.index(pos_str)
        for form_str, props in entries.items():
            lemma_str = props.get('L', form_str)
            sic = self.strings[form_str]
            # Entries are allocated from self.mem, as set_morph does on a
            # cache miss.
            cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            cached.lemma = self.strings[lemma_str]
            set_morph_from_dict(&cached.morph, props)
            self._cache.set(tag_id, sic, <void*>cached)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    """Fill every field of `morph` from `props`, using 0 for a missing key."""
    lookup = props.get
    morph.number = lookup('number', 0)
    morph.tenspect = lookup('tenspect', 0)
    morph.mood = lookup('mood', 0)
    morph.gender = lookup('gender', 0)
    morph.person = lookup('person', 0)
    morph.case = lookup('case', 0)
    morph.misc = lookup('misc', 0)
    return 0

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals from __future__ import unicode_literals
import unicodedata import unicodedata
from unidecode import unidecode from unidecode import unidecode
import re
import math import math

View File

@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens, Morphology from .tokens cimport Tokens, Morphology

View File

@ -72,10 +72,9 @@ cdef class Tagger:
return tag_id return tag_id
def _make_tag_dict(counts): def _make_tag_dict(counts):
freq_thresh = 50 freq_thresh = 20
ambiguity_thresh = 0.98 ambiguity_thresh = 0.97
tagdict = {} tagdict = {}
cdef atom_t word cdef atom_t word
cdef atom_t tag cdef atom_t tag