* Improve efficiency of tagger, and improve morphological processing

This commit is contained in:
Matthew Honnibal 2014-12-10 01:02:04 +11:00
parent 6b34a2f34b
commit 42973c4b37
7 changed files with 83 additions and 61 deletions

View File

@ -125,23 +125,5 @@ cpdef enum:
N_CONTEXT_FIELDS
cdef inline void fill_pos_context(atom_t* context, const int i, const TokenC* tokens) nogil:
    # Fill the five-token feature window around tokens[i]: two tokens before,
    # the token itself, and two tokens after. Each token's features are
    # written starting at the matching *_sic offset of `context`.
    # NOTE(review): this reads tokens[i-2] through tokens[i+2] with no bounds
    # check; presumably the caller's array is padded on both ends -- confirm.
    _fill_from_token(context + P2_sic, tokens + (i - 2))
    _fill_from_token(context + P1_sic, tokens + (i - 1))
    _fill_from_token(context + W_sic, tokens + i)
    _fill_from_token(context + N1_sic, tokens + (i + 1))
    _fill_from_token(context + N2_sic, tokens + (i + 2))
# Copy one token's features into seven consecutive slots starting at
# `context`. The caller chooses the starting offset by passing a pointer
# into the larger context array.
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
# Slots 0-4 are read from the token's lexeme.
context[0] = t.lex.sic
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
# Slots 5-6 are read from the token itself.
context[5] = t.pos
context[6] = t.sense
# Declares English as a cdef class extending Language; no additional
# members are declared in this block.
cdef class English(Language):
pass

View File

@ -151,10 +151,14 @@ cdef class English(Language):
cdef int i
cdef atom_t[N_CONTEXT_FIELDS] context
cdef TokenC* t = tokens.data
assert self.morphologizer is not None
cdef dict tagdict = self.pos_tagger.tagdict
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context)
if self.morphologizer:
if t[i].lex.sic in tagdict:
t[i].pos = tagdict[t[i].lex.sic]
else:
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context)
self.morphologizer.set_morph(i, t)
def train_pos(self, Tokens tokens, golds):
@ -165,27 +169,27 @@ cdef class English(Language):
for i in range(tokens.length):
fill_pos_context(context, i, t)
t[i].pos = self.pos_tagger.predict(context, [golds[i]])
if self.morphologizer:
self.morphologizer.set_morph(i, t)
self.morphologizer.set_morph(i, t)
c += t[i].pos == golds[i]
return c
cdef int _merge_morph(Morphology* tok_morph, const Morphology* pos_morph) except -1:
    # Fill each field of `tok_morph` that is still zero with the value of the
    # same field in `pos_morph`. Non-zero fields of `tok_morph` are left as
    # they are.
    if not tok_morph.number:
        tok_morph.number = pos_morph.number
    if not tok_morph.tenspect:
        tok_morph.tenspect = pos_morph.tenspect
    if not tok_morph.mood:
        tok_morph.mood = pos_morph.mood
    if not tok_morph.gender:
        tok_morph.gender = pos_morph.gender
    if not tok_morph.person:
        tok_morph.person = pos_morph.person
    if not tok_morph.case:
        tok_morph.case = pos_morph.case
    if not tok_morph.misc:
        tok_morph.misc = pos_morph.misc
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    # Fill the five-token feature window centred on tokens[i]. Each
    # neighbour's features are written starting at its *_sic offset in
    # `context`.
    # NOTE(review): reads two tokens either side of position i with no bounds
    # check; presumably the token array is padded -- confirm against caller.
    cdef const TokenC* centre = tokens + i
    _fill_from_token(context + P2_sic, centre - 2)
    _fill_from_token(context + P1_sic, centre - 1)
    _fill_from_token(context + W_sic, centre)
    _fill_from_token(context + N1_sic, centre + 1)
    _fill_from_token(context + N2_sic, centre + 2)
# Write a single token's features into the seven slots context[0..6].
# `context` points at the first slot for this token inside the caller's
# larger feature array.
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
# Lexeme-level attributes.
context[0] = t.lex.sic
context[1] = t.lex.cluster
context[2] = t.lex.shape
context[3] = t.lex.prefix
context[4] = t.lex.suffix
# Token-level attributes.
context[5] = t.pos
context[6] = t.sense
# Module-level English instance, constructed with the argument 'en' when
# this module is imported.
EN = English('en')

View File

@ -35,8 +35,8 @@ cdef class Morphologizer:
cdef StringStore strings
cdef object lemmatizer
cdef PosTag* tags
cdef readonly list tag_names
cdef PreshMapArray _morph
cdef PreshMapArray _lemmas
cdef PreshMapArray _cache
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1
cdef int set_morph(self, const int i, TokenC* tokens) except -1

View File

@ -1,8 +1,10 @@
# cython: profile=True
# cython: embedsignature=True
from os import path
import json
from .lemmatizer import Lemmatizer
from .typedefs cimport id_t
UNIV_TAGS = {
'NULL': NO_TAG,
@ -22,6 +24,11 @@ UNIV_TAGS = {
}
# One cache entry: a morphological analysis paired with an integer lemma.
# Morphologizer stores pointers to these in its _cache, keyed by tag id and
# the lexeme's sic value.
cdef struct _Cached:
Morphology morph
int lemma
cdef class Morphologizer:
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
"""
@ -30,12 +37,11 @@ cdef class Morphologizer:
self.strings = strings
cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
tag_map = cfg['tag_map']
tag_names = cfg['tag_names']
self.tag_names = cfg['tag_names']
self.lemmatizer = Lemmatizer(path.join(data_dir, '..', 'wordnet'))
self._lemmas = PreshMapArray(N_UNIV_TAGS)
self._morph = PreshMapArray(len(tag_names))
self.tags = <PosTag*>self.mem.alloc(len(tag_names), sizeof(PosTag))
for i, tag in enumerate(tag_names):
self._cache = PreshMapArray(len(self.tag_names))
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
for i, tag in enumerate(self.tag_names):
pos, props = tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
@ -46,15 +52,15 @@ cdef class Morphologizer:
self.tags[i].morph.person = props.get('person', 0)
self.tags[i].morph.case = props.get('case', 0)
self.tags[i].morph.misc = props.get('misc', 0)
# Load hand-specified exceptions when the data directory provides them.
morph_loc = path.join(data_dir, 'morph.json')
if path.exists(morph_loc):
    with open(morph_loc) as file_:
        # json.load parses from an open file object. json.loads accepts
        # only a string and would raise TypeError on `file_`.
        self.load_exceptions(json.load(file_))
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:
return lex.sic
if pos != NOUN and pos != VERB and pos != ADJ:
return lex.sic
cdef int lemma = <int><size_t>self._lemmas.get(pos, lex.sic)
if lemma != 0:
return lemma
cdef bytes py_string = self.strings[lex.sic]
cdef set lemma_strings
cdef bytes lemma_string
@ -67,15 +73,45 @@ cdef class Morphologizer:
lemma_strings = self.lemmatizer.adj(py_string)
lemma_string = sorted(lemma_strings)[0]
lemma = self.strings.intern(lemma_string, len(lemma_string)).i
self._lemmas.set(pos, lex.sic, <void*>lemma)
return lemma
cdef int set_morph(self, const int i, TokenC* tokens) except -1:
    # Set tokens[i].lemma and tokens[i].morph from the token's POS tag.
    #
    # Results are cached per (tag id, lexeme sic). On a miss, the lemma comes
    # from self.lemmatize and the morphology is copied from the tag's default
    # analysis. The new entry is allocated from self.mem and stored, so later
    # tokens with the same tag and word skip lemmatization entirely.
    # Entries already placed in the cache (for example by load_exceptions)
    # are used as-is.
    #
    # The earlier lemmatize + self._morph pass is removed: it wrote the same
    # two token fields, and both were overwritten unconditionally below.
    cdef const PosTag* tag = &self.tags[tokens[i].pos]
    cdef _Cached* cached = <_Cached*>self._cache.get(tag.id, tokens[i].lex.sic)
    if cached is NULL:
        cached = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
        cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
        cached.morph = tag.morph
        self._cache.set(tag.id, tokens[i].lex.sic, <void*>cached)
    tokens[i].lemma = cached.lemma
    tokens[i].morph = cached.morph
def load_exceptions(self, dict exc):
    """Pre-populate the cache with explicitly specified analyses.

    `exc` maps a tag name to a dict of word forms, and each word form to a
    properties dict. The tag name is resolved with `self.tag_names.index`,
    so a name missing from `self.tag_names` raises ValueError. In each
    properties dict the key 'L' gives the lemma string and defaults to the
    word form itself. The morphological fields are read by
    `set_morph_from_dict`.
    """
    cdef unicode tag_name
    cdef unicode form
    cdef unicode lemma_form
    cdef dict forms
    cdef dict attrs
    cdef id_t form_id
    cdef univ_tag_t tag_id
    for tag_name, forms in exc.items():
        tag_id = self.tag_names.index(tag_name)
        for form, attrs in forms.items():
            lemma_form = attrs.get('L', form)
            form_id = self.strings[form]
            # Each entry is allocated from self.mem and stored under
            # (tag id, form id).
            entry = <_Cached*>self.mem.alloc(1, sizeof(_Cached))
            entry.lemma = self.strings[lemma_form]
            set_morph_from_dict(&entry.morph, attrs)
            self._cache.set(tag_id, form_id, <void*>entry)
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
    # Assign every field of `morph` from the same-named key in `props`,
    # using 0 for any key that is absent.
    lookup = props.get
    morph.number = lookup('number', 0)
    morph.tenspect = lookup('tenspect', 0)
    morph.mood = lookup('mood', 0)
    morph.gender = lookup('gender', 0)
    morph.person = lookup('person', 0)
    morph.case = lookup('case', 0)
    morph.misc = lookup('misc', 0)

View File

@ -2,6 +2,7 @@
from __future__ import unicode_literals
import unicodedata
from unidecode import unidecode
import re
import math

View File

@ -8,7 +8,7 @@ from thinc.typedefs cimport atom_t, feat_t, weight_t, class_t
from preshed.maps cimport PreshMapArray
from .typedefs cimport hash_t
from .typedefs cimport hash_t, id_t
from .tokens cimport Tokens, Morphology

View File

@ -72,10 +72,9 @@ cdef class Tagger:
return tag_id
def _make_tag_dict(counts):
freq_thresh = 50
ambiguity_thresh = 0.98
freq_thresh = 20
ambiguity_thresh = 0.97
tagdict = {}
cdef atom_t word
cdef atom_t tag