* POS tagger training working after reorg

This commit is contained in:
Matthew Honnibal 2014-12-22 08:54:47 +11:00
parent 4c4aa2c5c9
commit cf8d26c3d2
8 changed files with 38 additions and 63 deletions

View File

@ -9,6 +9,7 @@ from ..tokens import Tokens
from ..morphology import Morphologizer
from .lemmatizer import Lemmatizer
from .pos import EnPosTagger
from .pos import POS_TAGS
from .attrs import get_flags
@ -21,13 +22,13 @@ class English(object):
if data_dir is None:
data_dir = path.join(path.dirname(__file__), 'data')
self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
for pos_str in POS_TAGS:
_ = self.vocab.strings.pos_tags[pos_str]
self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
if pos_tag:
self.pos_tagger = EnPosTagger(data_dir,
Morphologizer.from_dir(
self.vocab.strings,
Lemmatizer(path.join(data_dir, 'wordnet')),
data_dir))
morph = Morphologizer(self.vocab.strings, POS_TAGS,
Lemmatizer(path.join(data_dir, 'wordnet')))
self.pos_tagger = EnPosTagger(data_dir, morph)
else:
self.pos_tagger = None
if parse:

View File

@ -35,15 +35,15 @@ cdef struct _Cached:
cdef class Morphologizer:
"""Given a POS tag and a Lexeme, find its lemma and morphological analysis.
"""
def __init__(self, StringStore strings, object lemmatizer,
irregulars=None, tag_map=None, tag_names=None):
def __init__(self, StringStore strings, object tag_map, object lemmatizer,
irregulars=None):
self.mem = Pool()
self.strings = strings
self.tag_names = tag_names
self.lemmatizer = lemmatizer
self._cache = PreshMapArray(len(self.tag_names))
self.tags = <PosTag*>self.mem.alloc(len(self.tag_names), sizeof(PosTag))
for i, tag in enumerate(self.tag_names):
cdef int n_tags = len(self.strings.pos_tags) + 1
self._cache = PreshMapArray(n_tags)
self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
for tag, i in self.strings.pos_tags:
pos, props = tag_map[tag]
self.tags[i].id = i
self.tags[i].pos = pos
@ -57,15 +57,6 @@ cdef class Morphologizer:
if irregulars is not None:
self.load_exceptions(irregulars)
@classmethod
def from_dir(cls, StringStore strings, object lemmatizer, data_dir):
tagger_cfg = json.loads(open(path.join(data_dir, 'pos', 'config.json')).read())
tag_map = tagger_cfg['tag_map']
tag_names = tagger_cfg['tag_names']
irregulars = json.loads(open(path.join(data_dir, 'morphs.json')).read())
return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars,
tag_names=tag_names)
cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1:
if self.lemmatizer is None:
return lex.sic
@ -104,9 +95,10 @@ cdef class Morphologizer:
cdef dict props
cdef int lemma
cdef id_t sic
cdef univ_tag_t pos
cdef int pos
for pos_str, entries in exc.items():
pos = self.tag_names.index(pos_str)
pos = self.strings.pos_tags[pos_str]
assert pos < len(self.strings.pos_tags)
for form_str, props in entries.items():
lemma_str = props.get('L', form_str)
sic = self.strings[form_str]

View File

@ -19,6 +19,8 @@ cdef class _SymbolMap:
cdef class StringStore:
cdef Pool mem
cdef Utf8Str* strings
cdef readonly _SymbolMap pos_tags
cdef readonly _SymbolMap dep_tags
cdef size_t size
cdef PreshMap _map

View File

@ -18,6 +18,9 @@ cdef class _SymbolMap:
for id_, string in enumerate(self._id_to_string[1:]):
yield string, id_
def __len__(self):
return len(self._id_to_string)
def __getitem__(self, object string_or_id):
cdef bytes byte_string
if isinstance(string_or_id, int) or isinstance(string_or_id, long):
@ -42,6 +45,7 @@ cdef class StringStore:
self.mem = Pool()
self._map = PreshMap()
self._resize_at = 10000
self.size = 1
self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
self.pos_tags = _SymbolMap()
self.dep_tags = _SymbolMap()

View File

@ -18,6 +18,3 @@ cdef class Tagger:
cpdef readonly Pool mem
cpdef readonly Extractor extractor
cpdef readonly LinearModel model
cpdef readonly list tag_names
cdef dict tagdict

View File

@ -12,15 +12,13 @@ import cython
from thinc.features cimport Feature, count_feats
def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir):
def setup_model_dir(tag_names, templates, model_dir):
if path.exists(model_dir):
shutil.rmtree(model_dir)
os.mkdir(model_dir)
config = {
'templates': templates,
'tag_names': tag_names,
'tag_map': tag_map,
'tag_counts': tag_counts,
}
with open(path.join(model_dir, 'config.json'), 'w') as file_:
json.dump(config, file_)
@ -37,10 +35,9 @@ cdef class Tagger:
univ_counts = {}
cdef unicode tag
cdef unicode univ_tag
self.tag_names = cfg['tag_names']
self.tagdict = _make_tag_dict(cfg['tag_counts'])
tag_names = cfg['tag_names']
self.extractor = Extractor(templates)
self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2)
self.model = LinearModel(len(tag_names) + 1, self.extractor.n_templ+2) # TODO
if path.exists(path.join(model_dir, 'model')):
self.model.load(path.join(model_dir, 'model'))
@ -63,30 +60,6 @@ cdef class Tagger:
self.model.update(counts)
return guess
def tag_id(self, object tag_name):
"""Encode tag_name into a tag ID integer."""
tag_id = self.tag_names.index(tag_name)
if tag_id == -1:
tag_id = len(self.tag_names)
self.tag_names.append(tag_name)
return tag_id
def _make_tag_dict(counts):
    """Build a dict mapping word IDs to a single, unambiguous tag ID.

    A word is included only if it was seen at least `freq_thresh` times
    AND its most frequent (modal) tag accounts for at least
    `ambiguity_thresh` of its occurrences — i.e. the word is effectively
    unambiguous, so the tagger can skip classification for it.

    counts: dict mapping word-ID strings to {tag-ID string: frequency}
            (keys are strings, e.g. as loaded from JSON, hence the int()
            conversions below).
    Returns: dict of int word ID -> int tag ID.
    """
    freq_thresh = 20          # ignore words rarer than this
    ambiguity_thresh = 0.97   # modal tag must cover >= 97% of occurrences
    tagdict = {}
    cdef atom_t word
    cdef atom_t tag
    for word_str, tag_freqs in counts.items():
        # Modal tag for this word and its frequency.
        tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1])
        n = sum(tag_freqs.values())
        word = int(word_str)
        tag = int(tag_str)
        if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
            tagdict[word] = tag
    return tagdict
cdef int _arg_max(const weight_t* scores, int n_classes) except -1:
cdef int best = 0

View File

@ -39,10 +39,10 @@ cdef class Token:
cdef readonly StringStore string_store
cdef public int i
cdef public int idx
cdef int pos
cdef readonly int pos_id
cdef readonly int dep_id
cdef int lemma
cdef public int head
cdef public int dep_tag
cdef public atom_t id
cdef public atom_t cluster

View File

@ -1,19 +1,25 @@
from __future__ import unicode_literals
from spacy.en import EN
from spacy.en import English
import pytest
from spacy.en import English
@pytest.fixture
def EN():
    """English pipeline with POS tagging enabled; parsing is disabled
    to keep the fixture cheap for morphology-only tests."""
    return English(pos_tag=True, parse=False)
@pytest.fixture
def morph_exc():
    """Morphology exception table: POS tag -> surface form -> properties.

    'L' is the lemma override used by the morphologizer when present;
    the remaining keys ('person', 'case') are morphological feature
    values attached to the form.
    """
    return {
        'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}},
    }
def test_load_exc(morph_exc):
EN.load()
EN.morphologizer.load_exceptions(morph_exc)
tokens = EN.tokenize('I like his style.')
EN.set_pos(tokens)
def test_load_exc(EN, morph_exc):
    """Loading a morphology exception table overrides lemma/tag for 'his'."""
    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
    tokens = EN('I like his style.', pos_tag=True)
    his = tokens[2]  # 'I', 'like', 'his', ...
    # NOTE(review): Token.pos is declared `cdef int pos` in this commit,
    # yet it is compared to the string 'PRP$' here — presumably a
    # string-returning property is expected; confirm against tokens.pxd.
    assert his.pos == 'PRP$'
    assert his.lemma == '-PRP-'