From cf8d26c3d2ba6498668b48f7650a4464b3f2115c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 22 Dec 2014 08:54:47 +1100 Subject: [PATCH] * POS tagger training working after reorg --- spacy/en/__init__.py | 11 ++++++----- spacy/morphology.pyx | 26 +++++++++----------------- spacy/strings.pxd | 2 ++ spacy/strings.pyx | 4 ++++ spacy/tagger.pxd | 3 --- spacy/tagger.pyx | 33 +++------------------------------ spacy/tokens.pxd | 4 ++-- tests/test_morph_exceptions.py | 18 ++++++++++++------ 8 files changed, 38 insertions(+), 63 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 1cebec7ba..d17eaf61b 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -9,6 +9,7 @@ from ..tokens import Tokens from ..morphology import Morphologizer from .lemmatizer import Lemmatizer from .pos import EnPosTagger +from .pos import POS_TAGS from .attrs import get_flags @@ -21,13 +22,13 @@ class English(object): if data_dir is None: data_dir = path.join(path.dirname(__file__), 'data') self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props) + for pos_str in POS_TAGS: + _ = self.vocab.strings.pos_tags[pos_str] self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir) if pos_tag: - self.pos_tagger = EnPosTagger(data_dir, - Morphologizer.from_dir( - self.vocab.strings, - Lemmatizer(path.join(data_dir, 'wordnet')), - data_dir)) + morph = Morphologizer(self.vocab.strings, POS_TAGS, + Lemmatizer(path.join(data_dir, 'wordnet'))) + self.pos_tagger = EnPosTagger(data_dir, morph) else: self.pos_tagger = None if parse: diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index 9efee6da3..7f9df80da 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -35,15 +35,15 @@ cdef struct _Cached: cdef class Morphologizer: """Given a POS tag and a Lexeme, find its lemma and morphological analysis. """ - def __init__(self, StringStore strings, object lemmatizer, - irregulars=None, tag_map=None, tag_names=None): + def __init__(self, StringStore strings, object tag_map, object lemmatizer, + irregulars=None): self.mem = Pool() self.strings = strings - self.tag_names = tag_names self.lemmatizer = lemmatizer - self._cache = PreshMapArray(len(self.tag_names)) - self.tags = self.mem.alloc(len(self.tag_names), sizeof(PosTag)) - for i, tag in enumerate(self.tag_names): + cdef int n_tags = len(self.strings.pos_tags) + 1 + self._cache = PreshMapArray(n_tags) + self.tags = self.mem.alloc(n_tags, sizeof(PosTag)) + for tag, i in self.strings.pos_tags: pos, props = tag_map[tag] self.tags[i].id = i self.tags[i].pos = pos @@ -57,15 +57,6 @@ cdef class Morphologizer: if irregulars is not None: self.load_exceptions(irregulars) - @classmethod - def from_dir(cls, StringStore strings, object lemmatizer, data_dir): - tagger_cfg = json.loads(open(path.join(data_dir, 'pos', 'config.json')).read()) - tag_map = tagger_cfg['tag_map'] - tag_names = tagger_cfg['tag_names'] - irregulars = json.loads(open(path.join(data_dir, 'morphs.json')).read()) - return cls(strings, lemmatizer, tag_map=tag_map, irregulars=irregulars, - tag_names=tag_names) - cdef int lemmatize(self, const univ_tag_t pos, const Lexeme* lex) except -1: if self.lemmatizer is None: return lex.sic @@ -104,9 +95,10 @@ cdef class Morphologizer: cdef dict props cdef int lemma cdef id_t sic - cdef univ_tag_t pos + cdef int pos for pos_str, entries in exc.items(): - pos = self.tag_names.index(pos_str) + pos = self.strings.pos_tags[pos_str] + assert pos < len(self.strings.pos_tags) for form_str, props in entries.items(): lemma_str = props.get('L', form_str) sic = self.strings[form_str] diff --git a/spacy/strings.pxd b/spacy/strings.pxd index d5b674527..00a5fbf66 100644 --- a/spacy/strings.pxd +++ b/spacy/strings.pxd @@ -19,6 +19,8 @@ cdef class _SymbolMap: cdef class StringStore: cdef Pool mem cdef Utf8Str* strings + cdef readonly _SymbolMap pos_tags + cdef readonly _SymbolMap dep_tags cdef size_t size cdef PreshMap _map diff --git a/spacy/strings.pyx b/spacy/strings.pyx index c7aa9c7ac..903256874 100644 --- a/spacy/strings.pyx +++ b/spacy/strings.pyx @@ -18,6 +18,9 @@ cdef class _SymbolMap: for id_, string in enumerate(self._id_to_string[1:]): yield string, id_ + def __len__(self): + return len(self._id_to_string) + def __getitem__(self, object string_or_id): cdef bytes byte_string if isinstance(string_or_id, int) or isinstance(string_or_id, long): @@ -42,6 +45,7 @@ cdef class StringStore: self.mem = Pool() self._map = PreshMap() self._resize_at = 10000 + self.size = 1 self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str)) self.pos_tags = _SymbolMap() self.dep_tags = _SymbolMap() diff --git a/spacy/tagger.pxd b/spacy/tagger.pxd index 33732f987..abae428bb 100644 --- a/spacy/tagger.pxd +++ b/spacy/tagger.pxd @@ -18,6 +18,3 @@ cdef class Tagger: cpdef readonly Pool mem cpdef readonly Extractor extractor cpdef readonly LinearModel model - - cpdef readonly list tag_names - cdef dict tagdict diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 50aa05931..a4aae827f 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -12,15 +12,13 @@ import cython from thinc.features cimport Feature, count_feats -def setup_model_dir(tag_names, tag_map, tag_counts, templates, model_dir): +def setup_model_dir(tag_names, templates, model_dir): if path.exists(model_dir): shutil.rmtree(model_dir) os.mkdir(model_dir) config = { 'templates': templates, 'tag_names': tag_names, - 'tag_map': tag_map, - 'tag_counts': tag_counts, } with open(path.join(model_dir, 'config.json'), 'w') as file_: json.dump(config, file_) @@ -37,10 +35,9 @@ cdef class Tagger: univ_counts = {} cdef unicode tag cdef unicode univ_tag - self.tag_names = cfg['tag_names'] - self.tagdict = _make_tag_dict(cfg['tag_counts']) + tag_names = cfg['tag_names'] self.extractor = Extractor(templates) - self.model = LinearModel(len(self.tag_names), self.extractor.n_templ+2) + self.model = LinearModel(len(tag_names) + 1, self.extractor.n_templ+2) # TODO if path.exists(path.join(model_dir, 'model')): self.model.load(path.join(model_dir, 'model')) @@ -63,30 +60,6 @@ cdef class Tagger: self.model.update(counts) return guess - def tag_id(self, object tag_name): - """Encode tag_name into a tag ID integer.""" - tag_id = self.tag_names.index(tag_name) - if tag_id == -1: - tag_id = len(self.tag_names) - self.tag_names.append(tag_name) - return tag_id - - -def _make_tag_dict(counts): - freq_thresh = 20 - ambiguity_thresh = 0.97 - tagdict = {} - cdef atom_t word - cdef atom_t tag - for word_str, tag_freqs in counts.items(): - tag_str, mode = max(tag_freqs.items(), key=lambda item: item[1]) - n = sum(tag_freqs.values()) - word = int(word_str) - tag = int(tag_str) - if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh: - tagdict[word] = tag - return tagdict - cdef int _arg_max(const weight_t* scores, int n_classes) except -1: cdef int best = 0 diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd index ec16c77d6..5ee5d01f7 100644 --- a/spacy/tokens.pxd +++ b/spacy/tokens.pxd @@ -39,10 +39,10 @@ cdef class Token: cdef readonly StringStore string_store cdef public int i cdef public int idx - cdef int pos + cdef readonly int pos_id + cdef readonly int dep_id cdef int lemma cdef public int head - cdef public int dep_tag cdef public atom_t id cdef public atom_t cluster diff --git a/tests/test_morph_exceptions.py b/tests/test_morph_exceptions.py index 0d5d3d178..f60cb5683 100644 --- a/tests/test_morph_exceptions.py +++ b/tests/test_morph_exceptions.py @@ -1,19 +1,25 @@ from __future__ import unicode_literals -from spacy.en import EN +from spacy.en import English import pytest +from spacy.en import English + + +@pytest.fixture +def EN(): + return English(pos_tag=True, parse=False) + + @pytest.fixture def morph_exc(): return { 'PRP$': {'his': {'L': '-PRP-', 'person': 3, 'case': 2}}, } -def test_load_exc(morph_exc): - EN.load() - EN.morphologizer.load_exceptions(morph_exc) - tokens = EN.tokenize('I like his style.') - EN.set_pos(tokens) +def test_load_exc(EN, morph_exc): + EN.pos_tagger.morphologizer.load_exceptions(morph_exc) + tokens = EN('I like his style.', pos_tag=True) his = tokens[2] assert his.pos == 'PRP$' assert his.lemma == '-PRP-'