* Fix POS tagger so that it loads correctly. Lexemes are now being read in.

2025-11-25 20:36:02 +03:00 · 2014-10-30 13:38:55 +11:00 · 2014-10-30 13:38:55 +11:00 · 889b7b48b4
commit 889b7b48b4
parent 67c8c8019f
2 changed files with 10 additions and 9 deletions
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@ -300,6 +300,7 @@ cdef class Lexicon:
        assert fp != NULL
        cdef size_t st
        cdef Lexeme* lexeme
        i = 0
        while True:
            lexeme = <Lexeme*>self.mem.alloc(sizeof(Lexeme), 1)
            st = fread(lexeme, sizeof(Lexeme), 1, fp)
@ -307,6 +308,8 @@ cdef class Lexicon:
                break
            self.lexemes.push_back(lexeme)
            self._dict.set(lexeme.hash, lexeme)
            i += 1
        print "Load %d lexemes" % i
        fclose(fp)
--- a/spacy/pos.pyx
+++ b/spacy/pos.pyx
@ -24,21 +24,19 @@ cdef class Tagger:
    tags = {'NULL': NULL_TAG}
    def __init__(self, model_dir):
        self.mem = Pool()
-        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
+        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        self.model = LinearModel(len(self.tags), self.extractor.n)
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
        self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))
        self._guess = NULL_TAG
        if path.exists(path.join(model_dir, 'model')):
            self.model.load(path.join(model_dir, 'model'))
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        if path.exists(path.join(model_dir, 'strings')):
            EN.lexicon.strings.load(path.join(model_dir, 'strings'))
    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        assert i >= 0