spaCy/spacy/pos.pyx

# cython: profile=True
from os import path
import os
import shutil
import ujson
import random
import codecs
import gzip


from thinc.weights cimport arg_max
from thinc.features import NonZeroConjFeat
from thinc.features import ConjFeat

from .en import EN

from .lexeme cimport *


# Class id reserved for the 'NULL' tag; Tagger.tell_answer skips the weight
# update when the gold tag equals this value.
NULL_TAG = 0


cdef class Tagger:
    """Part-of-speech tagger that scores one token at a time with a linear model.

    `predict` extracts features for token `i` (a five-token window plus the
    two previous tags supplied by the caller) and returns the model's guess;
    `tell_answer` then updates the model against the gold tag for that guess.
    """
    # Tag-name -> class-id table. It lives on the class, so it is shared by
    # every Tagger instance and grows via `encode_pos` or a loaded postags.json.
    tags = {'NULL': NULL_TAG}

    def __init__(self, model_dir):
        """Build the tagger, loading any saved state found in `model_dir`.

        Each of the files 'postags.json', 'model' and 'strings' is optional
        and is loaded only if it exists under `model_dir`.
        """
        self.mem = Pool()
        # Load the saved tag table first: the model and the score buffer below
        # are both sized by the number of tags, so the table must be complete
        # before they are created.
        tags_loc = path.join(model_dir, 'postags.json')
        if path.exists(tags_loc):
            with open(tags_loc) as file_:
                Tagger.tags.update(ujson.load(file_))
        n_tags = len(self.tags)

        self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])
        self.model = LinearModel(n_tags, self.extractor.n)
        # Scratch buffers reused on every predict() call; owned by self.mem.
        self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))
        # One extra slot beyond extractor.n; predict() asserts it stays zero.
        self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))
        self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))
        self._scores = <weight_t*>self.mem.alloc(n_tags, sizeof(weight_t))
        self._guess = NULL_TAG

        model_loc = path.join(model_dir, 'model')
        if path.exists(model_loc):
            self.model.load(model_loc)
        strings_loc = path.join(model_dir, 'strings')
        if path.exists(strings_loc):
            EN.lexicon.strings.load(strings_loc)

    cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:
        """Score token `i` of `tokens` and return the predicted tag id.

        `prev` and `prev_prev` are the tag ids of the two preceding tokens.
        The guess is also stored in `self._guess` for `tell_answer`.

        NOTE(review): the signature is `except 0`, so a returned 0 (which is
        also NULL_TAG) is treated as an error by Cython callers -- confirm the
        model can never pick class 0.
        """
        assert i >= 0
        # Reads tokens.lex[i-2] .. tokens.lex[i+2]; presumably Tokens pads
        # both ends so these are valid at the sentence edges -- TODO confirm.
        get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],
                  tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)
        self.extractor.extract(self._feats, self._values, self._atoms, NULL)
        # The extra slot past the last feature must still be zero.
        assert self._feats[self.extractor.n] == 0
        self._guess = self.model.score(self._scores, self._feats, self._values)
        return self._guess

    cpdef bint tell_answer(self, class_t gold) except *:
        """Update the model given the gold tag for the last `predict` call.

        When the guess was right, or `gold` is NULL_TAG, the model is updated
        with an empty count table. Otherwise the features of the last
        prediction are counted +1 for `gold` and -1 for the guess. Always
        returns 0.
        """
        cdef class_t guess = self._guess
        if gold == guess or gold == NULL_TAG:
            # Still call update() with no counts; presumably this lets the
            # model advance its internal step counter -- verify in thinc.
            self.model.update({})
            return 0
        counts = {guess: {}, gold: {}}
        self.extractor.count(counts[gold], self._feats, 1)
        self.extractor.count(counts[guess], self._feats, -1)
        self.model.update(counts)
        return 0

    @classmethod
    def encode_pos(cls, tag):
        """Return the class id for `tag`, assigning the next free id if new."""
        return cls.tags.setdefault(tag, len(cls.tags))


# Indices into the atom (context) array filled by get_atoms().
#
# Prefixes name the token position relative to the token being tagged:
# P2/P1 are the two preceding tokens, N0 is the current token, N1/N2 are the
# two following tokens. Each token owns eight consecutive slots, written by
# _fill_token() in exactly this order, so the member order within a group
# must not change:
#   i, c (cluster), w (norm), shape, pref (prefix), suff (suffix),
#   oft_title, oft_upper (flag bits).
cpdef enum:
    P2i
    P2c
    P2w
    P2shape
    P2pref
    P2suff
    P2oft_title
    P2oft_upper

    P1i
    P1c
    P1w
    P1shape
    # NOTE(review): spelled `P1pre` while the other groups use `*pref`;
    # renaming would change an exported name, so it is left as is.
    P1pre
    P1suff
    P1oft_title
    P1oft_upper

    N0i
    N0c
    N0w
    N0shape
    N0pref
    N0suff
    N0oft_title
    N0oft_upper

    N1i
    N1c
    N1w
    N1shape
    N1pref
    N1suff
    N1oft_title
    N1oft_upper

    N2i
    N2c
    N2w
    N2shape
    N2pref
    N2suff
    N2oft_title
    N2oft_upper

    # Tags of the two preceding tokens (prev_prev_tag and prev_tag).
    P2t
    P1t

    # Not an atom: its value is the number of slots above, used to size the
    # atom buffer.
    CONTEXT_SIZE


cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,
                   Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:
    # Populate the context array `atoms` for one tagging decision: the two
    # previous tags, then the eight-slot group of each of the five window
    # tokens (p2, p1, n0, n1, n2). Returns 0.
    atoms[P2t] = prev_prev_tag
    atoms[P1t] = prev_tag

    # Each group starts at its `*i` index; _fill_token writes the 8 slots.
    _fill_token(atoms + P2i, p2)
    _fill_token(atoms + P1i, p1)
    _fill_token(atoms + N0i, n0)
    _fill_token(atoms + N1i, n1)
    _fill_token(atoms + N2i, n2)
    return 0


cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:
    # Write one token's eight context slots starting at `atoms`. The slot
    # offsets mirror the per-token member order of the atom index enum
    # (i, c, w, shape, pref, suff, oft_title, oft_upper).

    # Flag slots hold the masked bit itself (0 or the bit's value), not 0/1.
    atoms[6] = lex.flags & (1 << OFT_TITLE)
    atoms[7] = lex.flags & (1 << OFT_UPPER)

    # Lexical attribute slots, copied straight from the lexeme.
    atoms[0] = lex.i
    atoms[1] = lex.cluster
    atoms[2] = lex.norm
    atoms[3] = lex.shape
    atoms[4] = lex.prefix
    atoms[5] = lex.suffix


# Feature templates handed to the Extractor in Tagger.__init__. Each inner
# tuple lists atom indices; a tuple with several indices presumably yields one
# conjoined feature -- verify against thinc's ConjFeat.
# NOTE(review): the tuple order presumably fixes feature numbering, so
# reordering may invalidate saved models -- confirm before editing.
TEMPLATES = (
    # Current token.
    (N0i,),
    (N0w,),
    (N0suff,),
    (N0pref,),
    # Previous tags, alone and in combination.
    (P1t,),
    (P2t,),
    (P1t, P2t),
    (P1t, N0w),
    # Surrounding tokens.
    (P1w,),
    (P1suff,),
    (P2w,),
    (N1w,),
    (N1suff,),
    (N2w,),

    # Shape, cluster and flag-bit atoms.
    (N0shape,),
    (N0c,),
    (N1c,),
    (N2c,),
    (P1c,),
    (P2c,),
    (N0oft_upper,),
    (N0oft_title,),
)
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`# cython: profile=True`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`from os import path`
			`import os`
			`import shutil`
			`import ujson`
			`import random`
			`import codecs`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`import gzip`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00

			`from thinc.weights cimport arg_max`
			`from thinc.features import NonZeroConjFeat`
			`from thinc.features import ConjFeat`

			`from .en import EN`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00
			`from .lexeme cimport *`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00

			`NULL_TAG = 0`


			`cdef class Tagger:`
			`tags = {'NULL': NULL_TAG}`
			`def __init__(self, model_dir):`
			`self.mem = Pool()`
			`self.extractor = Extractor(TEMPLATES, [ConjFeat for _ in TEMPLATES])`
			`self.model = LinearModel(len(self.tags), self.extractor.n)`
			`self._atoms = <atom_t*>self.mem.alloc(CONTEXT_SIZE, sizeof(atom_t))`
			`self._feats = <feat_t*>self.mem.alloc(self.extractor.n+1, sizeof(feat_t))`
			`self._values = <weight_t*>self.mem.alloc(self.extractor.n+1, sizeof(weight_t))`
			`self._scores = <weight_t*>self.mem.alloc(len(self.tags), sizeof(weight_t))`
			`self._guess = NULL_TAG`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`if path.exists(path.join(model_dir, 'model')):`
			`self.model.load(path.join(model_dir, 'model'))`
			`tags_loc = path.join(model_dir, 'postags.json')`
			`if path.exists(tags_loc):`
			`with open(tags_loc) as file_:`
			`Tagger.tags.update(ujson.load(file_))`
			`if path.exists(path.join(model_dir, 'strings')):`
			`EN.lexicon.strings.load(path.join(model_dir, 'strings'))`

* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`cpdef class_t predict(self, int i, Tokens tokens, class_t prev, class_t prev_prev) except 0:`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`assert i >= 0`
			`get_atoms(self._atoms, tokens.lex[i-2], tokens.lex[i-1], tokens.lex[i],`
			`tokens.lex[i+1], tokens.lex[i+2], prev, prev_prev)`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`self.extractor.extract(self._feats, self._values, self._atoms, NULL)`
			`assert self._feats[self.extractor.n] == 0`
			`self._guess = self.model.score(self._scores, self._feats, self._values)`
			`return self._guess`

			`cpdef bint tell_answer(self, class_t gold) except *:`
			`cdef class_t guess = self._guess`
			`if gold == guess or gold == NULL_TAG:`
			`self.model.update({})`
			`return 0`
			`counts = {guess: {}, gold: {}}`
			`self.extractor.count(counts[gold], self._feats, 1)`
			`self.extractor.count(counts[guess], self._feats, -1)`
			`self.model.update(counts)`

			`@classmethod`
			`def encode_pos(cls, tag):`
			`if tag not in cls.tags:`
			`cls.tags[tag] = len(cls.tags)`
			`return cls.tags[tag]`


			`cpdef enum:`
			`P2i`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`P2c`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`P2w`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`P2shape`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`P2pref`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`P2suff`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`P2oft_title`
			`P2oft_upper`

			`P1i`
			`P1c`
			`P1w`
			`P1shape`
			`P1pre`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`P1suff`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`P1oft_title`
			`P1oft_upper`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`N0i`
			`N0c`
			`N0w`
			`N0shape`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`N0pref`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`N0suff`
			`N0oft_title`
			`N0oft_upper`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`N1i`
			`N1c`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`N1w`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`N1shape`
			`N1pref`
			`N1suff`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`N1oft_title`
			`N1oft_upper`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00
			`N2i`
			`N2c`
			`N2w`
			`N2shape`
			`N2pref`
			`N2suff`
			`N2oft_title`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`N2oft_upper`

			`P2t`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`P1t`

* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`CONTEXT_SIZE`


* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`cdef int get_atoms(atom_t* atoms, Lexeme* p2, Lexeme* p1, Lexeme* n0, Lexeme* n1,`
			`Lexeme* n2, class_t prev_tag, class_t prev_prev_tag) except -1:`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00			`_fill_token(&atoms[P2i], p2)`
			`_fill_token(&atoms[P1i], p1)`
			`_fill_token(&atoms[N0i], n0)`
			`_fill_token(&atoms[N1i], n1)`
			`_fill_token(&atoms[N2i], n2)`
			`atoms[P1t] = prev_tag`
			`atoms[P2t] = prev_prev_tag`
* Refactoring get_atoms, improving tokens API 2014-10-22 06:10:56 +04:00
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`cdef inline void _fill_token(atom_t* atoms, Lexeme* lex) nogil:`
* Rewriting Lexeme serialization. 2014-10-29 15:19:38 +03:00			`atoms[0] = lex.i`
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`atoms[1] = lex.cluster`
			`atoms[2] = lex.norm`
			`atoms[3] = lex.shape`
			`atoms[4] = lex.prefix`
			`atoms[5] = lex.suffix`
* Upd for refactored Tokens class. Now gets 95.74, 185ms training on swbd_wsj_ewtb, eval on onto_web, Google POS tags. 2014-10-22 20:20:02 +04:00
* Large refactor, particularly to Python API 2014-10-23 17:59:17 +04:00			`atoms[6] = lex.flags & (1 << OFT_TITLE)`
			`atoms[7] = lex.flags & (1 << OFT_UPPER)`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00

			`TEMPLATES = (`
			`(N0i,),`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`(N0w,),`
			`(N0suff,),`
			`(N0pref,),`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`(P1t,),`
			`(P2t,),`
* Tagger now gets 97pc on wsj, parsing 19-21 in 500ms. Gets 92.7 on web text. 2014-10-22 05:57:06 +04:00			`(P1t, P2t),`
			`(P1t, N0w),`
			`(P1w,),`
			`(P1suff,),`
			`(P2w,),`
			`(N1w,),`
			`(N1suff,),`
			`(N2w,),`

			`(N0shape,),`
			`(N0c,),`
			`(N1c,),`
			`(N2c,),`
			`(P1c,),`
			`(P2c,),`
			`(N0oft_upper,),`
			`(N0oft_title,),`
* Add greedy pos tagger 2014-10-22 03:17:26 +04:00			`)`