# spaCy/spacy/en/pos.pyx

from os import path
import json
import os
import shutil

from libc.string cimport memset

from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t

from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON

from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max

from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t

from .lemmatizer import Lemmatizer


# Values for the 'person' entry of the morphology property dicts in POS_TAGS.
# Members are numbered consecutively from 0 in declaration order, so
# NO_PERSON is 0. In the code visible here only THIRD is referenced (the VBZ,
# BES and HVS rows of POS_TAGS).
cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD


# Values for the 'number' entry of the morphology property dicts in POS_TAGS.
# Members are numbered consecutively from 0, so NO_NUMBER is 0. In the code
# visible here only PLURAL is referenced (the NNS and NNPS rows).
cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS


# Gender values, numbered consecutively from 0 (NO_GENDER is 0).
# NOTE(review): no member of this enum is referenced by the code visible in
# this file; presumably it is consumed by the morphology code elsewhere —
# confirm before changing or removing it.
cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER


# Values for the 'case' entry of the morphology property dicts in POS_TAGS.
# Members are numbered consecutively from 0, so NO_CASE is 0. In the code
# visible here only GENITIVE is referenced (the POS, PRP$ and WP$ rows).
cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM


# Values for the 'tenspect' entry of the morphology property dicts in
# POS_TAGS. Members are numbered consecutively from 0, so NO_TENSE is 0.
# POS_TAGS uses MODAL (MD), PAST (VBD), ING (VBG), PASSIVE (VBN) and PRESENT
# (VBP, VBZ, BES, HVS); BASE_VERB is not referenced in the code visible here.
cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


# Values for the 'misc' entry of the morphology property dicts in POS_TAGS.
# Members are numbered consecutively from 0, so NO_MISC is 0. POS_TAGS uses
# COMPARATIVE (JJR, RBR), SUPERLATIVE (JJS, RBS), RELATIVE (WDT, WP, WP$, WRB)
# and NAME (NNP, NNPS).
cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME


# Indices into the atom_t context array that fill_context builds for a token.
# There are five groups of eight slots, one per window position: P2 and P1
# are the two tokens before the current one, W is the current token, and N1
# and N2 are the two tokens after it. Within a group the slots follow the
# order in which _fill_from_token writes them, so the declaration order below
# must not be changed independently of that function.
#
# Two slot names differ from what is actually stored: the *_orth slot receives
# lex.lower and the *_pos slot receives the token's `tag` field.
cpdef enum:
    # Token at position i - 2.
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    # Token at position i - 1.
    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    # Token at position i (the one being tagged).
    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    # Token at position i + 1.
    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    # Token at position i + 2.
    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    # Not a feature: the total number of slots, used to size context arrays.
    N_CONTEXT_FIELDS


# Maps each fine-grained tag string to a pair
# (coarse part-of-speech constant, dict of morphology properties).
# The property dict keys used here are 'misc', 'tenspect', 'number', 'case'
# and 'person'; their values come from the enums declared above. An empty
# dict means the tag carries no extra morphology.
# NOTE(review): EnPosTagger reads its tag map from config.json rather than
# from this table directly; presumably this dict is what callers pass to
# setup_model_dir as `tag_map` — confirm against the training script.
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (PRON, {}),
    'PRP$': (PRON, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    # Punctuation and bracket tags.
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
    "ADD": (X, {}),
    "NFP": (PUNCT, {}),
    "GW": (X, {}),
    "AFX": (X, {}),
    "HYPH": (PUNCT, {}),
    "XX": (X, {}),
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "SP": (SPACE, {})
}


# Feature templates for the tagger. Each inner tuple lists one or more
# context-array indices (from the anonymous enum above); a tuple with several
# indices combines those slots into a single feature.
# NOTE(review): EnPosTagger takes its templates from config.json; presumably
# this tuple is what callers pass to setup_model_dir as `templates` — confirm.
POS_TEMPLATES = (
    (W_orth,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_orth,),
    (N2_orth,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_orth),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    (W_flags,),
    (N1_flags,),
    (N2_flags,),
    (P1_flags,),
    (P2_flags,),
)


# Value stored in EnPosTagger._morph_cache, keyed by (tag id, word orth).
# It holds everything set_morph copies onto a token besides the coarse POS:
#   morph - the morphological features for this word under this tag
#   lemma - the string-store id of the lemma
cdef struct _CachedMorph:
    Morphology morph
    int lemma


def setup_model_dir(tag_names, tag_map, templates, model_dir):
    """Create a fresh model directory holding the tagger's config.json.

    WARNING: if model_dir already exists it is deleted recursively before
    being recreated, so any previously trained model stored there is lost.

    Args:
        tag_names: The tag names the tagger can predict.
        tag_map: Mapping from tag name to (pos, morphology props).
        templates: Feature templates, as tuples of context field indices.
        model_dir: Path of the directory to (re)create. Missing parent
            directories are created as well.

    All three data arguments must be JSON-serialisable; they are written
    under the keys 'tag_names', 'tag_map' and 'templates'.
    """
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    # makedirs rather than mkdir: a first-time setup should not fail just
    # because the parent data directory has not been created yet.
    os.makedirs(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)


cdef class EnPosTagger:
    """A part-of-speech tagger for English.

    For each token the tagger scores the context features built by
    fill_context, takes the best-scoring class as the fine-grained tag, and
    then sets the token's coarse part-of-speech, morphology and lemma from
    that tag (see set_morph).
    """
    def __init__(self, StringStore strings, data_dir):
        """Load the tagger's configuration, model and lemmatizer.

        Args:
            strings (StringStore): String table used to encode tag and lemma
                strings as integers.
            data_dir: Directory containing 'pos/config.json' (with the keys
                'tag_names', 'tag_map' and 'templates'), the 'wordnet'
                directory handed to the Lemmatizer and, optionally,
                'tokenizer/morphs.json' with morphology exceptions.
        """
        cdef int n_tags
        self.mem = Pool()
        model_dir = path.join(data_dir, 'pos')
        self.strings = strings
        with open(path.join(model_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        # Tag ids are positions in this sorted list.
        self.tag_names = sorted(cfg['tag_names'])
        assert self.tag_names
        self.n_tags = len(self.tag_names)
        self.tag_map = cfg['tag_map']
        # NOTE(review): the model, the morphology cache and the tag table are
        # all sized for one more class than there are tag names. The reason
        # is not visible in this file; do not change it without checking
        # compatibility with saved models.
        n_tags = len(self.tag_names) + 1

        self.model = Model(n_tags, cfg['templates'], model_dir)
        self._morph_cache = PreshMapArray(n_tags)
        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
        for i, tag in enumerate(self.tag_names):
            pos, props = self.tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            set_morph_from_dict(&self.tags[i].morph, props)
        morphs_loc = path.join(data_dir, 'tokenizer', 'morphs.json')
        if path.exists(morphs_loc):
            with open(morphs_loc) as file_:
                morph_exceptions = json.load(file_)
            self.load_morph_exceptions(morph_exceptions)
        self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Tokens whose `pos` field is already non-zero are left untouched.
        After tagging, the Doc is marked as tagged and its cached Python
        token objects are reset.

        Args:
            tokens (Doc): The tokens to be tagged.

        Returns:
            0 for an empty Doc (which is left unmodified), otherwise None.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef const weight_t* scores
        if tokens.length == 0:
            return 0
        for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                fill_context(context, i, tokens.data)
                scores = self.model.score(context)
                guess = arg_max(scores, self.model.n_classes)
                tokens.data[i].tag = self.strings[self.tag_names[guess]]
                self.set_morph(i, &self.tags[guess], tokens.data)

        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """Assign the given tags to the tokens instead of predicting them.

        Args:
            tokens (Doc): The tokens to be tagged.
            tag_strs: A sequence with one tag name per token.

        Raises:
            ValueError: If a tag name is not one of the tagger's tag names.
        """
        cdef int i
        for i in range(tokens.length):
            tokens.data[i].tag = self.strings[tag_strs[i]]
            self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],
                           tokens.data)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def train(self, Doc tokens, object gold_tag_strs):
        """Update the model from one gold-tagged Doc.

        Each token is tagged with the model's current prediction (not with
        the gold tag), and the model is updated against the gold tag.

        Args:
            tokens (Doc): The tokens to train on.
            gold_tag_strs: A sequence with one gold tag name per token; None
                marks a missing value, which never counts as an error.

        Returns:
            The number of tokens with zero loss, i.e. predicted correctly or
            lacking a gold tag.

        Raises:
            ValueError: If a gold tag name is not one of the tagger's tag
                names.
        """
        cdef int i
        cdef int loss
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef const weight_t* scores
        # -1 encodes a missing gold tag.
        golds = [self.tag_names.index(g) if g is not None else -1
                 for g in gold_tag_strs]
        correct = 0
        for i in range(tokens.length):
            fill_context(context, i, tokens.data)
            scores = self.model.score(context)
            guess = arg_max(scores, self.model.n_classes)
            loss = guess != golds[i] if golds[i] != -1 else 0
            self.model.update(context, guess, golds[i], loss)
            tokens.data[i].tag = self.strings[self.tag_names[guess]]
            self.set_morph(i, &self.tags[guess], tokens.data)
            correct += loss == 0
        return correct

    cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:
        """Set pos, lemma and morphology on tokens[i] according to tag.

        The lemma and morphology for each (tag id, word orth) pair are
        computed once and cached; entries preloaded by load_morph_exceptions
        take precedence because they are already in the cache.
        """
        cdef _CachedMorph* cached
        tokens[i].pos = tag.pos
        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
        if cached is NULL:
            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
        """Return the string-store id of the lemma of lex under pos.

        Only NOUN, VERB and ADJ are lemmatized. For any other part of speech,
        when no lemmatizer is loaded, or when the lemmatizer yields no
        candidates, the word's own orth id is returned. When several
        candidate lemmas come back, the alphabetically first one is chosen.
        """
        cdef unicode py_string
        cdef set lemma_strings
        cdef unicode lemma_string
        if self.lemmatizer is None:
            return lex.orth
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.orth
        # Decode the string only once we know it will be lemmatized.
        py_string = self.strings[lex.orth]
        lemma_strings = self.lemmatizer(py_string, pos)
        if not lemma_strings:
            return lex.orth
        lemma_string = min(lemma_strings)
        return self.strings[lemma_string]

    def load_morph_exceptions(self, dict exc):
        """Preload the morphology cache with hand-specified entries.

        Args:
            exc (dict): Maps a tag name to a dict that maps a word form to
                its property dict. The lemma is read from the property key
                'L' and defaults to the form itself; the whole property dict
                is passed to set_morph_from_dict to fill in the morphology.

        Raises:
            ValueError: If a tag name is not one of the tagger's tag names.
        """
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef attr_t orth
        cdef int pos
        cdef _CachedMorph* cached
        for pos_str, entries in exc.items():
            # Despite the name, this is the tag's id (its index in tag_names).
            pos = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
                orth = self.strings[form_str]
                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
                self._morph_cache.set(pos, orth, <void*>cached)


cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    # Fill the context array for token i from a five-token window: the two
    # tokens before it, the token itself and the two tokens after it. Each
    # window position writes its own 8-slot group, starting at the *_orth
    # index of that group.
    # NOTE(review): positions i-2 .. i+2 are read without bounds checks;
    # presumably the token buffer is padded at both ends — confirm in Doc.
    cdef const TokenC* current = &tokens[i]
    _fill_from_token(&context[P2_orth], current - 2)
    _fill_from_token(&context[P1_orth], current - 1)
    _fill_from_token(&context[W_orth], current)
    _fill_from_token(&context[N1_orth], current + 1)
    _fill_from_token(&context[N2_orth], current + 2)
    return 0


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the eight features of token t into context[0..7].
    # Slots 0-4 are lexical attributes, slot 5 is the token's tag and slot 6
    # its lemma.
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    # Slot 7 is a single coarse word-class code. The flags are tested in
    # priority order and the first one that is set decides the code:
    # alphabetic -> 1, punctuation -> 2, URL-like -> 3, number-like -> 4.
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
        return
    if t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
        return
    if t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
        return
    if t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
        return
    # None of the four flags is set.
    context[7] = 0
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`from os import path`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`import json`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`import os`
			`import shutil`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00
* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00			`from libc.string cimport memset`

			`from cymem.cymem cimport Address`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`from thinc.typedefs cimport atom_t, weight_t`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Move POS tag definitions to parts_of_speech.pxd 2015-01-25 08:31:07 +03:00			`from ..parts_of_speech cimport univ_pos_t`
* Fix Issue #22: PRP and PRP$ were mapped to NOUN. Should be PRON. 2015-02-09 02:36:18 +03:00			`from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON`

* Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to make whitespace an entity 2015-07-09 14:30:41 +03:00			`from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE`
* Tmp commit. Refactoring to create a Python Lexeme class. 2015-01-12 02:26:22 +03:00			`from ..structs cimport TokenC, Morphology, LexemeC`
* Break up tokens.pyx into tokens/doc.pyx, tokens/token.pyx, tokens/spans.pyx 2015-07-13 21:20:58 +03:00			`from ..tokens.doc cimport Doc`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`from ..morphology cimport set_morph_from_dict`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`from .._ml cimport arg_max`

* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`from .attrs cimport IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL`
* Fix type declarations for attr_t. Remove unused id_t. 2015-07-18 23:39:57 +03:00			`from ..typedefs cimport attr_t`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`from .lemmatizer import Lemmatizer`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00

			`cpdef enum en_person_t:`
			`NO_PERSON`
			`FIRST`
			`SECOND`
			`THIRD`
			`NON_THIRD`


			`cpdef enum en_number_t:`
			`NO_NUMBER`
			`SINGULAR`
			`PLURAL`
			`MASS`


			`cpdef enum en_gender_t:`
			`NO_GENDER`
			`MASCULINE`
			`FEMININE`
			`NEUTER`


			`cpdef enum en_case_t:`
			`NO_CASE`
			`NOMINATIVE`
			`GENITIVE`
			`ACCUSATIVE`
			`REFLEXIVE`
			`DEMONYM`


			`cpdef enum en_tenspect_t:`
			`NO_TENSE`
			`BASE_VERB`
			`PRESENT`
			`PAST`
			`PASSIVE`
			`ING`
			`MODAL`


			`cpdef enum misc_t:`
			`NO_MISC`
			`COMPARATIVE`
			`SUPERLATIVE`
			`RELATIVE`
			`NAME`


			`cpdef enum:`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`P2_orth`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`P2_cluster`
			`P2_shape`
			`P2_prefix`
			`P2_suffix`
			`P2_pos`
			`P2_lemma`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`P2_flags`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`P1_orth`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`P1_cluster`
			`P1_shape`
			`P1_prefix`
			`P1_suffix`
			`P1_pos`
			`P1_lemma`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`P1_flags`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`W_orth`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`W_cluster`
			`W_shape`
			`W_prefix`
			`W_suffix`
			`W_pos`
			`W_lemma`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`W_flags`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`N1_orth`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`N1_cluster`
			`N1_shape`
			`N1_prefix`
			`N1_suffix`
			`N1_pos`
			`N1_lemma`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`N1_flags`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`N2_orth`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`N2_cluster`
			`N2_shape`
			`N2_prefix`
			`N2_suffix`
			`N2_pos`
			`N2_lemma`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`N2_flags`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
			`N_CONTEXT_FIELDS`


			`POS_TAGS = {`
			`'NULL': (NO_TAG, {}),`
			`'EOL': (EOL, {}),`
			`'CC': (CONJ, {}),`
			`'CD': (NUM, {}),`
			`'DT': (DET, {}),`
			`'EX': (DET, {}),`
			`'FW': (X, {}),`
			`'IN': (ADP, {}),`
			`'JJ': (ADJ, {}),`
			`'JJR': (ADJ, {'misc': COMPARATIVE}),`
			`'JJS': (ADJ, {'misc': SUPERLATIVE}),`
			`'LS': (X, {}),`
			`'MD': (VERB, {'tenspect': MODAL}),`
			`'NN': (NOUN, {}),`
			`'NNS': (NOUN, {'number': PLURAL}),`
			`'NNP': (NOUN, {'misc': NAME}),`
			`'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),`
			`'PDT': (DET, {}),`
			`'POS': (PRT, {'case': GENITIVE}),`
* Fix Issue #22: PRP and PRP$ were mapped to NOUN. Should be PRON. 2015-02-09 02:36:18 +03:00			`'PRP': (PRON, {}),`
			`'PRP$': (PRON, {'case': GENITIVE}),`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`'RB': (ADV, {}),`
			`'RBR': (ADV, {'misc': COMPARATIVE}),`
			`'RBS': (ADV, {'misc': SUPERLATIVE}),`
			`'RP': (PRT, {}),`
			`'SYM': (X, {}),`
			`'TO': (PRT, {}),`
			`'UH': (X, {}),`
			`'VB': (VERB, {}),`
			`'VBD': (VERB, {'tenspect': PAST}),`
			`'VBG': (VERB, {'tenspect': ING}),`
			`'VBN': (VERB, {'tenspect': PASSIVE}),`
			`'VBP': (VERB, {'tenspect': PRESENT}),`
			`'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),`
			`'WDT': (DET, {'misc': RELATIVE}),`
			`'WP': (PRON, {'misc': RELATIVE}),`
			`'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),`
			`'WRB': (ADV, {'misc': RELATIVE}),`
			`'!': (PUNCT, {}),`
			`'#': (PUNCT, {}),`
			`'$': (PUNCT, {}),`
			`"''": (PUNCT, {}),`
			`"(": (PUNCT, {}),`
			`")": (PUNCT, {}),`
			`"-LRB-": (PUNCT, {}),`
			`"-RRB-": (PUNCT, {}),`
			`".": (PUNCT, {}),`
			`",": (PUNCT, {}),`
			"``": (PUNCT, {}),
			`":": (PUNCT, {}),`
			`"?": (PUNCT, {}),`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`"ADD": (X, {}),`
			`"NFP": (PUNCT, {}),`
			`"GW": (X, {}),`
			`"AFX": (X, {}),`
			`"HYPH": (PUNCT, {}),`
* Add POS tags to support SWBD tag set 2015-02-11 22:08:28 +03:00			`"XX": (X, {}),`
			`"BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),`
* Add SPACE part-of-speech tag, and train tagger to assign it. Also train tagger not to make whitespace an entity 2015-07-09 14:30:41 +03:00			`"HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),`
			`"SP": (SPACE, {})`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`}`


			`POS_TEMPLATES = (`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`(W_orth,),`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`(P1_lemma, P1_pos),`
			`(P2_lemma, P2_pos),`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`(N1_orth,),`
			`(N2_orth,),`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
			`(W_suffix,),`
			`(W_prefix,),`

			`(P1_pos,),`
			`(P2_pos,),`
			`(P1_pos, P2_pos),`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`(P1_pos, W_orth),`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`(P1_suffix,),`
			`(N1_suffix,),`

			`(W_shape,),`
			`(W_cluster,),`
			`(N1_cluster,),`
			`(N2_cluster,),`
			`(P1_cluster,),`
			`(P2_cluster,),`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00
			`(W_flags,),`
			`(N1_flags,),`
			`(N2_flags,),`
			`(P1_flags,),`
			`(P2_flags,),`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`)`


* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`cdef struct _CachedMorph:`
			`Morphology morph`
			`int lemma`


* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`def setup_model_dir(tag_names, tag_map, templates, model_dir):`
			`if path.exists(model_dir):`
			`shutil.rmtree(model_dir)`
			`os.mkdir(model_dir)`
			`config = {`
			`'templates': templates,`
			`'tag_names': tag_names,`
			`'tag_map': tag_map`
			`}`
			`with open(path.join(model_dir, 'config.json'), 'w') as file_:`
			`json.dump(config, file_)`


* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00			`cdef class EnPosTagger:`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""A part-of-speech tagger for English"""`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`def __init__(self, StringStore strings, data_dir):`
* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00			`self.mem = Pool()`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`model_dir = path.join(data_dir, 'pos')`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`self.strings = strings`
			`cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`self.tag_names = sorted(cfg['tag_names'])`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`assert self.tag_names`
* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00			`self.n_tags = len(self.tag_names)`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`self.tag_map = cfg['tag_map']`
			`cdef int n_tags = len(self.tag_names) + 1`
* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`self.model = Model(n_tags, cfg['templates'], model_dir)`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`self._morph_cache = PreshMapArray(n_tags)`
			`self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))`
			`for i, tag in enumerate(sorted(self.tag_names)):`
			`pos, props = self.tag_map[tag]`
			`self.tags[i].id = i`
			`self.tags[i].pos = pos`
			`set_morph_from_dict(&self.tags[i].morph, props)`
* Fix loading of special morph words 2015-01-03 15:13:00 +03:00			`if path.exists(path.join(data_dir, 'tokenizer', 'morphs.json')):`
			`self.load_morph_exceptions(json.load(open(path.join(data_dir, 'tokenizer',`
			`'morphs.json'))))`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`def __call__(self, Doc tokens):`
			`"""Apply the tagger, setting the POS tags onto the Doc object.`
* Upd docstrings 2014-12-27 10:45:16 +03:00
			`Args:`
* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`tokens (Doc): The tokens to be tagged.`
* Upd docstrings 2014-12-27 10:45:16 +03:00			`"""`
* Fix issue #19: Allow parsing/pos tagging of empty strings 2015-02-10 18:15:58 +03:00			`if tokens.length == 0:`
			`return 0`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`cdef int i`
			`cdef atom_t[N_CONTEXT_FIELDS] context`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`cdef const weight_t* scores`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`for i in range(tokens.length):`
* Work on word vectors, and other stuff 2015-01-17 08:21:17 +03:00			`if tokens.data[i].pos == 0:`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`fill_context(context, i, tokens.data)`
			`scores = self.model.score(context)`
* Use values encoded by StringStore in POS tagging, rather than indices into a list of tags 2015-03-16 00:01:58 +03:00			`guess = arg_max(scores, self.model.n_classes)`
			`tokens.data[i].tag = self.strings[self.tag_names[guess]]`
			`self.set_morph(i, &self.tags[guess], tokens.data)`
* Clear buffered python tokens when modifying the Tokens object. Need to clean this up, and modify via a method on Tokens. 2015-02-09 11:56:51 +03:00
* Add error if try to access head and not is_parsed 2015-01-25 07:33:54 +03:00			`tokens.is_tagged = True`
* Clear buffered python tokens when modifying the Tokens object. Need to clean this up, and modify via a method on Tokens. 2015-02-09 11:56:51 +03:00			`tokens._py_tokens = [None] * tokens.length`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`def tag_from_strings(self, Doc tokens, object tag_strs):`
* Hastily add method to apply tags from a list of strings, instead of predicting the tags. 2015-02-23 23:40:17 +03:00			`cdef int i`
			`for i in range(tokens.length):`
* Use values encoded by StringStore in POS tagging, rather than indices into a list of tags 2015-03-16 00:01:58 +03:00			`tokens.data[i].tag = self.strings[tag_strs[i]]`
			`self.set_morph(i, &self.tags[self.tag_names.index(tag_strs[i])],`
			`tokens.data)`
* Hastily add method to apply tags from a list of strings, instead of predicting the tags. 2015-02-23 23:40:17 +03:00			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`

* Rename Tokens to Doc 2015-07-08 19:53:00 +03:00			`def train(self, Doc tokens, object gold_tag_strs):`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`cdef int i`
* Ensure parser and tagger function correctly when training from missing values, indicated by -1 2015-01-30 06:08:56 +03:00			`cdef int loss`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`cdef atom_t[N_CONTEXT_FIELDS] context`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`cdef const weight_t* scores`
* Ensure parser and tagger function correctly when training from missing values, indicated by -1 2015-01-30 06:08:56 +03:00			`golds = [self.tag_names.index(g) if g is not None else -1`
			`for g in gold_tag_strs]`
* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00			`correct = 0`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`for i in range(tokens.length):`
* Refactor _ml.Model, and finish implementing HastyModel so far not worthwhile. 2014-12-31 11:40:59 +03:00			`fill_context(context, i, tokens.data)`
			`scores = self.model.score(context)`
			`guess = arg_max(scores, self.model.n_classes)`
* Ensure parser and tagger function correctly when training from missing values, indicated by -1 2015-01-30 06:08:56 +03:00			`loss = guess != golds[i] if golds[i] != -1 else 0`
			`self.model.update(context, guess, golds[i], loss)`
* Use values encoded by StringStore in POS tagging, rather than indices into a list of tags 2015-03-16 00:01:58 +03:00			`tokens.data[i].tag = self.strings[self.tag_names[guess]]`
			`self.set_morph(i, &self.tags[guess], tokens.data)`
* Ensure parser and tagger function correctly when training from missing values, indicated by -1 2015-01-30 06:08:56 +03:00			`correct += loss == 0`
* Repurporse the Tagger class as a generic Model, wrapping thinc's interface 2014-12-30 13:20:15 +03:00			`return correct`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
* Use values encoded by StringStore in POS tagging, rather than indices into a list of tags 2015-03-16 00:01:58 +03:00			`cdef int set_morph(self, const int i, const PosTag* tag, TokenC* tokens) except -1:`
* Tmp 2014-12-24 09:42:00 +03:00			`tokens[i].pos = tag.pos`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`if cached is NULL:`
			`cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))`
			`cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)`
			`cached.morph = tag.morph`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`tokens[i].lemma = cached.lemma`
			`tokens[i].morph = cached.morph`

* Move POS tag definitions to parts_of_speech.pxd 2015-01-25 08:31:07 +03:00			`cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`if self.lemmatizer is None:`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`return lex.orth`
			`cdef unicode py_string = self.strings[lex.orth]`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`if pos != NOUN and pos != VERB and pos != ADJ:`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`return lex.orth`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`cdef set lemma_strings`
* Fix encoding in lemmatization 2015-01-05 03:54:29 +03:00			`cdef unicode lemma_string`
* Tmp. Working on refactor. Compiles, must hook up lexical feats. 2015-01-13 16:03:48 +03:00			`lemma_strings = self.lemmatizer(py_string, pos)`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`lemma_string = sorted(lemma_strings)[0]`
* Remove unnecessary key and id properties from Utf8String. 2015-07-16 20:29:02 +03:00			`lemma = self.strings[lemma_string]`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`return lemma`

			`def load_morph_exceptions(self, dict exc):`
			`cdef unicode pos_str`
			`cdef unicode form_str`
			`cdef unicode lemma_str`
			`cdef dict entries`
			`cdef dict props`
			`cdef int lemma`
* Fix type declarations for attr_t. Remove unused id_t. 2015-07-18 23:39:57 +03:00			`cdef attr_t orth`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`cdef int pos`
			`for pos_str, entries in exc.items():`
			`pos = self.tag_names.index(pos_str)`
			`for form_str, props in entries.items():`
			`lemma_str = props.get('L', form_str)`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`orth = self.strings[form_str]`
* All tests now passing for reorg 2014-12-23 05:18:59 +03:00			`cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))`
			`cached.lemma = self.strings[lemma_str]`
			`set_morph_from_dict(&cached.morph, props)`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`self._morph_cache.set(pos, orth, <void*>cached)`
Remove trailing whitespace 2015-04-19 11:31:31 +03:00
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
			`cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:`
* Rename sic to orth 2015-01-22 18:08:25 +03:00			`_fill_from_token(&context[P2_orth], &tokens[i-2])`
			`_fill_from_token(&context[P1_orth], &tokens[i-1])`
			`_fill_from_token(&context[W_orth], &tokens[i])`
			`_fill_from_token(&context[N1_orth], &tokens[i+1])`
			`_fill_from_token(&context[N2_orth], &tokens[i+2])`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00

			`cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`context[0] = t.lex.lower`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`context[1] = t.lex.cluster`
			`context[2] = t.lex.shape`
			`context[3] = t.lex.prefix`
			`context[4] = t.lex.suffix`
* Fix POS model: make it use tag instead of pos in history features 2015-04-29 01:02:53 +03:00			`context[5] = t.tag`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`context[6] = t.lemma`
* Tweak POS features for web text 2015-02-02 03:59:36 +03:00			`if t.lex.flags & (1 << IS_ALPHA):`
			`context[7] = 1`
			`elif t.lex.flags & (1 << IS_PUNCT):`
			`context[7] = 2`
			`elif t.lex.flags & (1 << LIKE_URL):`
			`context[7] = 3`
			`elif t.lex.flags & (1 << LIKE_NUM):`
			`context[7] = 4`
			`else:`
			`context[7] = 0`