spaCy/spacy/en/pos.pyx

from os import path
import json

from thinc.typedefs cimport atom_t

from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from ..typedefs cimport X, PUNCT, EOL
from ..structs cimport TokenC, Morphology
from ..tokens cimport Tokens
from .lemmatizer import Lemmatizer


# Values for the 'person' morphology feature (POS_TAGS uses THIRD for 'VBZ').
# NO_PERSON is listed first, so it has the value 0; the remaining members
# count up from there in declaration order.
cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD


# Values for the 'number' morphology feature (POS_TAGS uses PLURAL for
# 'NNS' and 'NNPS'). NO_NUMBER is listed first, so it has the value 0.
cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS


# Gender values. NO_GENDER is listed first, so it has the value 0.
# NOTE(review): none of these members is referenced by POS_TAGS or by the
# tagger code in this file -- presumably consumed elsewhere; confirm.
cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER


# Values for the 'case' morphology feature (POS_TAGS uses GENITIVE for
# 'POS', 'PRP$' and 'WP$'). NO_CASE is listed first, so it has the value 0.
cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM


# Values for the 'tenspect' morphology feature. In POS_TAGS: 'MD' -> MODAL,
# 'VBD' -> PAST, 'VBG' -> ING, 'VBN' -> PASSIVE, 'VBP'/'VBZ' -> PRESENT.
# NO_TENSE is listed first, so it has the value 0.
# NOTE(review): the name suggests a combined tense/aspect field -- confirm.
cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


# Values for the 'misc' morphology feature. In POS_TAGS: COMPARATIVE and
# SUPERLATIVE mark the JJR/RBR and JJS/RBS tags, RELATIVE marks the W* tags,
# and NAME marks 'NNP'/'NNPS'. NO_MISC is listed first, so it has the value 0.
cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME


# Slot indices into the tagger's context array (atom_t[N_CONTEXT_FIELDS]).
#
# There are five groups of eight slots, one group per token position:
# fill_context() fills P2_* from tokens[i-2], P1_* from tokens[i-1], W_* from
# tokens[i], N1_* from tokens[i+1] and N2_* from tokens[i+2].
#
# Within each group the member order must match the offsets 0..7 written by
# _fill_from_token(): sic, cluster, shape, prefix, suffix, pos, lemma,
# pos_type. Do not reorder or insert members without updating that function.
#
# N_CONTEXT_FIELDS comes last, so its value is the total number of slots (40).
cpdef enum:
    # Token at i-2.
    P2_sic
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_pos_type

    # Token at i-1.
    P1_sic
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_pos_type

    # Token at i (the one being tagged).
    W_sic
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_pos_type

    # Token at i+1.
    N1_sic
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_pos_type

    # Token at i+2.
    N2_sic
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_pos_type

    # Sentinel: number of slots in the context array.
    N_CONTEXT_FIELDS


# Maps a fine-grained tag string to a pair:
#   (coarse tag constant from ..typedefs, {morphology feature name: value}).
# The feature names used are 'misc', 'tenspect', 'number', 'case' and
# 'person'; their values come from the enums declared above.
#
# NOTE(review): the keys look like Penn Treebank tags -- confirm the tag set.
# NOTE(review): this table is not referenced by the tagger code visible in
# this file (EnPosTagger reads cfg['tag_map'] from config.json instead);
# presumably it is used elsewhere, e.g. to generate that config -- confirm.
# NOTE(review): 'PRP' and 'PRP$' map to NOUN while 'WP'/'WP$' map to PRON --
# confirm that this is intentional.
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    # Punctuation and symbol tags all map to PUNCT with no extra features.
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}


# Feature templates: each inner tuple names one or more context-array slots
# (members of the anonymous enum above) that are combined into one feature.
#
# NOTE(review): this tuple is not referenced by the tagger code visible in
# this file; presumably it is passed to the model/feature extractor
# elsewhere -- confirm.
POS_TEMPLATES = (
    # Word forms and lemma/tag pairs around the current token.
    (W_sic,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_sic,),
    (N2_sic,),

    # Affixes of the current token.
    (W_suffix,),
    (W_prefix,),

    # Previously assigned tags, alone and in combination.
    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_sic),
    (P1_suffix,),
    (N1_suffix,),

    # Shape and cluster features.
    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    # pos_type features.
    # NOTE(review): (N1_pos_type,) appears twice below. One of them may have
    # been meant as P1_pos_type or N2_pos_type. Changing it alters the
    # feature set, so confirm against any trained model before editing.
    (W_pos_type,),
    (N1_pos_type,),
    (N1_pos_type,),
    (P1_pos, W_pos_type, N1_pos_type),
)


cdef class EnPosTagger(Tagger):
    """English POS tagger that also sets morphology on every token it tags.

    Prediction is inherited from Tagger (self.predict); this subclass adds
    the English context extraction (fill_context) and calls the
    Morphologizer after each tag is assigned.
    """
    def __init__(self, StringStore strings, data_dir):
        """Load the tagger from data_dir.

        Args:
            strings: StringStore kept as self.strings and handed to the
                Morphologizer.
            data_dir: Directory containing a 'pos' sub-directory (passed to
                Tagger.__init__ and holding 'config.json') and a 'wordnet'
                sub-directory (passed to the Lemmatizer).

        The config must provide the keys 'tag_names' and 'tag_map'.
        """
        model_dir = path.join(data_dir, 'pos')
        Tagger.__init__(self, model_dir)
        self.strings = strings
        # Use a context manager so the config file handle is closed promptly
        # instead of being left for the garbage collector.
        with open(path.join(model_dir, 'config.json')) as config_file:
            cfg = json.load(config_file)
        self.tags = StringStore()
        # Look every tag name up in sorted order; the result is discarded.
        # Presumably the lookup interns the tag so IDs follow sorted order --
        # TODO confirm against StringStore.
        for tag in sorted(cfg['tag_names']):
            _ = self.tags[tag]
        lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)
        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
                                           cfg['tag_map'], lemmatizer)

    def __call__(self, Tokens tokens):
        """Tag tokens in place.

        For each position, builds the context array, stores the predicted
        tag in the token's pos field, then lets the morphologizer update
        the token.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        assert self.morphologizer is not None
        for i in range(tokens.length):
            fill_context(context, i, t)
            t[i].pos = self.predict(context)
            self.morphologizer.set_morph(i, t)

    def train(self, Tokens tokens, golds):
        """Tag tokens in place, passing the gold tag to self.predict.

        Args:
            tokens: Tokens to tag; their pos fields are overwritten.
            golds: Indexable of gold tag values, one per token.

        Returns:
            The number of positions where the predicted tag equals the gold
            tag.

        NOTE(review): presumably self.predict updates the model when given
        the gold tag -- confirm against Tagger.predict.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef TokenC* t = tokens.data
        n_correct = 0
        for i in range(tokens.length):
            fill_context(context, i, t)
            t[i].pos = self.predict(context, [golds[i]])
            self.morphologizer.set_morph(i, t)
            n_correct += t[i].pos == golds[i]
        return n_correct


cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    # Fill the five 8-slot groups of the context array from the tokens at
    # offsets -2..+2 around position i.
    # NOTE(review): no bounds check is done here, so tokens must be readable
    # two entries before index 0 and two past the last token -- presumably
    # the Tokens buffer is padded; confirm.
    cdef const TokenC* word = &tokens[i]
    _fill_from_token(&context[W_sic], word)
    _fill_from_token(&context[P1_sic], word - 1)
    _fill_from_token(&context[P2_sic], word - 2)
    _fill_from_token(&context[N1_sic], word + 1)
    _fill_from_token(&context[N2_sic], word + 2)
    return 0


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write one token's eight features into context[0..7]. The slot order
    # must match the per-token member order of the context-field enum
    # (sic, cluster, shape, prefix, suffix, pos, lemma, pos_type).

    # Slots read directly from the token.
    context[5] = t.pos
    context[6] = t.lemma

    # Slots read through t.lex.
    context[0] = t.lex.sic
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[7] = t.lex.pos_type
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`from os import path`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`import json`

* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`from thinc.typedefs cimport atom_t`

			`from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB`
			`from ..typedefs cimport X, PUNCT, EOL`
			`from ..structs cimport TokenC, Morphology`
			`from ..tokens cimport Tokens`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`from .lemmatizer import Lemmatizer`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00

			`cpdef enum en_person_t:`
			`NO_PERSON`
			`FIRST`
			`SECOND`
			`THIRD`
			`NON_THIRD`


			`cpdef enum en_number_t:`
			`NO_NUMBER`
			`SINGULAR`
			`PLURAL`
			`MASS`


			`cpdef enum en_gender_t:`
			`NO_GENDER`
			`MASCULINE`
			`FEMININE`
			`NEUTER`


			`cpdef enum en_case_t:`
			`NO_CASE`
			`NOMINATIVE`
			`GENITIVE`
			`ACCUSATIVE`
			`REFLEXIVE`
			`DEMONYM`


			`cpdef enum en_tenspect_t:`
			`NO_TENSE`
			`BASE_VERB`
			`PRESENT`
			`PAST`
			`PASSIVE`
			`ING`
			`MODAL`


			`cpdef enum misc_t:`
			`NO_MISC`
			`COMPARATIVE`
			`SUPERLATIVE`
			`RELATIVE`
			`NAME`


			`cpdef enum:`
			`P2_sic`
			`P2_cluster`
			`P2_shape`
			`P2_prefix`
			`P2_suffix`
			`P2_pos`
			`P2_lemma`
			`P2_pos_type`

			`P1_sic`
			`P1_cluster`
			`P1_shape`
			`P1_prefix`
			`P1_suffix`
			`P1_pos`
			`P1_lemma`
			`P1_pos_type`

			`W_sic`
			`W_cluster`
			`W_shape`
			`W_prefix`
			`W_suffix`
			`W_pos`
			`W_lemma`
			`W_pos_type`

			`N1_sic`
			`N1_cluster`
			`N1_shape`
			`N1_prefix`
			`N1_suffix`
			`N1_pos`
			`N1_lemma`
			`N1_pos_type`

			`N2_sic`
			`N2_cluster`
			`N2_shape`
			`N2_prefix`
			`N2_suffix`
			`N2_pos`
			`N2_lemma`
			`N2_pos_type`

			`N_CONTEXT_FIELDS`


			`POS_TAGS = {`
			`'NULL': (NO_TAG, {}),`
			`'EOL': (EOL, {}),`
			`'CC': (CONJ, {}),`
			`'CD': (NUM, {}),`
			`'DT': (DET, {}),`
			`'EX': (DET, {}),`
			`'FW': (X, {}),`
			`'IN': (ADP, {}),`
			`'JJ': (ADJ, {}),`
			`'JJR': (ADJ, {'misc': COMPARATIVE}),`
			`'JJS': (ADJ, {'misc': SUPERLATIVE}),`
			`'LS': (X, {}),`
			`'MD': (VERB, {'tenspect': MODAL}),`
			`'NN': (NOUN, {}),`
			`'NNS': (NOUN, {'number': PLURAL}),`
			`'NNP': (NOUN, {'misc': NAME}),`
			`'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),`
			`'PDT': (DET, {}),`
			`'POS': (PRT, {'case': GENITIVE}),`
			`'PRP': (NOUN, {}),`
			`'PRP$': (NOUN, {'case': GENITIVE}),`
			`'RB': (ADV, {}),`
			`'RBR': (ADV, {'misc': COMPARATIVE}),`
			`'RBS': (ADV, {'misc': SUPERLATIVE}),`
			`'RP': (PRT, {}),`
			`'SYM': (X, {}),`
			`'TO': (PRT, {}),`
			`'UH': (X, {}),`
			`'VB': (VERB, {}),`
			`'VBD': (VERB, {'tenspect': PAST}),`
			`'VBG': (VERB, {'tenspect': ING}),`
			`'VBN': (VERB, {'tenspect': PASSIVE}),`
			`'VBP': (VERB, {'tenspect': PRESENT}),`
			`'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),`
			`'WDT': (DET, {'misc': RELATIVE}),`
			`'WP': (PRON, {'misc': RELATIVE}),`
			`'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),`
			`'WRB': (ADV, {'misc': RELATIVE}),`
			`'!': (PUNCT, {}),`
			`'#': (PUNCT, {}),`
			`'$': (PUNCT, {}),`
			`"''": (PUNCT, {}),`
			`"(": (PUNCT, {}),`
			`")": (PUNCT, {}),`
			`"-LRB-": (PUNCT, {}),`
			`"-RRB-": (PUNCT, {}),`
			`".": (PUNCT, {}),`
			`",": (PUNCT, {}),`
			"``": (PUNCT, {}),
			`":": (PUNCT, {}),`
			`"?": (PUNCT, {}),`
			`}`


			`POS_TEMPLATES = (`
			`(W_sic,),`
			`(P1_lemma, P1_pos),`
			`(P2_lemma, P2_pos),`
			`(N1_sic,),`
			`(N2_sic,),`

			`(W_suffix,),`
			`(W_prefix,),`

			`(P1_pos,),`
			`(P2_pos,),`
			`(P1_pos, P2_pos),`
			`(P1_pos, W_sic),`
			`(P1_suffix,),`
			`(N1_suffix,),`

			`(W_shape,),`
			`(W_cluster,),`
			`(N1_cluster,),`
			`(N2_cluster,),`
			`(P1_cluster,),`
			`(P2_cluster,),`

			`(W_pos_type,),`
			`(N1_pos_type,),`
			`(N1_pos_type,),`
			`(P1_pos, W_pos_type, N1_pos_type),`
			`)`


			`cdef class EnPosTagger(Tagger):`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`def __init__(self, StringStore strings, data_dir):`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00			`model_dir = path.join(data_dir, 'pos')`
			`Tagger.__init__(self, path.join(model_dir))`
* Tests passing except for morphology/lemmatization stuff 2014-12-23 03:40:32 +03:00			`self.strings = strings`
			`cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))`
			`self.tags = StringStore()`
			`for tag in sorted(cfg['tag_names']):`
			`_ = self.tags[tag]`
			`self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],`
			`cfg['tag_map'],`
			`Lemmatizer(path.join(data_dir, 'wordnet'),`
			`NOUN, VERB, ADJ))`
* Add English-subclass POS tagger 2014-12-21 12:59:07 +03:00
			`def __call__(self, Tokens tokens):`
			`cdef int i`
			`cdef atom_t[N_CONTEXT_FIELDS] context`
			`cdef TokenC* t = tokens.data`
			`assert self.morphologizer is not None`
			`for i in range(tokens.length):`
			`fill_context(context, i, t)`
			`t[i].pos = self.predict(context)`
			`self.morphologizer.set_morph(i, t)`

			`def train(self, Tokens tokens, golds):`
			`cdef int i`
			`cdef atom_t[N_CONTEXT_FIELDS] context`
			`c = 0`
			`cdef TokenC* t = tokens.data`
			`for i in range(tokens.length):`
			`fill_context(context, i, t)`
			`t[i].pos = self.predict(context, [golds[i]])`
			`self.morphologizer.set_morph(i, t)`
			`c += t[i].pos == golds[i]`
			`return c`


			`cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:`
			`_fill_from_token(&context[P2_sic], &tokens[i-2])`
			`_fill_from_token(&context[P1_sic], &tokens[i-1])`
			`_fill_from_token(&context[W_sic], &tokens[i])`
			`_fill_from_token(&context[N1_sic], &tokens[i+1])`
			`_fill_from_token(&context[N2_sic], &tokens[i+2])`


			`cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:`
			`context[0] = t.lex.sic`
			`context[1] = t.lex.cluster`
			`context[2] = t.lex.shape`
			`context[3] = t.lex.prefix`
			`context[4] = t.lex.suffix`
			`context[5] = t.pos`
			`context[6] = t.lemma`
			`context[7] = t.lex.pos_type`