# spaCy/spacy/en/pos.pyx
from os import path
import json
import os
import shutil

from libc.string cimport memset

from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t

from collections import defaultdict

from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t

from .lemmatizer import Lemmatizer


cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD


cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS


cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM


cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
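

# Offsets into the atom_t context array filled by predict() and update().
# The feature window spans five tokens (P2 and P1 before, the word W itself,
# N1 and N2 after), with eight slots per token: orth, cluster, shape, prefix,
# suffix, pos, lemma and a small flags code.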
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    N_CONTEXT_FIELDS
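

# Mapping from Penn Treebank-style tag strings to a coarse universal
# part-of-speech constant plus a dict of finer-grained morphological values
# (person, number, case, tenspect, misc) drawn from the enums above.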
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (PRON, {}),
    'PRP$': (PRON, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
    "ADD": (X, {}),
    "NFP": (PUNCT, {}),
    "GW": (X, {}),
    "AFX": (X, {}),
    "HYPH": (PUNCT, {}),
    "XX": (X, {}),
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "SP": (SPACE, {})
}
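

# Feature templates for the tagging model: each tuple lists the context
# slots whose values are combined into a single feature when scoring a tag.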
POS_TEMPLATES = (
    (W_orth,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_orth,),
    (N2_orth,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_orth),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    (W_flags,),
    (N1_flags,),
    (N2_flags,),
    (P1_flags,),
    (P2_flags,),
)
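

# English-specific subclass of the shared Tagger base class: it supplies the
# WordNet lemmatizer and the windowed feature extraction used by predict()
# and update().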
cdef class EnPosTagger(Tagger):
    """A part-of-speech tagger for English."""
    def make_lemmatizer(self, data_dir):
        return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

    cdef int predict(self, int i, const TokenC* tokens) except -1:
        # Build the feature window around token i and return the index of the
        # highest-scoring tag.
        cdef atom_t[N_CONTEXT_FIELDS] context
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        return arg_max(scores, self.model.n_classes)

    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
        # Same feature extraction as predict(), but also update the model.
        # The loss is 1 for a wrong guess, and 0 if the guess is correct or
        # no gold tag is available (gold == -1).
        cdef atom_t[N_CONTEXT_FIELDS] context
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        guess = arg_max(scores, self.model.n_classes)
        loss = guess != gold if gold != -1 else 0
        self.model.update(context, guess, gold, loss)
        return guess
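

# Write the eight feature values for one token into the given slice of the
# context array. The flags slot collapses a few lexeme flags into one code:
# 1 alphabetic, 2 punctuation, 3 URL-like, 4 number-like, 0 otherwise.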
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0
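

# Usage sketch (an assumption about the surrounding 0.x API, not part of this
# module): the tagger is normally constructed and applied by the English
# pipeline rather than used directly, along the lines of:
#
#     from spacy.en import English
#     nlp = English()
#     doc = nlp(u'This is a sentence.', tag=True, parse=False)
#     print([(w.orth_, w.tag_) for w in doc])
#
# When the pipeline runs the tagger over a Doc, predict() is called for each
# token and the winning class is written to the token's tag field.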