# spaCy/spacy/en/pos.pyx
from os import path
import json
import os
import shutil

from libc.string cimport memset

from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t

from collections import defaultdict

from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
from ..parts_of_speech cimport PRT, VERB, X, PUNCT, EOL, SPACE
from ..structs cimport TokenC, Morphology, LexemeC
from ..tokens.doc cimport Doc
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .attrs cimport TAG, IS_ALPHA, IS_PUNCT, LIKE_NUM, LIKE_URL
from ..typedefs cimport attr_t

from .lemmatizer import Lemmatizer


cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD


cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS


cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER


cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM


cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL


cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
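

# Offsets into the atom_t context array filled by predict() and update().
# The feature window spans five tokens (P2 and P1 before, the word W itself,
# N1 and N2 after), with eight slots per token: orth, cluster, shape, prefix,
# suffix, pos, lemma and a small flags code.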
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    N_CONTEXT_FIELDS
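

# Mapping from Penn Treebank-style tag strings to a coarse universal
# part-of-speech constant plus a dict of finer-grained morphological values
# (person, number, case, tenspect, misc) drawn from the enums above.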
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (PRON, {}),
    'PRP$': (PRON, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
    "ADD": (X, {}),
    "NFP": (PUNCT, {}),
    "GW": (X, {}),
    "AFX": (X, {}),
    "HYPH": (PUNCT, {}),
    "XX": (X, {}),
    "BES": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "HVS": (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    "SP": (SPACE, {})
}
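

# Feature templates for the tagging model: each tuple lists the context
# slots whose values are combined into a single feature when scoring a tag.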
POS_TEMPLATES = (
    (W_orth,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_orth,),
    (N2_orth,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_orth),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),

    (W_flags,),
    (N1_flags,),
    (N2_flags,),
    (P1_flags,),
    (P2_flags,),
)
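

# English-specific subclass of the shared Tagger base class: it supplies the
# WordNet lemmatizer and the windowed feature extraction used by predict()
# and update().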
cdef class EnPosTagger(Tagger):
    """A part-of-speech tagger for English."""
    def make_lemmatizer(self, data_dir):
        return Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

    cdef int predict(self, int i, const TokenC* tokens) except -1:
        # Build the feature window around token i and return the index of the
        # highest-scoring tag.
        cdef atom_t[N_CONTEXT_FIELDS] context
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        return arg_max(scores, self.model.n_classes)

    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
        # Same feature extraction as predict(), but also update the model.
        # The loss is 1 for a wrong guess, and 0 if the guess is correct or
        # no gold tag is available (gold == -1).
        cdef atom_t[N_CONTEXT_FIELDS] context
        _fill_from_token(&context[P2_orth], &tokens[i-2])
        _fill_from_token(&context[P1_orth], &tokens[i-1])
        _fill_from_token(&context[W_orth], &tokens[i])
        _fill_from_token(&context[N1_orth], &tokens[i+1])
        _fill_from_token(&context[N2_orth], &tokens[i+2])
        scores = self.model.score(context)
        guess = arg_max(scores, self.model.n_classes)
        loss = guess != gold if gold != -1 else 0
        self.model.update(context, guess, gold, loss)
        return guess
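

# Write the eight feature values for one token into the given slice of the
# context array. The flags slot collapses a few lexeme flags into one code:
# 1 alphabetic, 2 punctuation, 3 URL-like, 4 number-like, 0 otherwise.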
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0
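

# Usage sketch (an assumption about the surrounding 0.x API, not part of this
# module): the tagger is normally constructed and applied by the English
# pipeline rather than used directly, along the lines of:
#
#     from spacy.en import English
#     nlp = English()
#     doc = nlp(u'This is a sentence.', tag=True, parse=False)
#     print([(w.orth_, w.tag_) for w in doc])
#
# When the pipeline runs the tagger over a Doc, predict() is called for each
# token and the winning class is written to the token's tag field.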