spaCy/spacy/en/pos.pyx

338 lines
8.9 KiB
Cython
Raw Normal View History

2014-12-21 12:59:07 +03:00
from os import path
import json
import os
import shutil
from libc.string cimport memset
from cymem.cymem cimport Address
from thinc.typedefs cimport atom_t, weight_t
2014-12-21 12:59:07 +03:00
from ..parts_of_speech cimport univ_pos_t
from ..parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
from ..parts_of_speech cimport X, PUNCT, EOL
2014-12-23 05:18:59 +03:00
from ..typedefs cimport id_t
from ..structs cimport TokenC, Morphology, LexemeC
2014-12-21 12:59:07 +03:00
from ..tokens cimport Tokens
2014-12-23 05:18:59 +03:00
from ..morphology cimport set_morph_from_dict
from .._ml cimport arg_max
from .lemmatizer import Lemmatizer
2014-12-21 12:59:07 +03:00
# Values for the 'person' morphological property. Within this file only
# THIRD is used, by the 'VBZ' entry of POS_TAGS.
# Members are numbered from 0 in declaration order, so NO_PERSON == 0.
cpdef enum en_person_t:
    NO_PERSON
    FIRST
    SECOND
    THIRD
    NON_THIRD
# Values for the 'number' morphological property. Within this file only
# PLURAL is used, by the 'NNS' and 'NNPS' entries of POS_TAGS.
# Members are numbered from 0 in declaration order, so NO_NUMBER == 0.
cpdef enum en_number_t:
    NO_NUMBER
    SINGULAR
    PLURAL
    MASS
# Gender values. No member is referenced elsewhere in this file.
# NOTE(review): presumably consumed via a 'gender' property by the
# morphology code -- confirm against set_morph_from_dict.
# Members are numbered from 0 in declaration order, so NO_GENDER == 0.
cpdef enum en_gender_t:
    NO_GENDER
    MASCULINE
    FEMININE
    NEUTER
# Values for the 'case' morphological property. Within this file only
# GENITIVE is used, by the 'POS', 'PRP$' and 'WP$' entries of POS_TAGS.
# Members are numbered from 0 in declaration order, so NO_CASE == 0.
cpdef enum en_case_t:
    NO_CASE
    NOMINATIVE
    GENITIVE
    ACCUSATIVE
    REFLEXIVE
    DEMONYM
# Values for the 'tenspect' morphological property. POS_TAGS uses MODAL
# ('MD'), PAST ('VBD'), ING ('VBG'), PASSIVE ('VBN') and PRESENT
# ('VBP', 'VBZ'); BASE_VERB is not referenced elsewhere in this file.
# Members are numbered from 0 in declaration order, so NO_TENSE == 0.
cpdef enum en_tenspect_t:
    NO_TENSE
    BASE_VERB
    PRESENT
    PAST
    PASSIVE
    ING
    MODAL
# Values for the 'misc' morphological property. POS_TAGS uses COMPARATIVE
# ('JJR', 'RBR'), SUPERLATIVE ('JJS', 'RBS'), NAME ('NNP', 'NNPS') and
# RELATIVE ('WDT', 'WP', 'WP$', 'WRB').
# Members are numbered from 0 in declaration order, so NO_MISC == 0.
cpdef enum misc_t:
    NO_MISC
    COMPARATIVE
    SUPERLATIVE
    RELATIVE
    NAME
# Indices into the tagger's context array (declared in this file as
# atom_t[N_CONTEXT_FIELDS]).
#
# The prefix names the token position relative to the word being tagged, as
# wired up by fill_context: P2 = i-2, P1 = i-1, W = i, N1 = i+1, N2 = i+2.
#
# Each position owns seven consecutive slots. _fill_from_token writes them by
# offset (0..6) from the position's *_orth slot, so the member order inside
# each group -- orth, cluster, shape, prefix, suffix, pos, lemma -- must not
# be changed without updating that function.
#
# N_CONTEXT_FIELDS is declared last and therefore equals the number of slots.
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma

    N_CONTEXT_FIELDS
# Maps a fine-grained tag name to a pair (coarse part-of-speech, properties).
# The properties dict uses the keys 'misc', 'tenspect', 'number', 'case' and
# 'person', with values taken from the enums declared in this file.
#
# This is the same (pos, props) shape that EnPosTagger.__init__ unpacks from
# its tag_map. The table is not referenced elsewhere in this file;
# presumably it is passed as `tag_map` to setup_model_dir -- confirm against
# callers.
#
# NOTE(review): 'PRP' and 'PRP$' map to NOUN although PRON is available and
# is used for 'WP' / 'WP$'. EnPosTagger.lemmatize only lemmatizes NOUN, VERB
# and ADJ, so this may be deliberate -- confirm before changing.
POS_TAGS = {
    'NULL': (NO_TAG, {}),
    'EOL': (EOL, {}),
    'CC': (CONJ, {}),
    'CD': (NUM, {}),
    'DT': (DET, {}),
    'EX': (DET, {}),
    'FW': (X, {}),
    'IN': (ADP, {}),
    'JJ': (ADJ, {}),
    'JJR': (ADJ, {'misc': COMPARATIVE}),
    'JJS': (ADJ, {'misc': SUPERLATIVE}),
    'LS': (X, {}),
    'MD': (VERB, {'tenspect': MODAL}),
    'NN': (NOUN, {}),
    'NNS': (NOUN, {'number': PLURAL}),
    'NNP': (NOUN, {'misc': NAME}),
    'NNPS': (NOUN, {'misc': NAME, 'number': PLURAL}),
    'PDT': (DET, {}),
    'POS': (PRT, {'case': GENITIVE}),
    'PRP': (NOUN, {}),
    'PRP$': (NOUN, {'case': GENITIVE}),
    'RB': (ADV, {}),
    'RBR': (ADV, {'misc': COMPARATIVE}),
    'RBS': (ADV, {'misc': SUPERLATIVE}),
    'RP': (PRT, {}),
    'SYM': (X, {}),
    'TO': (PRT, {}),
    'UH': (X, {}),
    'VB': (VERB, {}),
    'VBD': (VERB, {'tenspect': PAST}),
    'VBG': (VERB, {'tenspect': ING}),
    'VBN': (VERB, {'tenspect': PASSIVE}),
    'VBP': (VERB, {'tenspect': PRESENT}),
    'VBZ': (VERB, {'tenspect': PRESENT, 'person': THIRD}),
    'WDT': (DET, {'misc': RELATIVE}),
    'WP': (PRON, {'misc': RELATIVE}),
    'WP$': (PRON, {'misc': RELATIVE, 'case': GENITIVE}),
    'WRB': (ADV, {'misc': RELATIVE}),
    '!': (PUNCT, {}),
    '#': (PUNCT, {}),
    '$': (PUNCT, {}),
    "''": (PUNCT, {}),
    "(": (PUNCT, {}),
    ")": (PUNCT, {}),
    "-LRB-": (PUNCT, {}),
    "-RRB-": (PUNCT, {}),
    ".": (PUNCT, {}),
    ",": (PUNCT, {}),
    "``": (PUNCT, {}),
    ":": (PUNCT, {}),
    "?": (PUNCT, {}),
}
# Feature templates for the tagger model. Each entry is a tuple of
# context-slot indices (members of the anonymous enum that ends with
# N_CONTEXT_FIELDS).
#
# The table is not referenced elsewhere in this file; presumably it is passed
# as `templates` to setup_model_dir -- confirm against callers. How the slots
# named in one tuple are combined into a feature is decided by the Model
# class, which is not defined in this file.
POS_TEMPLATES = (
    (W_orth,),
    (P1_lemma, P1_pos),
    (P2_lemma, P2_pos),
    (N1_orth,),
    (N2_orth,),

    (W_suffix,),
    (W_prefix,),

    (P1_pos,),
    (P2_pos,),
    (P1_pos, P2_pos),
    (P1_pos, W_orth),
    (P1_suffix,),
    (N1_suffix,),

    (W_shape,),
    (W_cluster,),
    (N1_cluster,),
    (N2_cluster,),
    (P1_cluster,),
    (P2_cluster,),
)
2014-12-23 05:18:59 +03:00
# Entry stored in EnPosTagger._morph_cache under the key (tag id, orth id).
# It holds the morphology and lemma id that EnPosTagger.set_morph copies onto
# a token with that tag and word form.
cdef struct _CachedMorph:
    Morphology morph
    int lemma
def setup_model_dir(tag_names, tag_map, templates, model_dir):
    """Create a fresh model directory that contains only ``config.json``.

    If ``model_dir`` already exists it is removed first, together with
    everything inside it.

    Args:
        tag_names: Tag names, written under the 'tag_names' key.
        tag_map: Tag mapping, written under the 'tag_map' key.
        templates: Feature templates, written under the 'templates' key.
        model_dir: Path of the directory to (re)create. Missing parent
            directories are created as well.

    All three values must be JSON-serialisable; ``json.dump`` raises
    ``TypeError`` otherwise.
    """
    if path.exists(model_dir):
        shutil.rmtree(model_dir)
    # makedirs (rather than mkdir) so that a model directory nested under
    # parents that do not exist yet can be created in one call.
    os.makedirs(model_dir)
    config = {
        'templates': templates,
        'tag_names': tag_names,
        'tag_map': tag_map
    }
    with open(path.join(model_dir, 'config.json'), 'w') as file_:
        json.dump(config, file_)
cdef class EnPosTagger:
    """A part-of-speech tagger for English.

    The tagger reads its configuration from ``<data_dir>/pos/config.json``,
    scores every token with a ``Model`` over the context slots filled by
    ``fill_context``, and writes the tag, coarse part-of-speech, morphology
    and lemma onto the token structs.
    """
    def __init__(self, StringStore strings, data_dir):
        """Load configuration, model, morphology exceptions and lemmatizer.

        Args:
            strings (StringStore): String table used to translate between
                strings and their integer ids.
            data_dir: Directory that contains ``pos/config.json``, optionally
                ``tokenizer/morphs.json``, and a ``wordnet`` directory that
                is handed to the Lemmatizer.
        """
        cdef int n_tags
        self.mem = Pool()
        model_dir = path.join(data_dir, 'pos')
        self.strings = strings
        # Use context managers so the JSON files are closed deterministically
        # instead of whenever the file objects happen to be collected.
        with open(path.join(model_dir, 'config.json')) as file_:
            cfg = json.load(file_)
        self.tag_names = sorted(cfg['tag_names'])
        self.n_tags = len(self.tag_names)
        self.tag_map = cfg['tag_map']
        # The model, the cache and the tag array get one slot more than there
        # are tag names; the extra PosTag entry is never filled in below.
        # NOTE(review): the reason for the extra slot is not visible here.
        n_tags = len(self.tag_names) + 1
        self.model = Model(n_tags, cfg['templates'], model_dir)
        self._morph_cache = PreshMapArray(n_tags)
        self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
        # tag_names is already sorted, so a tag's id is its index in it.
        for i, tag in enumerate(self.tag_names):
            pos, props = self.tag_map[tag]
            self.tags[i].id = i
            self.tags[i].pos = pos
            set_morph_from_dict(&self.tags[i].morph, props)
        morphs_loc = path.join(data_dir, 'tokenizer', 'morphs.json')
        if path.exists(morphs_loc):
            with open(morphs_loc) as file_:
                self.load_morph_exceptions(json.load(file_))
        self.lemmatizer = Lemmatizer(path.join(data_dir, 'wordnet'), NOUN, VERB, ADJ)

    def __call__(self, Tokens tokens):
        """Apply the tagger, setting the POS tags onto the Tokens object.

        Tokens whose ``pos`` is already non-zero are left untouched. After
        the pass, ``tokens._tag_strings`` is set to the tagger's tag names
        and ``tokens.is_tagged`` to True.

        Args:
            tokens (Tokens): The tokens to be tagged.
        """
        cdef int i
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef const weight_t* scores
        for i in range(tokens.length):
            if tokens.data[i].pos == 0:
                fill_context(context, i, tokens.data)
                scores = self.model.score(context)
                tokens.data[i].tag = arg_max(scores, self.model.n_classes)
                self.set_morph(i, tokens.data)
        tokens._tag_strings = self.tag_names
        tokens.is_tagged = True

    def train(self, Tokens tokens, object gold_tag_strs):
        """Tag the tokens and update the model against the gold tags.

        Every token is tagged with the model's current best guess, and the
        model is updated with that guess and the gold tag.

        Args:
            tokens (Tokens): The tokens to tag and learn from.
            gold_tag_strs: One tag name per token, or None for a token that
                has no gold tag.

        Returns:
            The number of tokens with zero loss. This counts correct guesses
            and also tokens whose gold tag is None.

        Raises:
            ValueError: If a gold tag name is not one of the tagger's tags.
        """
        cdef int i
        cdef int loss
        cdef atom_t[N_CONTEXT_FIELDS] context
        cdef const weight_t* scores
        # -1 marks "no gold tag"; such tokens always get a loss of 0.
        golds = [self.tag_names.index(g) if g is not None else -1
                 for g in gold_tag_strs]
        correct = 0
        for i in range(tokens.length):
            fill_context(context, i, tokens.data)
            scores = self.model.score(context)
            guess = arg_max(scores, self.model.n_classes)
            loss = guess != golds[i] if golds[i] != -1 else 0
            self.model.update(context, guess, golds[i], loss)
            tokens.data[i].tag = guess
            self.set_morph(i, tokens.data)
            correct += loss == 0
        return correct

    cdef int set_morph(self, const int i, TokenC* tokens) except -1:
        # Set pos, lemma and morph on tokens[i] from its already assigned tag.
        # The (tag id, orth) pair is looked up in the morph cache; on a miss
        # the entry is built from the tag's default morphology plus a freshly
        # computed lemma, and stored for later lookups.
        cdef _CachedMorph* cached
        cdef const PosTag* tag = &self.tags[tokens[i].tag]
        tokens[i].pos = tag.pos
        cached = <_CachedMorph*>self._morph_cache.get(tag.id, tokens[i].lex.orth)
        if cached is NULL:
            cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
            cached.lemma = self.lemmatize(tag.pos, tokens[i].lex)
            cached.morph = tag.morph
            self._morph_cache.set(tag.id, tokens[i].lex.orth, <void*>cached)
        tokens[i].lemma = cached.lemma
        tokens[i].morph = cached.morph

    cdef int lemmatize(self, const univ_pos_t pos, const LexemeC* lex) except -1:
        # Return the string id of the lemma for `lex` under coarse tag `pos`.
        # The word's own orth id is returned when no lemmatizer is set, when
        # `pos` is not NOUN, VERB or ADJ, or when the lemmatizer yields no
        # candidates. Otherwise the alphabetically first candidate is used.
        cdef unicode py_string
        cdef set lemma_strings
        cdef unicode lemma_string
        cdef bytes lemma_bytes
        if self.lemmatizer is None:
            return lex.orth
        # Cheap POS check first, so other tags skip the string lookup.
        if pos != NOUN and pos != VERB and pos != ADJ:
            return lex.orth
        py_string = self.strings[lex.orth]
        lemma_strings = self.lemmatizer(py_string, pos)
        if not lemma_strings:
            return lex.orth
        lemma_string = min(lemma_strings)
        # The length handed to intern() must be the number of UTF-8 bytes,
        # not the number of code points, or non-ASCII lemmas get truncated.
        lemma_bytes = lemma_string.encode('utf8')
        return self.strings.intern(lemma_bytes, len(lemma_bytes)).i

    def load_morph_exceptions(self, dict exc):
        """Pre-populate the morph cache with hand-specified entries.

        Args:
            exc (dict): Maps a tag name to a dict that maps a word form to
                its properties. The optional 'L' property gives the lemma and
                defaults to the word form itself; the whole properties dict
                is also passed to ``set_morph_from_dict``.

        Raises:
            ValueError: If a tag name is not one of the tagger's tags.
        """
        cdef unicode pos_str
        cdef unicode form_str
        cdef unicode lemma_str
        cdef dict entries
        cdef dict props
        cdef id_t orth
        cdef int tag_id
        cdef _CachedMorph* cached
        for pos_str, entries in exc.items():
            # The cache is keyed by the tag's index in tag_names, the same id
            # that set_morph reads from PosTag.id.
            tag_id = self.tag_names.index(pos_str)
            for form_str, props in entries.items():
                lemma_str = props.get('L', form_str)
                orth = self.strings[form_str]
                cached = <_CachedMorph*>self.mem.alloc(1, sizeof(_CachedMorph))
                cached.lemma = self.strings[lemma_str]
                set_morph_from_dict(&cached.morph, props)
                self._morph_cache.set(tag_id, orth, <void*>cached)
2014-12-23 05:18:59 +03:00
2014-12-21 12:59:07 +03:00
cdef int fill_context(atom_t* context, const int i, const TokenC* tokens) except -1:
    # Fill the context array for the token at index i: one group of slots for
    # each of the two preceding tokens, the token itself and the two
    # following tokens.
    # NOTE(review): reads tokens[i-2] .. tokens[i+2] without bounds checks, so
    # this assumes the array has at least two valid entries on either side of
    # every real token -- confirm against the Tokens allocation.
    cdef const TokenC* word = &tokens[i]
    _fill_from_token(&context[P2_orth], word - 2)
    _fill_from_token(&context[P1_orth], word - 1)
    _fill_from_token(&context[W_orth], word)
    _fill_from_token(&context[N1_orth], word + 1)
    _fill_from_token(&context[N2_orth], word + 2)
2014-12-21 12:59:07 +03:00
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the seven features of one token into context[0..6]. The caller
    # passes a pointer to the token's *_orth slot, so the offsets here follow
    # the slot order orth, cluster, shape, prefix, suffix, pos, lemma.
    cdef const LexemeC* lex = t.lex
    # Lexical attributes, read from the token's lexeme.
    context[0] = lex.orth
    context[1] = lex.cluster
    context[2] = lex.shape
    context[3] = lex.prefix
    context[4] = lex.suffix
    # Per-token attributes. The *_pos slot receives the tag id (t.tag), not
    # the coarse part-of-speech stored in t.pos.
    context[5] = t.tag
    context[6] = t.lemma