# cython: infer_types=True
# cython: profile=True
import json
import pathlib
from collections import defaultdict

from libc.string cimport memset, memcpy
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int32_t, int64_t
cimport numpy as np
import numpy as np
np.import_array()

from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t, weight_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport Vec, VecVec
from thinc.linear.linear import LinearModel
from thinc.structs cimport FeatureC
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse

from .attrs cimport *


# Slot indices into the per-token context array. Each of the five window
# positions (P2, P1, W, N1, N2) owns eight consecutive slots, written by
# `_fill_from_token` in this exact order. N_CONTEXT_FIELDS is the total
# number of slots.
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    N_CONTEXT_FIELDS


cdef class TaggerModel:
    """Pair a feature extracter with a linear model over tag classes."""

    def __init__(self, int nr_tag, templates):
        """Create the model.

        Arguments:
            nr_tag (int): Passed to `LinearModel` as its size argument.
            templates: Feature templates, passed to `ConjunctionExtracter`.
        """
        self.extracter = ConjunctionExtracter(templates)
        self.model = LinearModel(nr_tag)

    def begin_update(self, atom_t[:, ::1] contexts, drop=0.):
        """Extract features for each context row and start a model update.

        Arguments:
            contexts: 2d C-contiguous array of atoms, one row per example.
            drop (float): Forwarded to the wrapped model's `begin_update`.
        Returns:
            Whatever the wrapped model's `begin_update` returns for the
            `(keys, values, lengths)` instance built here.
        """
        # The scratch vectors live on the stack so that they are released on
        # every exit path, including when an exception propagates.
        cdef vector[uint64_t] keys
        cdef vector[float] values
        cdef vector[int64_t] lengths
        cdef vector[FeatureC] features
        cdef np.ndarray[uint64_t, ndim=1] py_keys
        cdef np.ndarray[float, ndim=1] py_values
        cdef np.ndarray[long, ndim=1] py_lengths
        cdef int i, j

        # One output slot per template for the extracter to write into.
        features.resize(self.extracter.nr_templ)
        for i in range(contexts.shape[0]):
            nr_feat = self.extracter.set_features(features.data(), &contexts[i, 0])
            for j in range(nr_feat):
                keys.push_back(features.at(j).key)
                values.push_back(features.at(j).value)
            # Record how many features this row contributed, so the flat
            # keys/values arrays can be split back into rows.
            lengths.push_back(nr_feat)

        py_keys = vector_uint64_2numpy(&keys)
        py_values = vector_float_2numpy(&values)
        py_lengths = vector_long_2numpy(&lengths)
        instance = (py_keys, py_values, py_lengths)
        return self.model.begin_update(instance, drop=drop)

    def end_training(self, *args, **kwargs):
        """No-op; arguments are accepted and ignored."""
        pass

    def dump(self, *args, **kwargs):
        """No-op; arguments are accepted and ignored."""
        pass


cdef np.ndarray[uint64_t, ndim=1] vector_uint64_2numpy(vector[uint64_t]* vec):
    """Copy the contents of a uint64 vector into a new 1d numpy array."""
    cdef np.ndarray[uint64_t, ndim=1, mode="c"] arr = np.zeros(vec.size(), dtype='uint64')
    memcpy(arr.data, vec.data(), sizeof(uint64_t) * vec.size())
    return arr


cdef np.ndarray[long, ndim=1] vector_long_2numpy(vector[int64_t]* vec):
    """Copy the contents of an int64 vector into a new 1d numpy array."""
    # NOTE(review): the buffer is typed `long` but holds int64 data; on
    # platforms where `long` is 32 bits these may not agree -- confirm.
    cdef np.ndarray[long, ndim=1, mode="c"] arr = np.zeros(vec.size(), dtype='int64')
    memcpy(arr.data, vec.data(), sizeof(int64_t) * vec.size())
    return arr


cdef np.ndarray[float, ndim=1] vector_float_2numpy(vector[float]* vec):
    """Copy the contents of a float vector into a new 1d float32 numpy array."""
    cdef np.ndarray[float, ndim=1, mode="c"] arr = np.zeros(vec.size(), dtype='float32')
    memcpy(arr.data, vec.data(), sizeof(float) * vec.size())
    return arr


cdef void fill_context(atom_t* context, const TokenC* tokens, int i) nogil:
    """Fill the context array for token i from a five-token window.

    Reads tokens[i-2] through tokens[i+2] without bounds checks.
    NOTE(review): assumes the token array is padded on both sides so these
    reads stay in bounds -- confirm against the Doc implementation.
    """
    _fill_from_token(&context[P2_orth], &tokens[i-2])
    _fill_from_token(&context[P1_orth], &tokens[i-1])
    _fill_from_token(&context[W_orth], &tokens[i])
    _fill_from_token(&context[N1_orth], &tokens[i+1])
    _fill_from_token(&context[N2_orth], &tokens[i+2])


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    """Write the eight atoms for one token, starting at `context[0]`."""
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    # Collapse the lexical flags into one small code. The first matching
    # flag wins: 1 alpha, 2 punct, 3 URL-like, 4 number-like, 0 otherwise.
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0


cdef class Tagger:
    """Annotate part-of-speech tags on Doc objects."""

    @classmethod
    def load(cls, path, vocab, require=False):
        """Load the statistical model from the supplied path.

        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                The vocabulary. Must be shared by the documents to be processed.
            require (bool):
                Whether to raise an error if the files are not found.
        Returns (Tagger):
            The newly created object.
        Raises:
            IOError: If `require` is true and `templates.json` or `model` is
                missing under `path`.
        """
        # TODO: Change this to expect config.json when we don't have to
        # support old data.
        path = path if not isinstance(path, basestring) else pathlib.Path(path)
        if (path / 'templates.json').exists():
            with (path / 'templates.json').open('r', encoding='utf8') as file_:
                templates = json.load(file_)
        elif require:
            raise IOError(
                "Required file %s/templates.json not found when loading Tagger" % str(path))
        else:
            templates = cls.feature_templates
        self = cls(vocab, model=None, feature_templates=templates)

        if (path / 'model').exists():
            self.model.load(str(path / 'model'))
        elif require:
            raise IOError(
                "Required file %s/model not found when loading Tagger" % str(path))
        return self

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
        """Create a Tagger.

        Arguments:
            vocab (Vocab):
                The vocabulary object. Must be shared with documents to be processed.
            model (TaggerModel):
                The statistical model. If None, a new TaggerModel is created.
            **cfg:
                Extra settings, stored on `self.cfg`. When a model is created
                here, its templates are taken from `features`, then from
                `feature_templates`, then from the class default.
        Returns (Tagger):
            The newly constructed object.
        """
        if model is None:
            # `load` passes its templates as `feature_templates`; honour that
            # key as well as `features` so loaded templates are not dropped.
            templates = cfg.get(
                'features', cfg.get('feature_templates', self.feature_templates))
            model = TaggerModel(vocab.morphology.n_tags, templates)
        self.vocab = vocab
        self.model = model
        # TODO: Move this to tag map
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1
        self.cfg = cfg
        self.optimizer = Adam(NumpyOps(), 0.001)

    @property
    def tag_names(self):
        """The tag names held by the vocabulary's morphology."""
        return self.vocab.morphology.tag_names

    def __reduce__(self):
        """Support pickling by rebuilding from the vocab and the model."""
        return (self.__class__, (self.vocab, self.model), None, None)

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """Assign the given tags to the document's tokens.

        Arguments:
            tokens (Doc): The document to tag, modified in place.
            tag_strs: An indexable of tags, one per token.
        """
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Tokens whose `pos` is already non-zero are left untouched.

        Arguments:
            tokens (Doc): The tokens to be tagged.
        Returns:
            0 if the document is empty, otherwise None.
        """
        if tokens.length == 0:
            return 0

        cdef atom_t[1][N_CONTEXT_FIELDS] c_context
        memset(c_context, 0, sizeof(c_context))
        cdef atom_t[:, ::1] context = c_context
        cdef float[:, ::1] scores

        cdef int nr_class = self.vocab.morphology.n_tags
        for i in range(tokens.length):
            if tokens.c[i].pos == 0:
                fill_context(&context[0, 0], tokens.c, i)
                scores, _ = self.model.begin_update(context)

                guess = Vec.arg_max(&scores[0, 0], nr_class)
                self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
                memset(&scores[0, 0], 0, sizeof(float) * scores.size)

        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """Tag a stream of documents.

        Arguments:
            stream: The sequence of documents to tag.
            batch_size (int):
                Accepted but not used by this implementation.
            n_threads (int):
                Accepted but not used by this implementation.
        Yields:
            Doc Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
        """Update the statistical model, with tags supplied for the given document.

        Arguments:
            tokens (Doc):
                The document to update on.
            gold (GoldParse):
                Manager for the gold-standard tags.
            itn (int):
                Accepted but not used by this implementation.
        Returns (int):
            Number of tags correct. Tokens without a gold tag count as correct.
        Raises:
            ValueError: If a gold tag is not one of `self.tag_names`.
        """
        cdef int nr_class = self.vocab.morphology.n_tags
        gold_tag_strs = gold.tags
        assert len(tokens) == len(gold_tag_strs)
        # Look the property up once rather than once per gold tag.
        tag_names = self.tag_names
        for tag in gold_tag_strs:
            if tag is not None and tag not in tag_names:
                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
                       "gold tags, to maintain coarse-grained mapping.")
                raise ValueError(msg % tag)
        # -1 marks tokens that have no gold tag.
        golds = [tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]

        cdef int correct = 0
        cdef atom_t[:, ::1] context = np.zeros((1, N_CONTEXT_FIELDS), dtype='uint64')
        cdef float[:, ::1] scores
        for i in range(tokens.length):
            fill_context(&context[0, 0], tokens.c, i)
            scores, finish_update = self.model.begin_update(context)
            guess = Vec.arg_max(&scores[0, 0], nr_class)
            self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)

            # The scores array is reused as the value handed to
            # finish_update, after subtracting 1 at the gold class.
            if golds[i] != -1:
                scores[0, golds[i]] -= 1
            finish_update(scores, lambda *args, **kwargs: None)

            if (golds[i] in (guess, -1)):
                correct += 1
            self.freqs[TAG][tokens.c[i].tag] += 1
            self.optimizer(self.model.model.weights, self.model.model.d_weights,
                           key=self.model.model.id)

        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct

    # Default feature templates: each tuple lists the context slots that are
    # combined into one feature.
    feature_templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),
        (W_cluster,),
        (N1_cluster,),
        (N2_cluster,),
        (P1_cluster,),
        (P2_cluster,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )