# coding: utf8
from __future__ import unicode_literals

import ujson
from collections import defaultdict

from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec

from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *

from . import util


# Indices into the atom (context) array. Each token in the five-token window
# (P2, P1, W, N1, N2) owns eight consecutive slots, in the order written by
# _fill_from_token: orth, cluster, shape, prefix, suffix, pos, lemma, flags.
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    N_CONTEXT_FIELDS


cdef class TaggerModel(AveragedPerceptron):
    """
    Averaged perceptron specialised for tagging: knows how to extract the
    context-window features for a token and how to update on an Example.
    """
    def update(self, Example eg):
        """
        Update the weights from a scored example.

        Arguments:
            eg (Example): Example whose scores and costs are already set.
        Returns: None
        """
        self.time += 1
        guess = eg.guess
        # Highest-scoring class among those whose cost is zero.
        best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
        if guess != best:
            for feat in eg.c.features[:eg.c.nr_feat]:
                # NOTE(review): -value for `best` and +value for `guess` are
                # presumably gradients; confirm against thinc's update_weight.
                self.update_weight(feat.key, best, -feat.value)
                self.update_weight(feat.key, guess, feat.value)

    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
        # Fill the atoms for the window [i-2, i+2] around token i, then let
        # the extracter turn atoms into features.
        # NOTE(review): reads tokens[i-2] and tokens[i+2] unconditionally, so
        # this assumes the token array is padded on both ends; confirm.
        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])

        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the eight context atoms for one token. The slot order must match
    # the per-token layout of the enum above.
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    # Collapse the lexical flags into one small categorical value; the first
    # matching flag wins.
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0


cdef class Tagger:
    """
    Annotate part-of-speech tags on Doc objects.
    """
    @classmethod
    def load(cls, path, vocab, require=False):
        """
        Load the statistical model from the supplied path.

        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                The vocabulary. Must be shared by the documents to be processed.
            require (bool):
                Whether to raise an error if the files are not found.
        Returns (Tagger):
            The newly created object.
        """
        # TODO: Change this to expect config.json when we don't have to
        # support old data.
        path = util.ensure_path(path)
        if (path / 'templates.json').exists():
            with (path / 'templates.json').open('r', encoding='utf8') as file_:
                templates = ujson.load(file_)
        elif require:
            raise IOError(
                "Required file %s/templates.json not found when loading Tagger" % str(path))
        else:
            templates = cls.feature_templates
        self = cls(vocab, model=None, feature_templates=templates)

        if (path / 'model').exists():
            self.model.load(str(path / 'model'))
        elif require:
            raise IOError(
                "Required file %s/model not found when loading Tagger" % str(path))
        return self

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
        """
        Create a Tagger.

        Arguments:
            vocab (Vocab):
                The vocabulary object. Must be shared with documents to be processed.
            model (thinc.linear.AveragedPerceptron):
                The statistical model. If None, a new TaggerModel is created.
            **cfg:
                Extra settings, stored on `self.cfg`. When a model has to be
                created, its templates come from `features` if given, else
                `feature_templates` (the key used by `load`), else the class
                default.
        Returns (Tagger):
            The newly constructed object.
        """
        if model is None:
            # `load` passes the templates it read as `feature_templates`, so
            # that key must be honoured here; previously only `features` was
            # consulted and templates.json was silently ignored.
            if 'features' in cfg:
                templates = cfg['features']
            else:
                templates = cfg.get('feature_templates', self.feature_templates)
            model = TaggerModel(templates, L1=0.0)
        self.vocab = vocab
        self.model = model
        self.model.l1_penalty = 0.0
        # TODO: Move this to tag map
        # Seed every known tag (and the 0 id) with a count of 1.
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1
        self.cfg = cfg

    @property
    def tag_names(self):
        """The tag names, as exposed by the vocabulary's morphology."""
        return self.vocab.morphology.tag_names

    def __reduce__(self):
        # Pickle as (class, (vocab, model)); `cfg` is not carried over.
        return (self.__class__, (self.vocab, self.model), None, None)

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """
        Assign tags to a Doc from a sequence of tag strings.

        Arguments:
            tokens (Doc): The document to tag, modified in place.
            tag_strs: Indexable collection with one tag per token.
        Returns: None
        """
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
        tokens.is_tagged = True
        # Reset the Doc's per-token Python object list.
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
        """
        Apply the tagger, setting the POS tags onto the Doc object.

        Arguments:
            tokens (Doc): The tokens to be tagged.
        Returns:
            None (0 when the document is empty, kept for backward
            compatibility).
        """
        cdef int i, guess
        cdef Example eg
        if tokens.length == 0:
            return 0

        eg = Example(nr_atom=N_CONTEXT_FIELDS,
                     nr_class=self.vocab.morphology.n_tags,
                     nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            # Only predict for tokens whose `pos` is still unset.
            if tokens.c[i].pos == 0:
                self.model.set_featuresC(&eg.c, tokens.c, i)
                self.model.set_scoresC(eg.c.scores,
                                       eg.c.features, eg.c.nr_feat)
                guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid,
                                               eg.c.nr_class)
                self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
                # The Example is reused, so clear scores for the next token.
                eg.fill_scores(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """
        Tag a stream of documents.

        Arguments:
            stream: The sequence of documents to tag.
            batch_size (int):
                Accepted for API compatibility; currently unused, documents
                are tagged one at a time.
            n_threads (int):
                Accepted for API compatibility; currently unused.
        Yields:
            Doc Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
        """
        Update the statistical model, with tags supplied for the given document.

        Arguments:
            tokens (Doc):
                The document to update on. Its tags are overwritten with the
                model's guesses.
            gold (GoldParse):
                Manager for the gold-standard tags. A tag of None marks a
                token with no gold label.
            itn (int):
                Currently unused.
        Returns (int):
            Number of tags correct.
        Raises:
            ValueError: If a gold tag is not one of `self.tag_names`.
        """
        cdef int i
        cdef int correct = 0
        cdef Example eg

        gold_tag_strs = gold.tags
        assert len(tokens) == len(gold_tag_strs)

        # Build the name -> index map once rather than scanning tag_names for
        # every token. setdefault keeps the first index, matching .index().
        tag_ids = {}
        for idx, name in enumerate(self.tag_names):
            tag_ids.setdefault(name, idx)

        msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
               "gold tags, to maintain coarse-grained mapping.")
        golds = []
        for tag in gold_tag_strs:
            if tag is None:
                # -1 marks "no gold tag": every class gets zero cost below.
                golds.append(-1)
            elif tag in tag_ids:
                golds.append(tag_ids[tag])
            else:
                raise ValueError(msg % tag)

        eg = Example(
            nr_atom=N_CONTEXT_FIELDS,
            nr_class=self.vocab.morphology.n_tags,
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            self.model.set_featuresC(&eg.c, tokens.c, i)
            # Cost 0 for the gold class (or for all classes if unlabelled),
            # cost 1 for everything else.
            eg.costs = [1 if golds[i] not in (c, -1) else 0
                        for c in xrange(eg.nr_class)]
            self.model.set_scoresC(eg.c.scores,
                                   eg.c.features, eg.c.nr_feat)
            self.model.update(eg)

            self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)

            correct += eg.cost == 0
            self.freqs[TAG][tokens.c[i].tag] += 1
            # The Example is reused, so clear it for the next token.
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct

    # Default feature templates: each entry is a tuple of atom indices that
    # are conjoined into one feature.
    feature_templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),
        (W_cluster,),
        (N1_cluster,),
        (N2_cluster,),
        (P1_cluster,),
        (P2_cluster,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )