# spaCy/spacy/tagger.pyx

# coding: utf8
from __future__ import unicode_literals

from collections import defaultdict

from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec

from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *


cpdef enum:
    # Indices into the atom (context) array that feature extraction reads.
    #
    # Five token positions are covered, in array order: two tokens back (P2),
    # one token back (P1), the current token (W), one token ahead (N1) and two
    # tokens ahead (N2). Each position owns eight consecutive slots, which
    # _fill_from_token writes in exactly the order the names are listed here.
    #
    # NOTE: the slot names do not all match what is stored in them. As
    # _fill_from_token is written, the *_orth slot receives the lexeme's
    # `lower` attribute and the *_pos slot receives the token's `tag`.

    # Two tokens before the current one (tokens[i-2]).
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    # One token before the current one (tokens[i-1]).
    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    # The token being tagged (tokens[i]).
    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    # One token after the current one (tokens[i+1]).
    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    # Two tokens after the current one (tokens[i+2]).
    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    # Total number of atom slots; passed as nr_atom when an Example is built.
    N_CONTEXT_FIELDS


cdef class TaggerModel(AveragedPerceptron):
    """Linear model used by the Tagger, built on AveragedPerceptron."""

    def update(self, Example eg):
        """Adjust the weights from one scored example.

        eg (Example): An example whose features, scores and costs have
            already been set by the caller.
        """
        cdef int k
        self.time += 1
        guess = eg.guess
        # Presumably the highest-scoring class among those with zero cost --
        # TODO confirm against thinc's VecVec.arg_max_if_zero.
        best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
        if guess == best:
            # The prediction already agrees with the target class.
            return
        # For every active feature, push the update with opposite signs onto
        # the target class and the wrongly guessed class.
        for k in range(eg.c.nr_feat):
            self.update_weight(eg.c.features[k].key, best,
                               -eg.c.features[k].value)
            self.update_weight(eg.c.features[k].key, guess,
                               eg.c.features[k].value)

    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
        # Fill the atom array with the five-token window centred on token i,
        # then let the extracter turn the atoms into features.
        #
        # NOTE(review): this reads tokens[i-2] .. tokens[i+2] without any
        # bounds check, so it assumes the token array is padded by at least
        # two entries on each side -- confirm against Doc's allocation.
        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])

        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the eight per-token atoms for token `t` into `context[0..7]`.
    # `context` points at the first slot of one position group in the atom
    # array (e.g. &atoms[W_orth]).

    # Attributes taken from the token's lexeme.
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix

    # Attributes taken from the token itself.
    context[5] = t.tag
    context[6] = t.lemma

    # Collapse the lexeme flags into one small category id. The checks are
    # ordered, so the first flag that is set decides the value; 0 means none
    # of the four flags is set.
    context[7] = 0
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4


cdef class Tagger:
    """Annotate part-of-speech tags on Doc objects."""

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
        """Create a Tagger.

        vocab (Vocab): The vocabulary object. Must be shared with documents to
            be processed.
        model (TaggerModel): The statistical model. If None, a new TaggerModel
            is created from cfg['features'], falling back to the class-level
            feature_templates.
        **cfg: Extra settings; stored unchanged on the instance as `cfg`.
        RETURNS (Tagger): The newly constructed object.
        """
        if model is None:
            model = TaggerModel(cfg.get('features', self.feature_templates),
                                L1=0.0)
        self.vocab = vocab
        self.model = model
        self.model.l1_penalty = 0.0
        # TODO: Move this to tag map
        # Every known tag, and the id 0, starts with a count of 1.
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1
        self.cfg = cfg

    @property
    def tag_names(self):
        """The tag names known to the vocabulary's morphology."""
        return self.vocab.morphology.tag_names

    def __reduce__(self):
        # Only the vocab and the model are passed back to the constructor;
        # the contents of `cfg` are not part of the pickled state.
        return (self.__class__, (self.vocab, self.model), None, None)

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """Assign already-known tags to a Doc.

        tokens (Doc): The document to annotate.
        tag_strs: The tags to assign, indexed by token position.
        """
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
        tokens.is_tagged = True
        # Reset the cached Python-level token objects.
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Only tokens whose `pos` is still 0 are tagged; others are left alone.

        tokens (Doc): The tokens to be tagged.
        RETURNS: 0 for an empty Doc, otherwise None.
        """
        if tokens.length == 0:
            return 0

        cdef int i, guess
        cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
                                  nr_class=self.vocab.morphology.n_tags,
                                  nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            if tokens.c[i].pos == 0:
                self.model.set_featuresC(&eg.c, tokens.c, i)
                self.model.set_scoresC(eg.c.scores,
                    eg.c.features, eg.c.nr_feat)
                guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid,
                                               eg.c.nr_class)
                self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
                # The Example is reused, so clear the scores for the next token.
                eg.fill_scores(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """Tag a stream of documents.

        Documents are tagged one at a time, in the order they arrive.

        stream: The sequence of documents to tag.
        batch_size (int): Not used by this implementation; accepted so the
            signature matches other pipeline components.
        n_threads (int): Not used by this implementation; accepted so the
            signature matches other pipeline components.
        YIELDS (Doc): Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
        """Update the statistical model, with tags supplied for the given document.

        The tags written onto `tokens` are the model's guesses, not the gold
        tags.

        tokens (Doc): The document to update on.
        gold (GoldParse): Manager for the gold-standard tags. A tag of None
            marks a token without a gold tag.
        itn (int): Not used by this implementation.
        RETURNS (int): Number of tags predicted correctly.
        RAISES (ValueError): If a gold tag is not one of `tag_names`.
        """
        gold_tag_strs = gold.tags
        assert len(tokens) == len(gold_tag_strs)

        # Build the name -> id table once instead of scanning tag_names for
        # every token. setdefault keeps the first index of a repeated name,
        # which is what tag_names.index() would return.
        tag_ids = {}
        for tag_id, name in enumerate(self.tag_names):
            tag_ids.setdefault(name, tag_id)

        golds = []
        for tag in gold_tag_strs:
            if tag is None:
                golds.append(-1)
            elif tag in tag_ids:
                golds.append(tag_ids[tag])
            else:
                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
                       "gold tags, to maintain coarse-grained mapping.")
                raise ValueError(msg % tag)

        cdef int i
        cdef int correct = 0
        cdef Example eg = Example(
            nr_atom=N_CONTEXT_FIELDS,
            nr_class=self.vocab.morphology.n_tags,
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            self.model.set_featuresC(&eg.c, tokens.c, i)
            gold_id = golds[i]
            # Cost 0 for the gold class and 1 for every other class. When the
            # token has no gold tag (-1), every class gets cost 0.
            eg.costs = [0 if gold_id in (c, -1) else 1
                        for c in range(eg.nr_class)]
            self.model.set_scoresC(eg.c.scores,
                eg.c.features, eg.c.nr_feat)
            self.model.update(eg)

            self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)

            # NOTE(review): eg.cost is presumably the cost of the guessed
            # class, in which case tokens without a gold tag always count as
            # correct -- confirm against thinc's Example.
            correct += eg.cost == 0
            self.freqs[TAG][tokens.c[i].tag] += 1
            # The Example is reused, so clear it for the next token.
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct


    # Default feature templates. Each tuple lists the atom indices that are
    # combined into one feature; used when no 'features' entry is given in cfg.
    feature_templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),
        (W_cluster,),
        (N1_cluster,),
        (N2_cluster,),
        (P1_cluster,),
        (P2_cluster,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`from collections import defaultdict`

* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`from cymem.cymem cimport Pool`
Remove unused import statements 2017-03-21 23:08:54 +03:00			`from thinc.typedefs cimport atom_t`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`from thinc.extra.eg cimport Example`
			`from thinc.structs cimport ExampleC`
			`from thinc.linear.avgtron cimport AveragedPerceptron`
			`from thinc.linalg cimport VecVec`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00
			`from .tokens.doc cimport Doc`
			`from .attrs cimport TAG`
Use GoldParse in tagger.update 2016-10-17 01:55:15 +03:00			`from .gold cimport GoldParse`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`from .attrs cimport *`

Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`cpdef enum:`
			`P2_orth`
			`P2_cluster`
			`P2_shape`
			`P2_prefix`
			`P2_suffix`
			`P2_pos`
			`P2_lemma`
			`P2_flags`

			`P1_orth`
			`P1_cluster`
			`P1_shape`
			`P1_prefix`
			`P1_suffix`
			`P1_pos`
			`P1_lemma`
			`P1_flags`

			`W_orth`
			`W_cluster`
			`W_shape`
			`W_prefix`
			`W_suffix`
			`W_pos`
			`W_lemma`
			`W_flags`

			`N1_orth`
			`N1_cluster`
			`N1_shape`
			`N1_prefix`
			`N1_suffix`
			`N1_pos`
			`N1_lemma`
			`N1_flags`

			`N2_orth`
			`N2_cluster`
			`N2_shape`
			`N2_prefix`
			`N2_suffix`
			`N2_pos`
			`N2_lemma`
			`N2_flags`

			`N_CONTEXT_FIELDS`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00

* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`cdef class TaggerModel(AveragedPerceptron):`
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`def update(self, Example eg):`
			`self.time += 1`
			`guess = eg.guess`
			`best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)`
			`if guess != best:`
			`for feat in eg.c.features[:eg.c.nr_feat]:`
Clean up FTRL parsing stuff. 2017-03-16 19:58:20 +03:00			`self.update_weight(feat.key, best, -feat.value)`
			`self.update_weight(feat.key, guess, feat.value)`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:`
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])`
			`_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])`
			`_fill_from_token(&eg.atoms[W_orth], &tokens[i])`
			`_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])`
			`_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])`

			`eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)`


			`cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:`
			`context[0] = t.lex.lower`
			`context[1] = t.lex.cluster`
			`context[2] = t.lex.shape`
			`context[3] = t.lex.prefix`
			`context[4] = t.lex.suffix`
			`context[5] = t.tag`
			`context[6] = t.lemma`
			`if t.lex.flags & (1 << IS_ALPHA):`
			`context[7] = 1`
			`elif t.lex.flags & (1 << IS_PUNCT):`
			`context[7] = 2`
			`elif t.lex.flags & (1 << LIKE_URL):`
			`context[7] = 3`
			`elif t.lex.flags & (1 << LIKE_NUM):`
			`context[7] = 4`
			`else:`
			`context[7] = 0`


* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`cdef class Tagger:`
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`"""Annotate part-of-speech tags on Doc objects."""`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00
			`def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):`
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`"""Create a Tagger.`
Fix doc strings 2016-11-01 14:25:36 +03:00
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`vocab (Vocab): The vocabulary object. Must be shared with documents to`
			`be processed.`
			`model (thinc.linear.AveragedPerceptron): The statistical model.`
			`RETURNS (Tagger): The newly constructed object.`
Fix doc strings 2016-11-01 14:25:36 +03:00			`"""`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`if model is None:`
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`model = TaggerModel(cfg.get('features', self.feature_templates),`
			`L1=0.0)`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`self.vocab = vocab`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`self.model = model`
Set L1 penalty to 0 in tagger. 2017-03-10 03:43:47 +03:00			`self.model.l1_penalty = 0.0`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`# TODO: Move this to tag map`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`self.freqs = {TAG: defaultdict(int)}`
			`for tag in self.tag_names:`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`self.freqs[TAG][self.vocab.strings[tag]] = 1`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`self.freqs[TAG][0] = 1`
Use GoldParse in tagger.update 2016-10-17 01:55:15 +03:00			`self.cfg = cfg`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`@property`
			`def tag_names(self):`
* More work on language independent parsing 2015-08-28 04:44:54 +03:00			`return self.vocab.morphology.tag_names`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`def __reduce__(self):`
			`return (self.__class__, (self.vocab, self.model), None, None)`

			`def tag_from_strings(self, Doc tokens, object tag_strs):`
			`cdef int i`
			`for i in range(tokens.length):`
			`self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])`
			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`

* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`def __call__(self, Doc tokens):`
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`"""Apply the tagger, setting the POS tags onto the Doc object.`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`doc (Doc): The tokens to be tagged.`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`"""`
			`if tokens.length == 0:`
			`return 0`
* Update to use thinc 3.0 2015-11-05 16:25:59 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`cdef Pool mem = Pool()`
* Very scrappy, likely buggy first-cut pickle implementation, to work on Issue #125: allow pickle for Apache Spark. The current implementation sends stuff to temp files, and does almost nothing to ensure all modifiable state is actually preserved. The Language() instance is a deep tree of extension objects, and if pickling during training, some of the C-data state is hard to preserve. 2015-10-12 11:33:11 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`cdef int i, tag`
* Update for thinc 5.0, including changing cost from int to weight_t, and updating the tagger and parser 2016-01-30 16:31:12 +03:00			`cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,`
			`nr_class=self.vocab.morphology.n_tags,`
			`nr_feat=self.model.nr_feat)`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`for i in range(tokens.length):`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00			`if tokens.c[i].pos == 0:`
Revert changes to tagger and parser for thinc 6 2017-01-09 19:08:34 +03:00			`self.model.set_featuresC(&eg.c, tokens.c, i)`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`self.model.set_scoresC(eg.c.scores,`
Revert "* Working neural net, but features hacky. Switching to extractor." This reverts commit 7c2f1a673bab69d1b58b6ed87b945e2239fce5b7. 2016-09-21 13:26:14 +03:00			`eg.c.features, eg.c.nr_feat)`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)`
Fix morphology tagger 2016-11-04 21:19:09 +03:00			`self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)`
* Update for modified thinc interface 2016-02-05 13:44:39 +03:00			`eg.fill_scores(0, eg.c.nr_class)`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`
* Add a .pipe method, that takes a stream of input, operates on it, and streams the output. Internally, the stream may be buffered, to allow multi-threading. 2016-02-03 04:04:55 +03:00
			`def pipe(self, stream, batch_size=1000, n_threads=2):`
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`"""Tag a stream of documents.`
Fix doc strings 2016-11-01 14:25:36 +03:00
			`Arguments:`
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`stream: The sequence of documents to tag.`
			`batch_size (int): The number of documents to accumulate into a working set.`
			`n_threads (int): The number of threads with which to work on the buffer`
			`in parallel, if the Matcher implementation supports multi-threading.`
			`YIELDS (Doc): Documents, in order.`
Fix doc strings 2016-11-01 14:25:36 +03:00			`"""`
* Add a .pipe method, that takes a stream of input, operates on it, and streams the output. Internally, the stream may be buffered, to allow multi-threading. 2016-02-03 04:04:55 +03:00			`for doc in stream:`
			`self(doc)`
			`yield doc`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
Add itn argument to tagger.update 2017-03-11 20:12:21 +03:00			`def update(self, Doc tokens, GoldParse gold, itn=0):`
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`"""Update the statistical model, with tags supplied for the given document.`
Fix doc strings 2016-11-01 14:25:36 +03:00
Update docstrings and remove deprecated load classmethod 2017-05-21 14:27:52 +03:00			`doc (Doc): The document to update on.`
			`gold (GoldParse): Manager for the gold-standard tags.`
			`RETURNS (int): Number of tags predicted correctly.`
Fix doc strings 2016-11-01 14:25:36 +03:00			`"""`
Use GoldParse in tagger.update 2016-10-17 01:55:15 +03:00			`gold_tag_strs = gold.tags`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`assert len(tokens) == len(gold_tag_strs)`
* Add error warning that a gold tag is unrecognised 2016-02-03 00:59:59 +03:00			`for tag in gold_tag_strs:`
add tokenizer files for German, add/change code to train German pos tagger - add files to specify rules for German tokenization - change generate_specials.py to generate from an external file (abbrev.de.tab) - copy gazetteer.json from lang_data/en/ - init_model.py - change doc freq threshold to 0 - add train_german_tagger.py - expects conll09-formatted input 2016-02-18 15:24:20 +03:00			`if tag != None and tag not in self.tag_names:`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "`
* Add error warning that a gold tag is unrecognised 2016-02-03 00:59:59 +03:00			`"gold tags, to maintain coarse-grained mapping.")`
			`raise ValueError(msg % tag)`
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]`
			`cdef int correct = 0`
			`cdef Pool mem = Pool()`
* Update for thinc 5.0, including changing cost from int to weight_t, and updating the tagger and parser 2016-01-30 16:31:12 +03:00			`cdef Example eg = Example(`
			`nr_atom=N_CONTEXT_FIELDS,`
			`nr_class=self.vocab.morphology.n_tags,`
			`nr_feat=self.model.nr_feat)`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`for i in range(tokens.length):`
Revert changes to tagger and parser for thinc 6 2017-01-09 19:08:34 +03:00			`self.model.set_featuresC(&eg.c, tokens.c, i)`
add tokenizer files for German, add/change code to train German pos tagger - add files to specify rules for German tokenization - change generate_specials.py to generate from an external file (abbrev.de.tab) - copy gazetteer.json from lang_data/en/ - init_model.py - change doc freq threshold to 0 - add train_german_tagger.py - expects conll09-formatted input 2016-02-18 15:24:20 +03:00			`eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`self.model.set_scoresC(eg.c.scores,`
Revert "* Working neural net, but features hacky. Switching to extractor." This reverts commit 7c2f1a673bab69d1b58b6ed87b945e2239fce5b7. 2016-09-21 13:26:14 +03:00			`eg.c.features, eg.c.nr_feat)`
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`self.model.update(eg)`
* More work on language-generic parsing 2015-08-28 03:02:33 +03:00
Fixed bug: eg.guess is a tag id, rather than tag 2016-11-15 09:11:22 +03:00			`self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`correct += eg.cost == 0`
* Rename Doc.data to Doc.c 2015-11-03 16:15:14 +03:00			`self.freqs[TAG][tokens.c[i].tag] += 1`
* Update for modified thinc interface 2016-02-05 13:44:39 +03:00			`eg.fill_scores(0, eg.c.nr_class)`
			`eg.fill_costs(0, eg.c.nr_class)`
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`return correct`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00

			`feature_templates = (`
			`(W_orth,),`
			`(P1_lemma, P1_pos),`
			`(P2_lemma, P2_pos),`
			`(N1_orth,),`
			`(N2_orth,),`

			`(W_suffix,),`
			`(W_prefix,),`

			`(P1_pos,),`
			`(P2_pos,),`
			`(P1_pos, P2_pos),`
			`(P1_pos, W_orth),`
			`(P1_suffix,),`
			`(N1_suffix,),`

			`(W_shape,),`
			`(W_cluster,),`
			`(N1_cluster,),`
			`(N2_cluster,),`
			`(P1_cluster,),`
			`(P2_cluster,),`

			`(W_flags,),`
			`(N1_flags,),`
			`(N2_flags,),`
			`(P1_flags,),`
			`(P2_flags,),`
			`)`