spaCy/spacy/tagger.pyx

# coding: utf8
from __future__ import unicode_literals

import ujson
from collections import defaultdict

from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport VecVec

from .tokens.doc cimport Doc
from .attrs cimport TAG
from .gold cimport GoldParse
from .attrs cimport *
from . import util


# Indices into the atom (context) array from which features are extracted.
#
# The array is laid out as five groups of eight slots, one group per token of
# the context window: P2 and P1 are the tokens at i-2 and i-1, W is the token
# at i, and N1 and N2 are the tokens at i+1 and i+2 (see
# TaggerModel.set_featuresC, which passes the address of each group's first
# slot to _fill_from_token).
#
# Within a group the slots are written positionally by _fill_from_token as
# context[0]..context[7], so the member order below must stay in sync with
# that function.  Note that the *_orth slot is filled with lex.lower and the
# *_pos slot with the token's `tag` value.
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    # Total number of context slots; used as `nr_atom` when the Tagger
    # allocates an Example.
    N_CONTEXT_FIELDS


cdef class TaggerModel(AveragedPerceptron):
    """
    Linear model that scores tags from a five-token context window.
    """
    def update(self, Example eg):
        """
        Adjust the weights for one scored example.

        Arguments:
            eg (Example):
                An example whose features, scores and costs are already set.
        Returns:
            None
        """
        self.time += 1
        predicted = eg.guess
        # Highest-scoring class among those whose cost is zero.
        target = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)
        if predicted == target:
            # The prediction already has zero cost: nothing to adjust.
            return
        for feat in eg.c.features[:eg.c.nr_feat]:
            # The zero-cost class receives the negated feature value and the
            # predicted class the feature value itself.
            self.update_weight(feat.key, target, -feat.value)
            self.update_weight(feat.key, predicted, feat.value)

    cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:
        # Populate the atoms for the window centred on token i, then run the
        # feature extractor over them.  Each call writes its own group of
        # slots, so the order of the calls does not matter.
        # NOTE(review): indexes tokens[i-2]..tokens[i+2] without bounds
        # checks; presumably the token array is padded at both ends -- confirm.
        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])

        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)


cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write the eight context atoms describing token `t` into
    # context[0]..context[7].
    #
    # The lexical flags are collapsed into a single category code.  The tests
    # are ordered, so the first flag that is set wins:
    #   1 = IS_ALPHA, 2 = IS_PUNCT, 3 = LIKE_URL, 4 = LIKE_NUM, 0 = none.
    cdef atom_t flag_code = 0
    if t.lex.flags & (1 << IS_ALPHA):
        flag_code = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        flag_code = 2
    elif t.lex.flags & (1 << LIKE_URL):
        flag_code = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        flag_code = 4

    # Lexeme-level attributes.
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    # Token-level attributes.
    context[5] = t.tag
    context[6] = t.lemma
    context[7] = flag_code


cdef class Tagger:
    """
    Annotate part-of-speech tags on Doc objects.
    """
    @classmethod
    def load(cls, path, vocab, require=False):
        """
        Load the statistical model from the supplied path.

        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                The vocabulary. Must be shared by the documents to be processed.
            require (bool):
                Whether to raise an error if the files are not found.
        Returns (Tagger):
            The newly created object.
        Raises:
            IOError:
                If `require` is true and either templates.json or the model
                file is missing from `path`.
        """
        # TODO: Change this to expect config.json when we don't have to
        # support old data.
        path = util.ensure_path(path)
        if (path / 'templates.json').exists():
            with (path / 'templates.json').open('r', encoding='utf8') as file_:
                templates = ujson.load(file_)
        elif require:
            raise IOError(
                "Required file %s/templates.json not found when loading Tagger" % str(path))
        else:
            templates = cls.feature_templates
        # The templates are forwarded as the `feature_templates` setting,
        # which __init__ uses when it builds the model.
        self = cls(vocab, model=None, feature_templates=templates)

        if (path / 'model').exists():
            self.model.load(str(path / 'model'))
        elif require:
            raise IOError(
                "Required file %s/model not found when loading Tagger" % str(path))
        return self

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
        """
        Create a Tagger.

        Arguments:
            vocab (Vocab):
                The vocabulary object. Must be shared with documents to be processed.
            model (thinc.linear.AveragedPerceptron):
                The statistical model. If None, a new TaggerModel is created.
            **cfg:
                Extra settings, stored on `self.cfg`. When no model is given,
                the feature templates are read from `features` or, failing
                that, from `feature_templates`; the class-level
                `feature_templates` are the default.
        Returns (Tagger):
            The newly constructed object.
        """
        if model is None:
            # `load` supplies the templates as `feature_templates` whereas
            # direct callers use `features`.  Accept both spellings so that
            # templates read from disk are not silently replaced by the
            # defaults.
            templates = cfg.get('features')
            if templates is None:
                templates = cfg.get('feature_templates', self.feature_templates)
            model = TaggerModel(templates, L1=0.0)
        self.vocab = vocab
        self.model = model
        self.model.l1_penalty = 0.0
        # TODO: Move this to tag map
        # Every known tag, and the id 0, starts with a count of 1.
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1
        self.cfg = cfg

    @property
    def tag_names(self):
        """The tag names known to the vocabulary's morphology."""
        return self.vocab.morphology.tag_names

    def __reduce__(self):
        # Rebuild from the vocab and the model only; the settings in
        # `self.cfg` are not part of the reduced state.
        return (self.__class__, (self.vocab, self.model), None, None)

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """
        Assign the given tags to the document, bypassing the model.

        Arguments:
            tokens (Doc):
                The document to annotate.
            tag_strs:
                One tag per token, indexable by token position.
        Returns:
            None
        """
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
        tokens.is_tagged = True
        # Replace the cached Python token objects with empty slots.
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
        """
        Apply the tagger, setting the POS tags onto the Doc object. Only
        tokens whose `pos` is still 0 are tagged.

        Arguments:
            tokens (Doc): The tokens to be tagged.
        Returns:
            None, or 0 if the document is empty.
        """
        if tokens.length == 0:
            return 0

        cdef int i
        cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,
                                  nr_class=self.vocab.morphology.n_tags,
                                  nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            if tokens.c[i].pos == 0:
                self.model.set_featuresC(&eg.c, tokens.c, i)
                self.model.set_scoresC(eg.c.scores,
                    eg.c.features, eg.c.nr_feat)
                guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)
                self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
                # The example is reused, so clear the scores for the next token.
                eg.fill_scores(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """
        Tag a stream of documents.

        Arguments:
            stream: The sequence of documents to tag.
            batch_size (int):
                The number of documents to accumulate into a working set.
                Accepted but not used: documents are tagged one at a time.
            n_threads (int):
                The number of threads with which to work on the buffer in
                parallel. Accepted but not used by this implementation.
        Yields:
            Doc Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
        """
        Update the statistical model, with tags supplied for the given document.

        Arguments:
            tokens (Doc):
                The document to update on.
            gold (GoldParse):
                Manager for the gold-standard tags. A tag of None marks a
                token without a gold tag.
            itn (int):
                Accepted but not used by this method.
        Returns (int):
            Number of tags correct.
        Raises:
            ValueError:
                If a gold tag is not one of `self.tag_names`.
        """
        gold_tag_strs = gold.tags
        assert len(tokens) == len(gold_tag_strs)
        # Build the name -> id table once instead of scanning tag_names for
        # every token.  setdefault keeps the first position of a name, which
        # is what tag_names.index() would return.
        tag_ids = {}
        for tag_id, name in enumerate(self.tag_names):
            tag_ids.setdefault(name, tag_id)
        for tag in gold_tag_strs:
            if tag is not None and tag not in tag_ids:
                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
                       "gold tags, to maintain coarse-grained mapping.")
                raise ValueError(msg % tag)
        # -1 marks a token without a gold tag.
        golds = [tag_ids[g] if g is not None else -1 for g in gold_tag_strs]
        cdef int i
        cdef int correct = 0
        cdef Example eg = Example(
            nr_atom=N_CONTEXT_FIELDS,
            nr_class=self.vocab.morphology.n_tags,
            nr_feat=self.model.nr_feat)
        for i in range(tokens.length):
            self.model.set_featuresC(&eg.c, tokens.c, i)
            gold_id = golds[i]
            # Cost 0 for the gold class, 1 for every other class.  When the
            # gold tag is missing (-1), every class gets cost 0.
            eg.costs = [1 if gold_id not in (c, -1) else 0 for c in range(eg.nr_class)]
            self.model.set_scoresC(eg.c.scores,
                eg.c.features, eg.c.nr_feat)
            self.model.update(eg)

            # The predicted tag is written to the token, so that it is what
            # later tokens see as their left context.
            self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)

            correct += eg.cost == 0
            self.freqs[TAG][tokens.c[i].tag] += 1
            # The example is reused, so reset it for the next token.
            eg.fill_scores(0, eg.c.nr_class)
            eg.fill_costs(0, eg.c.nr_class)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct


    # Default feature templates: each entry lists the context slots that are
    # combined into one feature.
    feature_templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),
        (W_cluster,),
        (N1_cluster,),
        (N2_cluster,),
        (P1_cluster,),
        (P2_cluster,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )
Clean up imports, unused code, whitespace, docstrings 2017-04-15 13:05:47 +03:00			`# coding: utf8`
			`from __future__ import unicode_literals`

Fix json imports and use ujson 2017-04-15 13:13:34 +03:00			`import ujson`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`from collections import defaultdict`

* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`from cymem.cymem cimport Pool`
Remove unused import statements 2017-03-21 23:08:54 +03:00			`from thinc.typedefs cimport atom_t`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`from thinc.extra.eg cimport Example`
			`from thinc.structs cimport ExampleC`
			`from thinc.linear.avgtron cimport AveragedPerceptron`
			`from thinc.linalg cimport VecVec`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00
			`from .tokens.doc cimport Doc`
			`from .attrs cimport TAG`
Use GoldParse in tagger.update 2016-10-17 01:55:15 +03:00			`from .gold cimport GoldParse`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`from .attrs cimport *`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 13:11:16 +03:00			`from . import util`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`cpdef enum:`
			`P2_orth`
			`P2_cluster`
			`P2_shape`
			`P2_prefix`
			`P2_suffix`
			`P2_pos`
			`P2_lemma`
			`P2_flags`

			`P1_orth`
			`P1_cluster`
			`P1_shape`
			`P1_prefix`
			`P1_suffix`
			`P1_pos`
			`P1_lemma`
			`P1_flags`

			`W_orth`
			`W_cluster`
			`W_shape`
			`W_prefix`
			`W_suffix`
			`W_pos`
			`W_lemma`
			`W_flags`

			`N1_orth`
			`N1_cluster`
			`N1_shape`
			`N1_prefix`
			`N1_suffix`
			`N1_pos`
			`N1_lemma`
			`N1_flags`

			`N2_orth`
			`N2_cluster`
			`N2_shape`
			`N2_prefix`
			`N2_suffix`
			`N2_pos`
			`N2_lemma`
			`N2_flags`

			`N_CONTEXT_FIELDS`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00

* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`cdef class TaggerModel(AveragedPerceptron):`
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`def update(self, Example eg):`
			`self.time += 1`
			`guess = eg.guess`
			`best = VecVec.arg_max_if_zero(eg.c.scores, eg.c.costs, eg.c.nr_class)`
			`if guess != best:`
			`for feat in eg.c.features[:eg.c.nr_feat]:`
Clean up FTRL parsing stuff. 2017-03-16 19:58:20 +03:00			`self.update_weight(feat.key, best, -feat.value)`
			`self.update_weight(feat.key, guess, feat.value)`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`cdef void set_featuresC(self, ExampleC* eg, const TokenC* tokens, int i) except *:`
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`_fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])`
			`_fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])`
			`_fill_from_token(&eg.atoms[W_orth], &tokens[i])`
			`_fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])`
			`_fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])`

			`eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)`


			`cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:`
			`context[0] = t.lex.lower`
			`context[1] = t.lex.cluster`
			`context[2] = t.lex.shape`
			`context[3] = t.lex.prefix`
			`context[4] = t.lex.suffix`
			`context[5] = t.tag`
			`context[6] = t.lemma`
			`if t.lex.flags & (1 << IS_ALPHA):`
			`context[7] = 1`
			`elif t.lex.flags & (1 << IS_PUNCT):`
			`context[7] = 2`
			`elif t.lex.flags & (1 << LIKE_URL):`
			`context[7] = 3`
			`elif t.lex.flags & (1 << LIKE_NUM):`
			`context[7] = 4`
			`else:`
			`context[7] = 0`


* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`cdef class Tagger:`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Annotate part-of-speech tags on Doc objects.`
			`"""`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`@classmethod`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`def load(cls, path, vocab, require=False):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Load the statistical model from the supplied path.`
Fix doc strings 2016-11-01 14:25:36 +03:00
			`Arguments:`
			`path (Path):`
			`The path to load from.`
			`vocab (Vocab):`
			`The vocabulary. Must be shared by the documents to be processed.`
			`require (bool):`
			`Whether to raise an error if the files are not found.`
			`Returns (Tagger):`
			`The newly created object.`
			`"""`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`# TODO: Change this to expect config.json when we don't have to`
			`# support old data.`
Add compat functions and remove old workarounds Add ensure_path util function to handle checking instance of path 2017-04-15 13:11:16 +03:00			`path = util.ensure_path(path)`
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`if (path / 'templates.json').exists():`
Fix JSON in tagger 2016-10-21 02:44:10 +03:00			`with (path / 'templates.json').open('r', encoding='utf8') as file_:`
Fix json imports and use ujson 2017-04-15 13:13:34 +03:00			`templates = ujson.load(file_)`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`elif require:`
			`raise IOError(`
			`"Required file %s/templates.json not found when loading Tagger" % str(path))`
Mostly finished loading refactoring. Design is in place, but doesn't work yet. 2016-09-24 16:42:01 +03:00			`else:`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`templates = cls.feature_templates`
			`self = cls(vocab, model=None, feature_templates=templates)`
access model via sputnik 2015-12-07 08:01:28 +03:00
Finish refactoring data loading 2016-09-24 21:26:17 +03:00			`if (path / 'model').exists():`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`self.model.load(str(path / 'model'))`
			`elif require:`
			`raise IOError(`
			`"Required file %s/model not found when loading Tagger" % str(path))`
			`return self`

			`def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Create a Tagger.`
Fix doc strings 2016-11-01 14:25:36 +03:00
			`Arguments:`
			`vocab (Vocab):`
			`The vocabulary object. Must be shared with documents to be processed.`
			`model (thinc.linear.AveragedPerceptron):`
			`The statistical model.`
			`Returns (Tagger):`
			`The newly constructed object.`
			`"""`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00			`if model is None:`
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`model = TaggerModel(cfg.get('features', self.feature_templates),`
			`L1=0.0)`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`self.vocab = vocab`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`self.model = model`
Set L1 penalty to 0 in tagger. 2017-03-10 03:43:47 +03:00			`self.model.l1_penalty = 0.0`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`# TODO: Move this to tag map`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`self.freqs = {TAG: defaultdict(int)}`
			`for tag in self.tag_names:`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`self.freqs[TAG][self.vocab.strings[tag]] = 1`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`self.freqs[TAG][0] = 1`
Use GoldParse in tagger.update 2016-10-17 01:55:15 +03:00			`self.cfg = cfg`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00			`@property`
			`def tag_names(self):`
* More work on language independent parsing 2015-08-28 04:44:54 +03:00			`return self.vocab.morphology.tag_names`
* Refactor language-independent tagger class 2015-08-26 20:19:21 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`def __reduce__(self):`
			`return (self.__class__, (self.vocab, self.model), None, None)`

			`def tag_from_strings(self, Doc tokens, object tag_strs):`
			`cdef int i`
			`for i in range(tokens.length):`
			`self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])`
			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`

* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`def __call__(self, Doc tokens):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Apply the tagger, setting the POS tags onto the Doc object.`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00
Fix doc strings 2016-11-01 14:25:36 +03:00			`Arguments:`
			`doc (Doc): The tokens to be tagged.`
			`Returns:`
			`None`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`"""`
			`if tokens.length == 0:`
			`return 0`
* Update to use thinc 3.0 2015-11-05 16:25:59 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`cdef Pool mem = Pool()`
* Very scrappy, likely buggy first-cut pickle implementation, to work on Issue #125: allow pickle for Apache Spark. The current implementation sends stuff to temp files, and does almost nothing to ensure all modifiable state is actually preserved. The Language() instance is a deep tree of extension objects, and if pickling during training, some of the C-data state is hard to preserve. 2015-10-12 11:33:11 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`cdef int i, tag`
* Update for thinc 5.0, including changing cost from int to weight_t, and updating the tagger and parser 2016-01-30 16:31:12 +03:00			`cdef Example eg = Example(nr_atom=N_CONTEXT_FIELDS,`
			`nr_class=self.vocab.morphology.n_tags,`
			`nr_feat=self.model.nr_feat)`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`for i in range(tokens.length):`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00			`if tokens.c[i].pos == 0:`
Revert changes to tagger and parser for thinc 6 2017-01-09 19:08:34 +03:00			`self.model.set_featuresC(&eg.c, tokens.c, i)`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`self.model.set_scoresC(eg.c.scores,`
Revert "* Working neural net, but features hacky. Switching to extractor." This reverts commit 7c2f1a673bab69d1b58b6ed87b945e2239fce5b7. 2016-09-21 13:26:14 +03:00			`eg.c.features, eg.c.nr_feat)`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`guess = VecVec.arg_max_if_true(eg.c.scores, eg.c.is_valid, eg.c.nr_class)`
Fix morphology tagger 2016-11-04 21:19:09 +03:00			`self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)`
* Update for modified thinc interface 2016-02-05 13:44:39 +03:00			`eg.fill_scores(0, eg.c.nr_class)`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`
* Add a .pipe method, that takes a stream of input, operates on it, and streams the output. Internally, the stream may be buffered, to allow multi-threading. 2016-02-03 04:04:55 +03:00
			`def pipe(self, stream, batch_size=1000, n_threads=2):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Tag a stream of documents.`
Fix doc strings 2016-11-01 14:25:36 +03:00
			`Arguments:`
			`stream: The sequence of documents to tag.`
			`batch_size (int):`
			`The number of documents to accumulate into a working set.`
			`n_threads (int):`
			`The number of threads with which to work on the buffer in parallel,`
			`if the Matcher implementation supports multi-threading.`
			`Yields:`
			`Doc Documents, in order.`
			`"""`
* Add a .pipe method, that takes a stream of input, operates on it, and streams the output. Internally, the stream may be buffered, to allow multi-threading. 2016-02-03 04:04:55 +03:00			`for doc in stream:`
			`self(doc)`
			`yield doc`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
Add itn argument to tagger.update 2017-03-11 20:12:21 +03:00			`def update(self, Doc tokens, GoldParse gold, itn=0):`
Use consistent formatting for docstrings 2017-04-15 12:59:21 +03:00			`"""`
			`Update the statistical model, with tags supplied for the given document.`
Fix doc strings 2016-11-01 14:25:36 +03:00
			`Arguments:`
			`doc (Doc):`
			`The document to update on.`
			`gold (GoldParse):`
			`Manager for the gold-standard tags.`
			`Returns (int):`
			`Number of tags correct.`
			`"""`
Use GoldParse in tagger.update 2016-10-17 01:55:15 +03:00			`gold_tag_strs = gold.tags`
* Tagger training now working. Still need to test load/save of model. Morphology still broken. 2015-08-27 10:16:11 +03:00			`assert len(tokens) == len(gold_tag_strs)`
* Add error warning that a gold tag is unrecognised 2016-02-03 00:59:59 +03:00			`for tag in gold_tag_strs:`
add tokenizer files for German, add/change code to train German pos tagger - add files to specify rules for German tokenization - change generate_specials.py to generate from an external file (abbrev.de.tab) - copy gazetteer.json from lang_data/en/ - init_model.py - change doc freq threshold to 0 - add train_german_tagger.py - expects conll09-formatted input 2016-02-18 15:24:20 +03:00			`if tag != None and tag not in self.tag_names:`
Refactor training, with new spacy.train module. Defaults still a little awkward. 2016-10-09 13:24:24 +03:00			`msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "`
* Add error warning that a gold tag is unrecognised 2016-02-03 00:59:59 +03:00			`"gold tags, to maintain coarse-grained mapping.")`
			`raise ValueError(msg % tag)`
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]`
			`cdef int correct = 0`
			`cdef Pool mem = Pool()`
* Update for thinc 5.0, including changing cost from int to weight_t, and updating the tagger and parser 2016-01-30 16:31:12 +03:00			`cdef Example eg = Example(`
			`nr_atom=N_CONTEXT_FIELDS,`
			`nr_class=self.vocab.morphology.n_tags,`
			`nr_feat=self.model.nr_feat)`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`for i in range(tokens.length):`
Revert changes to tagger and parser for thinc 6 2017-01-09 19:08:34 +03:00			`self.model.set_featuresC(&eg.c, tokens.c, i)`
add tokenizer files for German, add/change code to train German pos tagger - add files to specify rules for German tokenization - change generate_specials.py to generate from an external file (abbrev.de.tab) - copy gazetteer.json from lang_data/en/ - init_model.py - change doc freq threshold to 0 - add train_german_tagger.py - expects conll09-formatted input 2016-02-18 15:24:20 +03:00			`eg.costs = [ 1 if golds[i] not in (c, -1) else 0 for c in xrange(eg.nr_class) ]`
* Move to thinc 5.0 2016-01-29 05:58:55 +03:00			`self.model.set_scoresC(eg.c.scores,`
Revert "* Working neural net, but features hacky. Switching to extractor." This reverts commit 7c2f1a673bab69d1b58b6ed87b945e2239fce5b7. 2016-09-21 13:26:14 +03:00			`eg.c.features, eg.c.nr_feat)`
Use ftrl optimizer in tagger 2017-03-11 15:59:13 +03:00			`self.model.update(eg)`
* More work on language-generic parsing 2015-08-28 03:02:33 +03:00
Fixed bug: eg.guess is a tag id, rather than tag 2016-11-15 09:11:22 +03:00			`self.vocab.morphology.assign_tag_id(&tokens.c[i], eg.guess)`
Add support for Universal Dependencies v2.0 2017-02-27 00:27:11 +03:00
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`correct += eg.cost == 0`
* Rename Doc.data to Doc.c 2015-11-03 16:15:14 +03:00			`self.freqs[TAG][tokens.c[i].tag] += 1`
* Update for modified thinc interface 2016-02-05 13:44:39 +03:00			`eg.fill_scores(0, eg.c.nr_class)`
			`eg.fill_costs(0, eg.c.nr_class)`
* Refactor away from the _ml module, to use thinc 4.0. Still some work needs to be done, e.g. to add __reduce__ to the models, more testing, etc. 2015-11-06 19:24:30 +03:00			`tokens.is_tagged = True`
			`tokens._py_tokens = [None] * tokens.length`
* Split EnPosTagger up into base class and subclass 2015-08-24 06:25:55 +03:00			`return correct`
Refactor the pipeline classes to make them more consistent, and remove the redundant blank() constructor. 2016-10-16 22:34:57 +03:00

			`feature_templates = (`
			`(W_orth,),`
			`(P1_lemma, P1_pos),`
			`(P2_lemma, P2_pos),`
			`(N1_orth,),`
			`(N2_orth,),`

			`(W_suffix,),`
			`(W_prefix,),`

			`(P1_pos,),`
			`(P2_pos,),`
			`(P1_pos, P2_pos),`
			`(P1_pos, W_orth),`
			`(P1_suffix,),`
			`(N1_suffix,),`

			`(W_shape,),`
			`(W_cluster,),`
			`(N1_cluster,),`
			`(N2_cluster,),`
			`(P1_cluster,),`
			`(P2_cluster,),`

			`(W_flags,),`
			`(N1_flags,),`
			`(N2_flags,),`
			`(P1_flags,),`
			`(P2_flags,),`
			`)`