# spaCy/spacy/tagger.pyx
# cython: infer_types=True
# cython: profile=True
import json
import pathlib
from collections import defaultdict

from libc.string cimport memset, memcpy
from libcpp.vector cimport vector
from libc.stdint cimport uint64_t, int32_t, int64_t

cimport numpy as np
import numpy as np
np.import_array()

from cymem.cymem cimport Pool
from thinc.typedefs cimport atom_t, weight_t
from thinc.extra.eg cimport Example
from thinc.structs cimport ExampleC
from thinc.linear.avgtron cimport AveragedPerceptron
from thinc.linalg cimport Vec, VecVec
from thinc.linear.linear import LinearModel
from thinc.structs cimport FeatureC
from thinc.neural.optimizers import Adam
from thinc.neural.ops import NumpyOps

from .typedefs cimport attr_t
from .tokens.doc cimport Doc
from .attrs cimport TAG
from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CCONJ, DET, NOUN, NUM, PRON
from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
from .gold cimport GoldParse
from .attrs cimport *
# Indices into the flat context array passed to the feature extracter.
# Each token in the five-token window (P2, P1, W, N1, N2) contributes
# 8 consecutive slots: orth, cluster, shape, prefix, suffix, pos, lemma,
# flags. N_CONTEXT_FIELDS is the total width (5 * 8 = 40).
cpdef enum:
    P2_orth
    P2_cluster
    P2_shape
    P2_prefix
    P2_suffix
    P2_pos
    P2_lemma
    P2_flags

    P1_orth
    P1_cluster
    P1_shape
    P1_prefix
    P1_suffix
    P1_pos
    P1_lemma
    P1_flags

    W_orth
    W_cluster
    W_shape
    W_prefix
    W_suffix
    W_pos
    W_lemma
    W_flags

    N1_orth
    N1_cluster
    N1_shape
    N1_prefix
    N1_suffix
    N1_pos
    N1_lemma
    N1_flags

    N2_orth
    N2_cluster
    N2_shape
    N2_prefix
    N2_suffix
    N2_pos
    N2_lemma
    N2_flags

    N_CONTEXT_FIELDS
cdef class TaggerModel:
    """Wrapper around a thinc LinearModel that extracts sparse conjunction
    features from atom contexts and feeds them to the linear model."""
    def __init__(self, int nr_tag, templates):
        self.extracter = ConjunctionExtracter(templates)
        self.model = LinearModel(nr_tag)

    def begin_update(self, atom_t[:, ::1] contexts, drop=0.):
        """Extract features for each context row and delegate to the
        underlying LinearModel.

        Arguments:
            contexts (atom_t[:, ::1]):
                One row of N_CONTEXT_FIELDS atoms per token.
            drop (float):
                Dropout rate, forwarded to the linear model.
        Returns:
            Whatever LinearModel.begin_update returns (scores and a
            finish-update callback).
        """
        cdef vector[uint64_t]* keys = new vector[uint64_t]()
        cdef vector[float]* values = new vector[float]()
        cdef vector[int64_t]* lengths = new vector[int64_t]()
        # The constructor argument already sizes the vector, so no
        # separate resize() is needed.
        features = new vector[FeatureC](self.extracter.nr_templ)
        cdef int i, j
        # try/finally so the heap-allocated C++ vectors are freed even if
        # feature extraction or the numpy conversion raises.
        try:
            for i in range(contexts.shape[0]):
                nr_feat = self.extracter.set_features(features.data(),
                                                      &contexts[i, 0])
                for j in range(nr_feat):
                    keys.push_back(features.at(j).key)
                    values.push_back(features.at(j).value)
                lengths.push_back(nr_feat)
            instance = (vector_uint64_2numpy(keys),
                        vector_float_2numpy(values),
                        vector_long_2numpy(lengths))
        finally:
            del keys
            del values
            del lengths
            del features
        return self.model.begin_update(instance, drop=drop)

    def end_training(self, *args, **kwargs):
        # No-op: kept for interface compatibility with other models.
        pass

    def dump(self, *args, **kwargs):
        # No-op: kept for interface compatibility with other models.
        pass
cdef np.ndarray[uint64_t, ndim=1] vector_uint64_2numpy(vector[uint64_t]* vec):
    """Copy a C++ uint64 vector into a fresh 1-d numpy array."""
    # np.empty, not np.zeros: memcpy overwrites every element anyway.
    cdef np.ndarray[uint64_t, ndim=1, mode="c"] arr = np.empty(vec.size(), dtype='uint64')
    if vec.size() != 0:
        # Guard: data() may be null for an empty vector.
        memcpy(arr.data, vec.data(), sizeof(uint64_t) * vec.size())
    return arr
cdef np.ndarray[long, ndim=1] vector_long_2numpy(vector[int64_t]* vec):
    """Copy a C++ int64 vector into a fresh 1-d numpy array."""
    # np.empty, not np.zeros: memcpy overwrites every element anyway.
    cdef np.ndarray[long, ndim=1, mode="c"] arr = np.empty(vec.size(), dtype='int64')
    if vec.size() != 0:
        # Guard: data() may be null for an empty vector.
        memcpy(arr.data, vec.data(), sizeof(int64_t) * vec.size())
    return arr
cdef np.ndarray[float, ndim=1] vector_float_2numpy(vector[float]* vec):
    """Copy a C++ float vector into a fresh 1-d numpy array."""
    # np.empty, not np.zeros: memcpy overwrites every element anyway.
    cdef np.ndarray[float, ndim=1, mode="c"] arr = np.empty(vec.size(), dtype='float32')
    if vec.size() != 0:
        # Guard: data() may be null for an empty vector.
        memcpy(arr.data, vec.data(), sizeof(float) * vec.size())
    return arr
cdef void fill_context(atom_t* context, const TokenC* tokens, int i) nogil:
    # Populate the flat context array for token i from the five-token
    # window tokens[i-2..i+2]. Each call writes the 8 consecutive slots
    # starting at the named enum offset (P2_orth, P1_orth, ...).
    # NOTE(review): indexing tokens[i-2]/tokens[i+2] assumes the token
    # array is padded at both ends, as spaCy Docs usually are -- confirm.
    _fill_from_token(&context[P2_orth], &tokens[i-2])
    _fill_from_token(&context[P1_orth], &tokens[i-1])
    _fill_from_token(&context[W_orth], &tokens[i])
    _fill_from_token(&context[N1_orth], &tokens[i+1])
    _fill_from_token(&context[N2_orth], &tokens[i+2])
cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
    # Write one token's 8 feature atoms into consecutive context slots,
    # matching the orth/cluster/shape/prefix/suffix/pos/lemma/flags
    # layout of the context-field enum.
    context[0] = t.lex.lower
    context[1] = t.lex.cluster
    context[2] = t.lex.shape
    context[3] = t.lex.prefix
    context[4] = t.lex.suffix
    context[5] = t.tag
    context[6] = t.lemma
    # Collapse the lexeme's boolean flag bits into one coarse category;
    # first match wins, 0 means "none of these".
    if t.lex.flags & (1 << IS_ALPHA):
        context[7] = 1
    elif t.lex.flags & (1 << IS_PUNCT):
        context[7] = 2
    elif t.lex.flags & (1 << LIKE_URL):
        context[7] = 3
    elif t.lex.flags & (1 << LIKE_NUM):
        context[7] = 4
    else:
        context[7] = 0
cdef class Tagger:
    """Annotate part-of-speech tags on Doc objects."""
    @classmethod
    def load(cls, path, vocab, require=False):
        """Load the statistical model from the supplied path.

        Arguments:
            path (Path):
                The path to load from.
            vocab (Vocab):
                The vocabulary. Must be shared by the documents to be processed.
            require (bool):
                Whether to raise an error if the files are not found.
        Returns (Tagger):
            The newly created object.
        """
        # TODO: Change this to expect config.json when we don't have to
        # support old data.
        path = path if not isinstance(path, basestring) else pathlib.Path(path)
        if (path / 'templates.json').exists():
            with (path / 'templates.json').open('r', encoding='utf8') as file_:
                templates = json.load(file_)
        elif require:
            raise IOError(
                "Required file %s/templates.json not found when loading Tagger" % str(path))
        else:
            templates = cls.feature_templates
        self = cls(vocab, model=None, feature_templates=templates)

        if (path / 'model').exists():
            self.model.load(str(path / 'model'))
        elif require:
            raise IOError(
                "Required file %s/model not found when loading Tagger" % str(path))
        return self

    def __init__(self, Vocab vocab, TaggerModel model=None, **cfg):
        """Create a Tagger.

        Arguments:
            vocab (Vocab):
                The vocabulary object. Must be shared with documents to be
                processed.
            model (TaggerModel):
                The statistical model; constructed from the vocab's tag
                inventory and the configured feature templates if None.
        Returns (Tagger):
            The newly constructed object.
        """
        if model is None:
            model = TaggerModel(vocab.morphology.n_tags,
                                cfg.get('features', self.feature_templates))
        self.vocab = vocab
        self.model = model
        # TODO: Move this to tag map
        self.freqs = {TAG: defaultdict(int)}
        for tag in self.tag_names:
            self.freqs[TAG][self.vocab.strings[tag]] = 1
        self.freqs[TAG][0] = 1
        self.cfg = cfg
        self.optimizer = Adam(NumpyOps(), 0.001)

    @property
    def tag_names(self):
        # The tag inventory lives on the vocab's morphology.
        return self.vocab.morphology.tag_names

    def __reduce__(self):
        # Pickle support: reconstruct from (vocab, model).
        return (self.__class__, (self.vocab, self.model), None, None)

    def tag_from_strings(self, Doc tokens, object tag_strs):
        """Assign the given tag strings to the tokens, bypassing the model."""
        cdef int i
        for i in range(tokens.length):
            self.vocab.morphology.assign_tag(&tokens.c[i], tag_strs[i])
        tokens.is_tagged = True
        # Invalidate cached Token wrappers so they reflect the new tags.
        tokens._py_tokens = [None] * tokens.length

    def __call__(self, Doc tokens):
        """Apply the tagger, setting the POS tags onto the Doc object.

        Arguments:
            tokens (Doc): The tokens to be tagged.
        Returns:
            None
        """
        if tokens.length == 0:
            return 0

        cdef atom_t[1][N_CONTEXT_FIELDS] c_context
        memset(c_context, 0, sizeof(c_context))
        cdef atom_t[:, ::1] context = c_context
        cdef float[:, ::1] scores
        cdef int nr_class = self.vocab.morphology.n_tags
        for i in range(tokens.length):
            # Only predict for tokens that don't already have a tag.
            if tokens.c[i].pos == 0:
                fill_context(&context[0, 0], tokens.c, i)
                scores, _ = self.model.begin_update(context)
                guess = Vec.arg_max(&scores[0, 0], nr_class)
                self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
                # Zero the scores buffer before the next prediction.
                memset(&scores[0, 0], 0, sizeof(float) * scores.size)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

    def pipe(self, stream, batch_size=1000, n_threads=2):
        """Tag a stream of documents.

        Arguments:
            stream: The sequence of documents to tag.
            batch_size (int):
                The number of documents to accumulate into a working set.
            n_threads (int):
                The number of threads with which to work on the buffer in
                parallel, if the Matcher implementation supports
                multi-threading.
        Yields:
            Doc Documents, in order.
        """
        for doc in stream:
            self(doc)
            yield doc

    def update(self, Doc tokens, GoldParse gold, itn=0):
        """Update the statistical model, with tags supplied for the given
        document.

        Arguments:
            tokens (Doc):
                The document to update on.
            gold (GoldParse):
                Manager for the gold-standard tags.
        Returns (int):
            Number of tags correct.
        """
        cdef int nr_class = self.vocab.morphology.n_tags
        gold_tag_strs = gold.tags
        assert len(tokens) == len(gold_tag_strs)
        for tag in gold_tag_strs:
            # Identity comparison with None (was `tag != None`).
            if tag is not None and tag not in self.tag_names:
                msg = ("Unrecognized gold tag: %s. tag_map.json must contain all "
                       "gold tags, to maintain coarse-grained mapping.")
                raise ValueError(msg % tag)
        # -1 marks tokens with no gold tag; they count as correct below.
        golds = [self.tag_names.index(g) if g is not None else -1
                 for g in gold_tag_strs]
        cdef int correct = 0
        cdef atom_t[:, ::1] context = np.zeros((1, N_CONTEXT_FIELDS), dtype='uint64')
        cdef float[:, ::1] scores
        for i in range(tokens.length):
            fill_context(&context[0, 0], tokens.c, i)
            scores, finish_update = self.model.begin_update(context)
            guess = Vec.arg_max(&scores[0, 0], nr_class)
            self.vocab.morphology.assign_tag_id(&tokens.c[i], guess)
            # Turn the scores into a gradient by subtracting 1 from the
            # gold class, then push it back through the model.
            if golds[i] != -1:
                scores[0, golds[i]] -= 1
            finish_update(scores, lambda *args, **kwargs: None)
            if golds[i] in (guess, -1):
                correct += 1
            self.freqs[TAG][tokens.c[i].tag] += 1
        self.optimizer(self.model.model.weights, self.model.model.d_weights,
                       key=self.model.model.id)
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length
        return correct

    # Default feature templates: conjunctions of context-field atoms.
    feature_templates = (
        (W_orth,),
        (P1_lemma, P1_pos),
        (P2_lemma, P2_pos),
        (N1_orth,),
        (N2_orth,),

        (W_suffix,),
        (W_prefix,),

        (P1_pos,),
        (P2_pos,),
        (P1_pos, P2_pos),
        (P1_pos, W_orth),
        (P1_suffix,),
        (N1_suffix,),

        (W_shape,),
        (W_cluster,),
        (N1_cluster,),
        (N2_cluster,),
        (P1_cluster,),
        (P2_cluster,),

        (W_flags,),
        (N1_flags,),
        (N2_flags,),
        (P1_flags,),
        (P2_flags,),
    )