Mirror of https://github.com/explosion/spaCy.git
Update to use thinc 3.0

commit b9991fbd20
parent 802ad3d71a
@@ -10,6 +10,8 @@ import json
 import cython
 import numpy.random
 
+from libc.string cimport memcpy
+
 from thinc.features cimport Feature, count_feats
 from thinc.api cimport Example
 
@@ -52,28 +54,12 @@ cdef class Model:
     cdef const weight_t* score(self, atom_t* context) except NULL:
         memcpy(self._eg.c.atoms, context, self._eg.c.nr_atom * sizeof(context[0]))
         self._model(self._eg)
-        return self._eg.scores
+        return self._eg.c.scores
 
     cdef int set_scores(self, weight_t* scores, atom_t* context) nogil:
-        cdef int nr_feat = self._model.extractor.set_feats(self._eg.features, context)
+        cdef int nr_feat = self._extractor.set_feats(self._eg.c.features, context)
 
-        self._model.set_scores(
-            scores,
-            self._model.weights.c_map,
-            self._eg.c.features,
-            nr_feat
-        )
-
-    cdef int update(self, atom_t* context, class_t guess, class_t gold, int cost) except -1:
-        cdef int n_feats
-        if cost == 0:
-            self._model.update({})
-        else:
-            feats = self._extractor.get_feats(context, &n_feats)
-            counts = {gold: {}, guess: {}}
-            count_feats(counts[gold], feats, n_feats, cost)
-            count_feats(counts[guess], feats, n_feats, -cost)
-            self._model.update(counts)
+        self._model.set_scores(scores, self._eg.c.features, nr_feat)
 
     def end_training(self, model_loc=None):
         if model_loc is None:
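Read as straight code rather than diff hunks, the scoring path of cdef class Model after this change looks like the sketch below. It only restates what the hunk above shows, with comments added; self._eg (a reusable thinc Example), self._extractor and self._model come from the surrounding class, and the exact thinc 3.0 signatures are assumed rather than verified.

cdef const weight_t* score(self, atom_t* context) except NULL:
    # Copy the caller's atoms into the Example owned by the model ...
    memcpy(self._eg.c.atoms, context, self._eg.c.nr_atom * sizeof(context[0]))
    # ... run the model in place ...
    self._model(self._eg)
    # ... and read the scores back from the Example's C struct.
    return self._eg.c.scores

cdef int set_scores(self, weight_t* scores, atom_t* context) nogil:
    # Feature extraction and scoring are delegated to thinc; the weights table
    # (previously threaded through as self._model.weights.c_map) is no longer
    # passed by hand.
    cdef int nr_feat = self._extractor.set_feats(self._eg.c.features, context)
    self._model.set_scores(scores, self._eg.c.features, nr_feat)

The old Model.update, which built per-class count dictionaries with count_feats, is removed outright here; training appears to go through an Example-based call instead, as the tagger hunks further down suggest.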
@@ -40,7 +40,7 @@ from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass
 
-from .._ml cimport arg_max_if_true
+from thinc.learner cimport arg_max_if_true
 
 
 DEBUG = False
@@ -7,6 +7,3 @@ cdef class Tagger:
     cdef readonly Vocab vocab
     cdef readonly Model model
     cdef public dict freqs
-
-    cdef int predict(self, int i, const TokenC* tokens) except -1
-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1
@@ -3,6 +3,8 @@ from os import path
 from collections import defaultdict
 
 from thinc.typedefs cimport atom_t, weight_t
+from thinc.learner cimport arg_max, arg_max_if_true, arg_max_if_zero
+from thinc.api cimport Example
 
 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
@@ -11,7 +13,6 @@ from .parts_of_speech cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON
 from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
-from ._ml cimport arg_max
 
 
 cpdef enum:
@@ -138,12 +139,15 @@ cdef class Tagger:
         """
         if tokens.length == 0:
             return 0
+
+        cdef Example eg = self.model._eg
         cdef int i
-        cdef const weight_t* scores
         for i in range(tokens.length):
             if tokens.c[i].pos == 0:
-                guess = self.predict(i, tokens.c)
-                self.vocab.morphology.assign_tag(&tokens.c[i], guess)
+                eg.wipe()
+                fill_atoms(eg.c.atoms, tokens.c, i)
+                self.model(eg)
+                self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
 
         tokens.is_tagged = True
         tokens._py_tokens = [None] * tokens.length
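The resulting tagging loop, written out without the diff markers: instead of calling the removed Tagger.predict, each token is pushed through the model's reusable Example. A sketch only; eg.wipe(), eg.c.atoms and eg.c.guess are assumed to behave as the hunk implies.

cdef Example eg = self.model._eg        # one Example, reused for every token
cdef int i
for i in range(tokens.length):
    if tokens.c[i].pos == 0:
        eg.wipe()                            # clear the previous token's atoms and scores
        fill_atoms(eg.c.atoms, tokens.c, i)  # token-window features around position i
        self.model(eg)                       # forward pass; fills eg.c.scores and eg.c.guess
        self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)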
@@ -169,38 +173,25 @@
             raise ValueError(
                 [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
         correct = 0
+        cdef Example eg = self.model._eg
         for i in range(tokens.length):
-            guess = self.update(i, tokens.c, golds[i])
-            loss = golds[i] != -1 and guess != golds[i]
+            eg.wipe()
+            fill_atoms(eg.c.atoms, tokens.c, i)
+            self.train(eg)
 
-            self.vocab.morphology.assign_tag(&tokens.c[i], guess)
+            self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
 
-            correct += loss == 0
+            correct += eg.c.cost == 0
             self.freqs[TAG][tokens.c[i].tag] += 1
         return correct
 
-    cdef int predict(self, int i, const TokenC* tokens) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        return arg_max(scores, self.model.n_classes)
 
-    cdef int update(self, int i, const TokenC* tokens, int gold) except -1:
-        cdef atom_t[N_CONTEXT_FIELDS] context
-        _fill_from_token(&context[P2_orth], &tokens[i-2])
-        _fill_from_token(&context[P1_orth], &tokens[i-1])
-        _fill_from_token(&context[W_orth], &tokens[i])
-        _fill_from_token(&context[N1_orth], &tokens[i+1])
-        _fill_from_token(&context[N2_orth], &tokens[i+2])
-        scores = self.model.score(context)
-        guess = arg_max(scores, self.model.n_classes)
-        loss = guess != gold if gold != -1 else 0
-        self.model.update(context, guess, gold, loss)
-        return guess
+cdef inline void fill_atoms(atom_t* atoms, const TokenC* tokens, int i) nogil:
+    _fill_from_token(&atoms[P2_orth], &tokens[i-2])
+    _fill_from_token(&atoms[P1_orth], &tokens[i-1])
+    _fill_from_token(&atoms[W_orth], &tokens[i])
+    _fill_from_token(&atoms[N1_orth], &tokens[i+1])
+    _fill_from_token(&atoms[N2_orth], &tokens[i+2])
 
 
 cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
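The context-filling code that was duplicated across the removed predict and update methods survives as the single fill_atoms helper above. For completeness, the resulting training loop reads roughly as below; how the gold tags reach the Example is not visible in this hunk, so it is left out of the sketch, and eg.c.cost is assumed to hold the per-token loss after self.train(eg).

cdef Example eg = self.model._eg
for i in range(tokens.length):
    eg.wipe()
    fill_atoms(eg.c.atoms, tokens.c, i)      # same five-token window as prediction
    self.train(eg)                           # updates the model; sets eg.c.guess and eg.c.cost
    self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
    correct += eg.c.cost == 0                # cost comes from the Example, not a local loss
    self.freqs[TAG][tokens.c[i].tag] += 1
return correct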