* Refactor away from the _ml module to use thinc 4.0. Some work remains, e.g. adding __reduce__ to the models, more testing, etc.

2025-10-24 04:31:17 +03:00 · 2015-11-07 03:24:30 +11:00 · 2015-11-07 03:24:30 +11:00 · 3c162dcac3
commit 3c162dcac3
parent c339783bbe
9 changed files with 155 additions and 142 deletions
--- a/setup.py
+++ b/setup.py
@ -210,7 +210,6 @@ MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass', 
-             'spacy._ml',
             'spacy.tokenizer',
             'spacy.syntax.parser', 
             'spacy.syntax.transition_system',
--- a/spacy/language.py
+++ b/spacy/language.py
@ -1,3 +1,4 @@
+from __future__ import absolute_import
 from os import path
 from warnings import warn
 import io
@ -13,7 +14,6 @@ from .syntax.parser import Parser
 from .tagger import Tagger
 from .matcher import Matcher
 from .serialize.packer import Packer
-from ._ml import Model
 from . import attrs
 from . import orth
 from .syntax.ner import BiluoPushDown
@ -245,9 +245,12 @@ class Language(object):
    def end_training(self, data_dir=None):
        if data_dir is None:
            data_dir = self.data_dir
-        self.parser.model.end_training(path.join(data_dir, 'deps', 'model'))
-        self.entity.model.end_training(path.join(data_dir, 'ner', 'model'))
-        self.tagger.model.end_training(path.join(data_dir, 'pos', 'model'))
+        self.parser.model.end_training()
+        self.parser.model.dump(path.join(data_dir, 'deps', 'model'))
+        self.entity.model.end_training()
+        self.entity.model.dump(path.join(data_dir, 'ner', 'model'))
+        self.tagger.model.end_training()
+        self.tagger.model.dump(path.join(data_dir, 'pos', 'model'))

        strings_loc = path.join(data_dir, 'vocab', 'strings.json')
        with io.open(strings_loc, 'w', encoding='utf8') as file_:
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -78,7 +78,7 @@ cdef class StringStore:
    def __init__(self, strings=None):
        self.mem = Pool()
        self._map = PreshMap()
-        self._resize_at = 10
+        self._resize_at = 10000
        self.c = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
        self.size = 1
        if strings is not None:
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@ -1,18 +1,17 @@
 from thinc.search cimport Beam
+from thinc.api cimport AveragedPerceptron
+from thinc.api cimport Example, ExampleC

-from .._ml cimport Model
-
+from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
-
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
-from thinc.api cimport Example, ExampleC
-from .stateclass cimport StateClass
+
+
+cdef class ParserModel(AveragedPerceptron):
+    cdef void set_features(self, ExampleC* eg, StateClass stcls) except *


 cdef class Parser:
-    cdef readonly Model model
+    cdef readonly ParserModel model
    cdef readonly TransitionSystem moves
-
-    cdef void parse(self, StateClass stcls, ExampleC eg) nogil
-    cdef void predict(self, StateClass stcls, ExampleC* eg) nogil
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@ -18,18 +18,15 @@ import sys
 from cymem.cymem cimport Pool, Address
 from murmurhash.mrmr cimport hash64
 from thinc.typedefs cimport weight_t, class_t, feat_t, atom_t, hash_t
+from thinc.features cimport ConjunctionExtracter

 from util import Config

-from thinc.api cimport Example, ExampleC
-
-
 from ..structs cimport TokenC

 from ..tokens.doc cimport Doc
 from ..strings cimport StringStore

-
 from .transition_system import OracleError
 from .transition_system cimport TransitionSystem, Transition

@ -40,7 +37,6 @@ from ._parse_features cimport CONTEXT_SIZE
 from ._parse_features cimport fill_context
 from .stateclass cimport StateClass

-from thinc.learner cimport arg_max_if_true


 DEBUG = False
@ -66,8 +62,18 @@ def ParserFactory(transition_system):
    return lambda strings, dir_: Parser(strings, dir_, transition_system)


+cdef class ParserModel(AveragedPerceptron):
+    def __init__(self, n_classes, templates):
+        AveragedPerceptron.__init__(self, n_classes,
+            ConjunctionExtracter(CONTEXT_SIZE, templates))
+
+    cdef void set_features(self, ExampleC* eg, StateClass stcls) except *: 
+        fill_context(eg.atoms, stcls)
+        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
+
+
 cdef class Parser:
-    def __init__(self, StringStore strings, transition_system, model):
+    def __init__(self, StringStore strings, transition_system, ParserModel model):
        self.moves = transition_system
        self.model = model

@ -80,54 +86,50 @@ cdef class Parser:
        cfg = Config.read(model_dir, 'config')
        moves = transition_system(strings, cfg.labels)
        templates = get_templates(cfg.features)
-        model = Model(moves.n_moves, templates, model_dir)
+        model = ParserModel(moves.n_moves, templates)
+        if path.exists(path.join(model_dir, 'model')):
+            model.load(path.join(model_dir, 'model'))
        return cls(strings, moves, model)

+    def __reduce__(self):
+        return (Parser, (self.moves.strings, self.moves, self.model), None, None)
+
    def __call__(self, Doc tokens):
        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
        self.moves.initialize_state(stcls)

-        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
-                                  self.model.n_feats, self.model.n_feats)
-        self.parse(stcls, eg.c)
-        tokens.set_parse(stcls._sent)
-
-    def __reduce__(self):
-        return (Parser, (self.moves.strings, self.moves, self.model), None, None)
-
-    cdef void predict(self, StateClass stcls, ExampleC* eg) nogil:
-        memset(eg.scores, 0, eg.nr_class * sizeof(weight_t))
-        self.moves.set_valid(eg.is_valid, stcls)
-        fill_context(eg.atoms, stcls)
-        self.model.set_scores(eg.scores, eg.atoms)
-        eg.guess = arg_max_if_true(eg.scores, eg.is_valid, self.model.n_classes)
-
-    cdef void parse(self, StateClass stcls, ExampleC eg) nogil:
+        cdef Pool mem = Pool()
+        cdef ExampleC eg = self.model.allocate(mem)
        while not stcls.is_final():
-            self.predict(stcls, &eg)
-            if not eg.is_valid[eg.guess]:
-                break
-            self.moves.c[eg.guess].do(stcls, self.moves.c[eg.guess].label)
-        self.moves.finalize_state(stcls)
+            self.model.set_features(&eg, stcls)
+            self.moves.set_valid(eg.is_valid, stcls)
+            self.model.set_prediction(&eg)

+            assert eg.is_valid[eg.guess]
+            
+            action = self.moves.c[eg.guess]
+            action.do(stcls, action.label)
+        self.moves.finalize_state(stcls)
+        tokens.set_parse(stcls._sent)
+  
    def train(self, Doc tokens, GoldParse gold):
        self.moves.preprocess_gold(gold)
        cdef StateClass stcls = StateClass.init(tokens.c, tokens.length)
        self.moves.initialize_state(stcls)
-        cdef Example eg = Example(self.model.n_classes, CONTEXT_SIZE,
-                                  self.model.n_feats, self.model.n_feats)
+        cdef Pool mem = Pool()
+        cdef ExampleC eg = self.model.allocate(mem)
        cdef weight_t loss = 0
        words = [w.orth_ for w in tokens]
-        cdef Transition G
+        cdef Transition action
        while not stcls.is_final():
-            memset(eg.c.scores, 0, eg.c.nr_class * sizeof(weight_t))
-            self.moves.set_costs(eg.c.is_valid, eg.c.costs, stcls, gold)
-            fill_context(eg.c.atoms, stcls)
-            self.model.train(eg)
-            G = self.moves.c[eg.c.guess]
+            self.model.set_features(&eg, stcls)
+            self.moves.set_costs(eg.is_valid, eg.costs, stcls, gold)
+            self.model.set_prediction(&eg)
+            self.model.update(&eg)

-            self.moves.c[eg.c.guess].do(stcls, self.moves.c[eg.c.guess].label)
-            loss += eg.c.loss
+            action = self.moves.c[eg.guess]
+            action.do(stcls, action.label)
+            loss += eg.costs[eg.guess]
        return loss

    def step_through(self, Doc doc):
@ -176,7 +178,10 @@ cdef class StepwiseState:
                for i in range(self.stcls.length)]

    def predict(self):
-        self.parser.predict(self.stcls, &self.eg.c)
+        self.parser.model.set_features(&self.eg.c, self.stcls)
+        self.parser.moves.set_valid(self.eg.c.is_valid, self.stcls)
+        self.parser.model.set_prediction(&self.eg.c)
+
        action = self.parser.moves.c[self.eg.c.guess]
        return self.parser.moves.move_name(action.move, action.label)

--- a/spacy/tagger.pxd
+++ b/spacy/tagger.pxd
@ -1,9 +1,17 @@
-from ._ml cimport Model
+from thinc.api cimport AveragedPerceptron
+from thinc.api cimport ExampleC
+
 from .structs cimport TokenC
 from .vocab cimport Vocab


+cdef class TaggerModel(AveragedPerceptron):
+    cdef void set_features(self, ExampleC* eg, const TokenC* tokens, int i) except *
+    cdef void set_costs(self, ExampleC* eg, int gold) except *
+    cdef void update(self, ExampleC* eg) except *
+ 
+
 cdef class Tagger:
    cdef readonly Vocab vocab
-    cdef readonly Model model
+    cdef readonly TaggerModel model
    cdef public dict freqs
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@ -1,10 +1,12 @@
 import json
 from os import path
 from collections import defaultdict
+from libc.string cimport memset

+from cymem.cymem cimport Pool
 from thinc.typedefs cimport atom_t, weight_t
-from thinc.learner cimport arg_max, arg_max_if_true, arg_max_if_zero
-from thinc.api cimport Example
+from thinc.api cimport Example, ExampleC
+from thinc.features cimport ConjunctionExtracter

 from .typedefs cimport attr_t
 from .tokens.doc cimport Doc
@ -64,6 +66,44 @@ cpdef enum:
    N_CONTEXT_FIELDS


+cdef class TaggerModel(AveragedPerceptron):
+    def __init__(self, n_classes, templates):
+        AveragedPerceptron.__init__(self, n_classes,
+            ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
+
+    cdef void set_features(self, ExampleC* eg, const TokenC* tokens, int i) except *:
+        _fill_from_token(&eg.atoms[P2_orth], &tokens[i-2])
+        _fill_from_token(&eg.atoms[P1_orth], &tokens[i-1])
+        _fill_from_token(&eg.atoms[W_orth], &tokens[i])
+        _fill_from_token(&eg.atoms[N1_orth], &tokens[i+1])
+        _fill_from_token(&eg.atoms[N2_orth], &tokens[i+2])
+
+        eg.nr_feat = self.extracter.set_features(eg.features, eg.atoms)
+
+    cdef void update(self, ExampleC* eg) except *:
+        self.updater.update(eg)
+   
+
+cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
+    context[0] = t.lex.lower
+    context[1] = t.lex.cluster
+    context[2] = t.lex.shape
+    context[3] = t.lex.prefix
+    context[4] = t.lex.suffix
+    context[5] = t.tag
+    context[6] = t.lemma
+    if t.lex.flags & (1 << IS_ALPHA):
+        context[7] = 1
+    elif t.lex.flags & (1 << IS_PUNCT):
+        context[7] = 2
+    elif t.lex.flags & (1 << LIKE_URL):
+        context[7] = 3
+    elif t.lex.flags & (1 << LIKE_NUM):
+        context[7] = 4
+    else:
+        context[7] = 0
+
+
 cdef class Tagger:
    """A part-of-speech tagger for English"""
    @classmethod
@ -105,7 +145,7 @@ cdef class Tagger:

    @classmethod
    def blank(cls, vocab, templates):
-        model = Model(vocab.morphology.n_tags, templates, model_loc=None)
+        model = TaggerModel(vocab.morphology.n_tags, templates)
        return cls(vocab, model)

    @classmethod
@ -114,10 +154,12 @@ cdef class Tagger:
            templates = json.loads(open(path.join(data_dir, 'templates.json')))
        else:
            templates = cls.default_templates()
-        model = Model(vocab.morphology.n_tags, templates, data_dir)
+        model = TaggerModel(vocab.morphology.n_tags, templates)
+        if path.exists(path.join(data_dir, 'model')):
+            model.load(path.join(data_dir, 'model'))
        return cls(vocab, model)

-    def __init__(self, Vocab vocab, model):
+    def __init__(self, Vocab vocab, TaggerModel model):
        self.vocab = vocab
        self.model = model
        
@ -131,27 +173,6 @@ cdef class Tagger:
    def tag_names(self):
        return self.vocab.morphology.tag_names

-    def __call__(self, Doc tokens):
-        """Apply the tagger, setting the POS tags onto the Doc object.
-
-        Args:
-            tokens (Doc): The tokens to be tagged.
-        """
-        if tokens.length == 0:
-            return 0
-
-        cdef Example eg = self.model._eg
-        cdef int i
-        for i in range(tokens.length):
-            if tokens.c[i].pos == 0:
-                eg.wipe()
-                fill_atoms(eg.c.atoms, tokens.c, i)
-                self.model(eg)
-                self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
-
-        tokens.is_tagged = True
-        tokens._py_tokens = [None] * tokens.length
-
    def __reduce__(self):
        return (self.__class__, (self.vocab, self.model), None, None)

@ -162,53 +183,45 @@ cdef class Tagger:
        tokens.is_tagged = True
        tokens._py_tokens = [None] * tokens.length

+    def __call__(self, Doc tokens):
+        """Apply the tagger, setting the POS tags onto the Doc object.
+
+        Args:
+            tokens (Doc): The tokens to be tagged.
+        """
+        if tokens.length == 0:
+            return 0
+
+        cdef Pool mem = Pool()
+        cdef ExampleC eg 
+
+        cdef int i, tag
+        for i in range(tokens.length):
+            if tokens.c[i].pos == 0:
+                eg = self.model.allocate(mem)
+                self.model.set_features(&eg, tokens.c, i)
+                self.model.set_prediction(&eg)
+                self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
+        tokens.is_tagged = True
+        tokens._py_tokens = [None] * tokens.length
+    
    def train(self, Doc tokens, object gold_tag_strs):
        assert len(tokens) == len(gold_tag_strs)
-        cdef int i
-        cdef int loss
-        cdef const weight_t* scores
-        try:
-            golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
-        except ValueError:
-            raise ValueError(
-                [g for g in gold_tag_strs if g is not None and g not in self.tag_names])
-        correct = 0
-        cdef Example eg = self.model._eg
+        golds = [self.tag_names.index(g) if g is not None else -1 for g in gold_tag_strs]
+        cdef int correct = 0
+        cdef Pool mem = Pool()
+        cdef ExampleC eg 
        for i in range(tokens.length):
-            eg.wipe()
-            fill_atoms(eg.c.atoms, tokens.c, i)
-            self.train(eg)
+            eg = self.model.allocate(mem)
+            self.model.set_features(&eg, tokens.c, i)
+            self.model.set_costs(&eg, golds[i])
+            self.model.set_prediction(&eg)
+            self.model.update(&eg)

-            self.vocab.morphology.assign_tag(&tokens.c[i], eg.c.guess)
+            self.vocab.morphology.assign_tag(&tokens.c[i], eg.guess)
            
-            correct += eg.c.cost == 0
+            correct += eg.cost == 0
            self.freqs[TAG][tokens.c[i].tag] += 1
+        tokens.is_tagged = True
+        tokens._py_tokens = [None] * tokens.length
        return correct
-
-
-cdef inline void fill_atoms(atom_t* atoms, const TokenC* tokens, int i) nogil:
-    _fill_from_token(&atoms[P2_orth], &tokens[i-2])
-    _fill_from_token(&atoms[P1_orth], &tokens[i-1])
-    _fill_from_token(&atoms[W_orth], &tokens[i])
-    _fill_from_token(&atoms[N1_orth], &tokens[i+1])
-    _fill_from_token(&atoms[N2_orth], &tokens[i+2])
-    
-
-cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil:
-    context[0] = t.lex.lower
-    context[1] = t.lex.cluster
-    context[2] = t.lex.shape
-    context[3] = t.lex.prefix
-    context[4] = t.lex.suffix
-    context[5] = t.tag
-    context[6] = t.lemma
-    if t.lex.flags & (1 << IS_ALPHA):
-        context[7] = 1
-    elif t.lex.flags & (1 << IS_PUNCT):
-        context[7] = 2
-    elif t.lex.flags & (1 << LIKE_URL):
-        context[7] = 3
-    elif t.lex.flags & (1 << LIKE_NUM):
-        context[7] = 4
-    else:
-        context[7] = 0
--- a/spacy/tests/test_basic_create.py
+++ b/spacy/tests/test_basic_create.py
@ -11,7 +11,6 @@ from spacy.strings import StringStore
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.syntax.arc_eager import ArcEager
-from spacy._ml import Model
 from spacy.tagger import Tagger
 from spacy.syntax.parser import Parser
 from spacy.matcher import Matcher
--- a/spacy/tests/test_basic_load.py
+++ b/spacy/tests/test_basic_load.py
@ -12,16 +12,13 @@ from spacy.strings import StringStore
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.syntax.arc_eager import ArcEager
-from spacy._ml import Model
 from spacy.tagger import Tagger
-from spacy.syntax.parser import Parser
+from spacy.syntax.parser import Parser, ParserModel
 from spacy.matcher import Matcher
 from spacy.syntax.parser import get_templates

 from spacy.en import English

-from thinc.learner import LinearModel
-

 class TestLoadVocab(unittest.TestCase):
    def test_load(self):
@ -54,7 +51,6 @@ class TestLoadParser(unittest.TestCase):
        if path.exists(path.join(data_dir, 'deps')):
            parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)

-    def test_load_careful(self):
        config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}

        data_dir = English.default_data_dir()
@ -63,20 +59,11 @@ class TestLoadParser(unittest.TestCase):
        moves = ArcEager(vocab.strings, config_data['labels'])
        templates = get_templates(config_data['features'])

-        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))
+        model = ParserModel(moves.n_moves, templates)
+        model.load(path.join(data_dir, 'deps', 'model'))

        parser = Parser(vocab.strings, moves, model)

-    def test_thinc_load(self):
-        data_dir = English.default_data_dir()
-        model_loc = path.join(data_dir, 'deps', 'model')
-
-        # n classes. moves.n_moves above
-        # n features. len(templates) + 1 above
-        if path.exists(model_loc):
-            model = LinearModel(92, 116)
-            model.load(model_loc)
-

 if __name__ == '__main__':
    unittest.main()