Merge branch 'master' of https://github.com/honnibal/spaCy into attrs

2025-12-22 17:43:13 +03:00 · 2015-10-13 04:52:27 +02:00 · 2015-10-13 04:52:27 +02:00 · b866f1443e
commit b866f1443e
parent 92f750cf8b 6c2da06c18
4 changed files with 167 additions and 2 deletions
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
@@ -134,7 +135,7 @@ def run_setup(exts):
    headers_workaround.install_headers('numpy')
-VERSION = '0.94'
+VERSION = '0.95'
 def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
--- a/spacy/en/download.py
+++ b/spacy/en/download.py
@@ -7,7 +7,7 @@ import wget
 import plac
 # TODO: Read this from the same source as the setup
-VERSION = '0.9.1'
+VERSION = '0.9.5'
 AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
--- a/tests/test_basic_create.py
+++ b/tests/test_basic_create.py
@@ -0,0 +1,89 @@
 """Some quick tests that don't depend on data files or on pytest, for debugging the
 MS windows build issues."""
 from __future__ import print_function, unicode_literals
 import unittest
 import re
 from spacy.lemmatizer import Lemmatizer
 from spacy.morphology import Morphology
 from spacy.strings import StringStore
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.syntax.arc_eager import ArcEager
 from spacy._ml import Model
 from spacy.tagger import Tagger
 from spacy.syntax.parser import Parser
 from spacy.matcher import Matcher
 class TestStringStore(unittest.TestCase):
    """Round-trip check: strings map to ids and ids map back to strings."""
    def test_encode_decode(self):
        store = StringStore()
        words = (u'Hello', u'World')
        # Look both words up first so each has an id before anything is asserted.
        ids = {}
        for word in words:
            ids[word] = store[word]
        self.assertNotEqual(ids[u'Hello'], ids[u'World'])
        # id -> string
        for word in words:
            self.assertEqual(store[ids[word]], word)
        # string -> id must return the id handed out above
        for word in words:
            self.assertEqual(store[word], ids[word])
 class TestMorphology(unittest.TestCase):
    """Smoke test: a Morphology can be constructed without any data files."""
    def test_create(self):
        # Construction succeeding is the whole check; nothing is asserted.
        strings = StringStore()
        # Three empty dicts stand in for the lemmatizer's tables
        # (presumably index / exceptions / rules -- confirm against Lemmatizer).
        lemmatizer = Lemmatizer({}, {}, {})
        # The empty dict is passed where a tag map is presumably expected.
        morphology = Morphology(strings, {}, lemmatizer)
 class TestVocab(unittest.TestCase):
    """Checks on a Vocab built with no arguments."""
    def test_create(self):
        # Constructing the object without an error is the whole test.
        Vocab()
    def test_get_lexeme(self):
        vocabulary = Vocab()
        entry = vocabulary[u'Hello']
        # The looked-up entry must report the text it was requested with.
        self.assertEqual(entry.orth_, u'Hello')
 class TestTokenizer(unittest.TestCase):
    """Tokenizer built by hand from an empty Vocab and placeholder patterns."""
    def test_create(self):
        vocabulary = Vocab()
        # One throwaway pattern is reused for all three regex arguments.
        placeholder = re.compile(r'sklfb;s')
        tokenize = Tokenizer(vocabulary, {}, placeholder, placeholder, placeholder)
        tokens = tokenize(u'I am a document.')
        self.assertEqual(len(tokens), 4)
 class TestTagger(unittest.TestCase):
    """Smoke test: a Tagger can be built around a model with no saved weights."""
    def test_create(self):
        vocabulary = Vocab()
        # model_loc=None: no model file is supplied.
        untrained = Model(vocabulary.morphology.n_tags, ((1,),), model_loc=None)
        Tagger(vocabulary, untrained)
 class TestParser(unittest.TestCase):
    """Smoke test: a Parser can be assembled from hand-made parts."""
    def test_create(self):
        vocabulary = Vocab()
        action_labels = {0: ['One', 'Two'], 1: ['Two', 'Three']}
        moves = ArcEager(vocabulary.strings, action_labels)
        # NOTE(review): the class count comes from the tag set, not from the
        # transition system -- confirm this is intended.
        untrained = Model(vocabulary.morphology.n_tags, ((1,),), model_loc=None)
        Parser(vocabulary.strings, moves, untrained)
 class TestMatcher(unittest.TestCase):
    """Smoke test: a Matcher can be built with an empty pattern dict."""
    def test_create(self):
        # Construction succeeding is the whole check.
        Matcher(Vocab(), {})
 # Allow running this file directly as a script, without pytest.
 if __name__ == '__main__':
    unittest.main()
--- a/tests/test_basic_load.py
+++ b/tests/test_basic_load.py
@@ -0,0 +1,75 @@
 """Some quick tests that load the installed English data files but don't depend on
 pytest, for debugging the MS Windows build issues."""
 from __future__ import print_function, unicode_literals
 import unittest
 import re
 from os import path
 from spacy.lemmatizer import Lemmatizer
 from spacy.morphology import Morphology
 from spacy.strings import StringStore
 from spacy.vocab import Vocab
 from spacy.tokenizer import Tokenizer
 from spacy.syntax.arc_eager import ArcEager
 from spacy._ml import Model
 from spacy.tagger import Tagger
 from spacy.syntax.parser import Parser
 from spacy.matcher import Matcher
 from spacy.syntax.parser import get_templates
 from spacy.en import English
 from thinc.learner import LinearModel
 class TestLoadVocab(unittest.TestCase):
    """Smoke test: the Vocab loads from the default English data directory."""
    def test_load(self):
        vocab_dir = path.join(English.default_data_dir(), 'vocab')
        # Loading without an error is the whole check.
        Vocab.from_dir(vocab_dir)
 class TestLoadTokenizer(unittest.TestCase):
    """Smoke test: the Tokenizer loads from the default English data directory."""
    def test_load(self):
        root = English.default_data_dir()
        vocabulary = Vocab.from_dir(path.join(root, 'vocab'))
        # Loading without an error is the whole check.
        Tokenizer.from_dir(vocabulary, path.join(root, 'tokenizer'))
 class TestLoadTagger(unittest.TestCase):
    """Smoke test: the Tagger loads from the default English data directory."""
    def test_load(self):
        root = English.default_data_dir()
        vocabulary = Vocab.from_dir(path.join(root, 'vocab'))
        # Loading without an error is the whole check.
        Tagger.from_dir(path.join(root, 'tagger'), vocabulary)
 class TestLoadParser(unittest.TestCase):
    """Load the dependency parser three ways, from most to least packaged.

    None of the tests assert anything; each passes if loading does not raise.
    """
    def test_load(self):
        # One-call path: Parser.from_dir on the 'deps' data directory.
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        parser = Parser.from_dir(path.join(data_dir, 'deps'), vocab.strings, ArcEager)
    def test_load_careful(self):
        # Step-by-step path: build the transition system, feature templates and
        # model by hand, then assemble the Parser from them.
        # NOTE(review): config_data presumably mirrors the config shipped with
        # the parser model -- confirm against the data files.
        config_data = {"labels": {"0": {"": True}, "1": {"": True}, "2": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "dobj": True, "neg": True, "csubjpass": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "relcl": True, "quantmod": True, "acomp": True, "compound": True, "pcomp": True, "intj": True, "poss": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "amod": True, "dative": True, "pobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True, "acl": True}, "3": {"cc": True, "agent": True, "ccomp": True, "prt": True, "meta": True, "nsubjpass": True, "csubj": True, "conj": True, "acl": True, "poss": True, "neg": True, "mark": True, "auxpass": True, "advcl": True, "aux": True, "amod": True, "ROOT": True, "prep": True, "parataxis": True, "xcomp": True, "nsubj": True, "nummod": True, "advmod": True, "punct": True, "quantmod": True, "acomp": True, "pcomp": True, "intj": True, "relcl": True, "npadvmod": True, "case": True, "attr": True, "dep": True, "appos": True, "det": True, "nmod": True, "dobj": True, "dative": True, "pobj": True, "iobj": True, "expl": True, "predet": True, "preconj": True, "oprd": True}, "4": {"ROOT": True}}, "seed": 0, "features": "basic", "beam_width": 1}
        data_dir = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(data_dir, 'vocab'))
        moves = ArcEager(vocab.strings, config_data['labels'])
        templates = get_templates(config_data['features'])
        # The model gets one class per parser move and reads from 'deps'.
        model = Model(moves.n_moves, templates, path.join(data_dir, 'deps'))
        parser = Parser(vocab.strings, moves, model)
    def test_thinc_load(self):
        # Lowest-level path: load the model file straight into thinc's
        # LinearModel, bypassing the spaCy wrappers.
        data_dir = English.default_data_dir()
        model_loc = path.join(data_dir, 'deps', 'model')
        # n classes. moves.n_moves above
        # n features. len(templates) + 1 above
        # NOTE(review): 92 and 116 are hard-coded; per the two comments above
        # they must match the shipped model -- confirm when the data changes.
        model = LinearModel(92, 116)
        model.load(model_loc)
 # Allow running this file directly as a script via unittest.
 if __name__ == '__main__':
    unittest.main()