mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
Merge branch 'master' of https://github.com/honnibal/spaCy into attrs
This commit is contained in:
commit
b866f1443e
3
setup.py
3
setup.py
|
@ -92,6 +92,7 @@ def cython_setup(mod_names, language, includes, compile_args, link_args):
|
||||||
package_data={"spacy": ["*.pxd"],
|
package_data={"spacy": ["*.pxd"],
|
||||||
"spacy.en": ["*.pxd", "data/pos/*",
|
"spacy.en": ["*.pxd", "data/pos/*",
|
||||||
"data/wordnet/*", "data/tokenizer/*",
|
"data/wordnet/*", "data/tokenizer/*",
|
||||||
|
"data/vocab/tag_map.json",
|
||||||
"data/vocab/lexemes.bin",
|
"data/vocab/lexemes.bin",
|
||||||
"data/vocab/strings.txt"],
|
"data/vocab/strings.txt"],
|
||||||
"spacy.syntax": ["*.pxd"]},
|
"spacy.syntax": ["*.pxd"]},
|
||||||
|
@ -134,7 +135,7 @@ def run_setup(exts):
|
||||||
headers_workaround.install_headers('numpy')
|
headers_workaround.install_headers('numpy')
|
||||||
|
|
||||||
|
|
||||||
VERSION = '0.94'
|
VERSION = '0.95'
|
||||||
def main(modules, is_pypy):
|
def main(modules, is_pypy):
|
||||||
language = "cpp"
|
language = "cpp"
|
||||||
includes = ['.', path.join(sys.prefix, 'include')]
|
includes = ['.', path.join(sys.prefix, 'include')]
|
||||||
|
|
|
@ -7,7 +7,7 @@ import wget
|
||||||
import plac
|
import plac
|
||||||
|
|
||||||
# TODO: Read this from the same source as the setup
|
# TODO: Read this from the same source as the setup
|
||||||
VERSION = '0.9.1'
|
VERSION = '0.9.5'
|
||||||
|
|
||||||
AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
|
AWS_STORE = 'https://s3-us-west-1.amazonaws.com/media.spacynlp.com'
|
||||||
|
|
||||||
|
|
89
tests/test_basic_create.py
Normal file
89
tests/test_basic_create.py
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
"""Some quick tests that don't depend on data files or on pytest, for debugging the
|
||||||
|
MS windows build issues."""
|
||||||
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import re
|
||||||
|
|
||||||
|
from spacy.lemmatizer import Lemmatizer
|
||||||
|
from spacy.morphology import Morphology
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.syntax.arc_eager import ArcEager
|
||||||
|
from spacy._ml import Model
|
||||||
|
from spacy.tagger import Tagger
|
||||||
|
from spacy.syntax.parser import Parser
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
|
||||||
|
|
||||||
|
class TestStringStore(unittest.TestCase):
    """Round-trip checks for indexing a ``StringStore`` by string and by id."""

    def test_encode_decode(self):
        """Two distinct strings get distinct ids, and lookups work both ways."""
        store = StringStore()
        # Look the strings up in a fixed order and remember the id each one gets.
        pairs = [(text, store[text]) for text in (u'Hello', u'World')]
        (_, hello_id), (_, world_id) = pairs

        self.assertNotEqual(hello_id, world_id)

        # id -> string
        for text, key in pairs:
            self.assertEqual(store[key], text)

        # string -> id gives back the id obtained on the first lookup
        for text, key in pairs:
            self.assertEqual(store[text], key)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMorphology(unittest.TestCase):
    """Smoke test: a ``Morphology`` can be constructed from empty inputs."""

    def test_create(self):
        """Constructing ``Morphology`` with empty dict arguments must not raise.

        The test passes as long as none of the constructors throws; nothing
        about the resulting object is inspected.
        """
        strings = StringStore()
        # Built exactly once: the original created an identical Lemmatizer
        # twice and threw the first one away.
        lemmatizer = Lemmatizer({}, {}, {})
        Morphology(strings, {}, lemmatizer)
|
||||||
|
|
||||||
|
|
||||||
|
class TestVocab(unittest.TestCase):
    """Checks on a ``Vocab`` built with no arguments."""

    def test_create(self):
        """Constructing a ``Vocab`` with no arguments must not raise."""
        empty_vocab = Vocab()

    def test_get_lexeme(self):
        """Indexing a fresh ``Vocab`` by a string yields an entry whose ``orth_`` is that string."""
        word = u'Hello'
        empty_vocab = Vocab()
        entry = empty_vocab[word]
        self.assertEqual(entry.orth_, word)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTokenizer(unittest.TestCase):
    """Smoke test for a ``Tokenizer`` assembled in memory."""

    def test_create(self):
        """Tokenizing a four-word sentence with placeholder regexes gives four tokens."""
        # The same placeholder pattern fills all three regex slots; it does
        # not occur anywhere in the sentence tokenized below.
        placeholder = re.compile(r'sklfb;s')
        tokenize = Tokenizer(Vocab(), {}, placeholder, placeholder, placeholder)

        tokens = tokenize(u'I am a document.')

        # presumably split on whitespace only, leaving "document." as one token -- confirm
        self.assertEqual(len(tokens), 4)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTagger(unittest.TestCase):
    """Smoke test: a ``Tagger`` can be constructed without a model file."""

    def test_create(self):
        """Build a ``Model`` with ``model_loc=None`` and wrap it in a ``Tagger``."""
        vocab = Vocab()
        n_classes = vocab.morphology.n_tags
        # A single one-element feature template.
        untrained = Model(n_classes, ((1,),), model_loc=None)
        Tagger(vocab, untrained)
|
||||||
|
|
||||||
|
|
||||||
|
class TestParser(unittest.TestCase):
    """Smoke test: a ``Parser`` can be assembled from in-memory components."""

    def test_create(self):
        """Build an ``ArcEager`` system, a ``Model`` and a ``Parser``; none may raise."""
        vocab = Vocab()
        labels_by_action = {0: ['One', 'Two'], 1: ['Two', 'Three']}
        moves = ArcEager(vocab.strings, labels_by_action)

        # NOTE(review): the class count passed here is the vocab's n_tags, not
        # anything taken from the transition system -- presumably acceptable
        # for a construction-only check; confirm.
        untrained = Model(vocab.morphology.n_tags, ((1,),), model_loc=None)

        Parser(vocab.strings, moves, untrained)
|
||||||
|
|
||||||
|
|
||||||
|
class TestMatcher(unittest.TestCase):
    """Smoke test: a ``Matcher`` can be constructed."""

    def test_create(self):
        """Constructing a ``Matcher`` from a fresh ``Vocab`` and an empty dict must not raise."""
        empty_vocab = Vocab()
        Matcher(empty_vocab, {})
|
||||||
|
|
||||||
|
|
||||||
|
# Let the file be executed directly as a script, so the tests run via unittest.
if __name__ == "__main__":
    unittest.main()
|
75
tests/test_basic_load.py
Normal file
75
tests/test_basic_load.py
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
"""Some quick tests that don't depend on data files or on pytest, for debugging the
|
||||||
|
MS windows build issues."""
|
||||||
|
from __future__ import print_function, unicode_literals
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import re
|
||||||
|
from os import path
|
||||||
|
|
||||||
|
from spacy.lemmatizer import Lemmatizer
|
||||||
|
from spacy.morphology import Morphology
|
||||||
|
from spacy.strings import StringStore
|
||||||
|
from spacy.vocab import Vocab
|
||||||
|
from spacy.tokenizer import Tokenizer
|
||||||
|
from spacy.syntax.arc_eager import ArcEager
|
||||||
|
from spacy._ml import Model
|
||||||
|
from spacy.tagger import Tagger
|
||||||
|
from spacy.syntax.parser import Parser
|
||||||
|
from spacy.matcher import Matcher
|
||||||
|
from spacy.syntax.parser import get_templates
|
||||||
|
|
||||||
|
from spacy.en import English
|
||||||
|
|
||||||
|
from thinc.learner import LinearModel
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoadVocab(unittest.TestCase):
    """Smoke test: the English vocab loads from the default data directory."""

    def test_load(self):
        """``Vocab.from_dir`` on the ``vocab`` subdirectory must not raise."""
        vocab_dir = path.join(English.default_data_dir(), 'vocab')
        Vocab.from_dir(vocab_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoadTokenizer(unittest.TestCase):
    """Smoke test: the English tokenizer loads from the default data directory."""

    def test_load(self):
        """Load the vocab, then ``Tokenizer.from_dir`` on ``tokenizer``; neither may raise."""
        root = English.default_data_dir()
        vocab_dir = path.join(root, 'vocab')
        tokenizer_dir = path.join(root, 'tokenizer')

        vocab = Vocab.from_dir(vocab_dir)
        Tokenizer.from_dir(vocab, tokenizer_dir)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoadTagger(unittest.TestCase):
    """Smoke test: the English tagger loads from the default data directory."""

    def test_load(self):
        """Load the vocab, then ``Tagger.from_dir`` on ``tagger``; neither may raise."""
        root = English.default_data_dir()
        vocab_dir = path.join(root, 'vocab')
        tagger_dir = path.join(root, 'tagger')

        vocab = Vocab.from_dir(vocab_dir)
        # Note the argument order: directory first, vocab second.
        Tagger.from_dir(tagger_dir, vocab)
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoadParser(unittest.TestCase):
    """Load the English dependency parser in three progressively lower-level ways."""

    def test_load(self):
        """``Parser.from_dir`` on the ``deps`` subdirectory must not raise."""
        root = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(root, 'vocab'))
        Parser.from_dir(path.join(root, 'deps'), vocab.strings, ArcEager)

    def test_load_careful(self):
        """Assemble the parser step by step from an inline config instead of ``from_dir``."""
        # Label names for actions "2" and "3", in the order the config lists them.
        labels_for_2 = [
            "cc", "agent", "ccomp", "prt", "meta", "nsubjpass", "csubj",
            "conj", "dobj", "neg", "csubjpass", "mark", "auxpass", "advcl",
            "aux", "ROOT", "prep", "parataxis", "xcomp", "nsubj", "nummod",
            "advmod", "punct", "relcl", "quantmod", "acomp", "compound",
            "pcomp", "intj", "poss", "npadvmod", "case", "attr", "dep",
            "appos", "det", "nmod", "amod", "dative", "pobj", "expl",
            "predet", "preconj", "oprd", "acl",
        ]
        labels_for_3 = [
            "cc", "agent", "ccomp", "prt", "meta", "nsubjpass", "csubj",
            "conj", "acl", "poss", "neg", "mark", "auxpass", "advcl", "aux",
            "amod", "ROOT", "prep", "parataxis", "xcomp", "nsubj", "nummod",
            "advmod", "punct", "quantmod", "acomp", "pcomp", "intj", "relcl",
            "npadvmod", "case", "attr", "dep", "appos", "det", "nmod", "dobj",
            "dative", "pobj", "iobj", "expl", "predet", "preconj", "oprd",
        ]
        config_data = {
            "labels": {
                "0": {"": True},
                "1": {"": True},
                "2": dict.fromkeys(labels_for_2, True),
                "3": dict.fromkeys(labels_for_3, True),
                "4": {"ROOT": True},
            },
            "seed": 0,
            "features": "basic",
            "beam_width": 1,
        }

        root = English.default_data_dir()
        vocab = Vocab.from_dir(path.join(root, 'vocab'))

        transition_system = ArcEager(vocab.strings, config_data['labels'])
        feature_templates = get_templates(config_data['features'])

        deps_dir = path.join(root, 'deps')
        model = Model(transition_system.n_moves, feature_templates, deps_dir)

        Parser(vocab.strings, transition_system, model)

    def test_thinc_load(self):
        """Load the ``deps/model`` file directly into a thinc ``LinearModel``."""
        model_loc = path.join(English.default_data_dir(), 'deps', 'model')

        # n classes. moves.n_moves in test_load_careful
        n_classes = 92
        # n features. len(templates) + 1 in test_load_careful
        n_features = 116

        linear = LinearModel(n_classes, n_features)
        linear.load(model_loc)
|
||||||
|
|
||||||
|
|
||||||
|
# Let the file be executed directly as a script, so the tests run via unittest.
if __name__ == "__main__":
    unittest.main()
|
Loading…
Reference in New Issue
Block a user