From 9027cef3bc2ee7500fc3d2cf8700017c6b170e95 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Mon, 7 Dec 2015 06:01:28 +0100 Subject: [PATCH 01/13] access model via sputnik --- requirements.txt | 2 +- setup.py | 2 +- spacy/de/__init__.py | 4 +- spacy/en/__init__.py | 6 -- spacy/fi/__init__.py | 4 +- spacy/it/__init__.py | 4 +- spacy/language.py | 74 ++++---------- spacy/lemmatizer.py | 31 +++--- spacy/matcher.pyx | 13 +-- spacy/tagger.pyx | 16 +-- spacy/tests/conftest.py | 7 +- spacy/tests/serialize/test_packer.py | 1 - spacy/tests/spans/conftest.py | 5 +- spacy/tests/tagger/test_lemmatizer.py | 28 ++--- spacy/tests/tokens/test_token_references.py | 7 +- spacy/tests/website/conftest.py | 5 +- spacy/tests/website/test_home.py | 5 +- spacy/tokenizer.pyx | 4 +- spacy/util.py | 107 +++++++++++--------- spacy/vocab.pyx | 35 ++++--- 20 files changed, 161 insertions(+), 199 deletions(-) diff --git a/requirements.txt b/requirements.txt index 344cc7665..7565025f2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik == 0.5.2 +sputnik == 0.6.0 diff --git a/setup.py b/setup.py index 8e3e72087..6a38ef918 100644 --- a/setup.py +++ b/setup.py @@ -179,7 +179,7 @@ def run_setup(exts): license="MIT", install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44', 'thinc == 4.0.0', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik == 0.5.2'], + 'ujson', 'cloudpickle', 'sputnik == 0.6.0'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, ) diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py index 262fbe289..d7cc3dc65 100644 --- a/spacy/de/__init__.py +++ b/spacy/de/__init__.py @@ -6,6 +6,4 @@ from ..language import Language class German(Language): - @classmethod - def default_data_dir(cls): - return path.join(path.dirname(__file__), 'data') + pass diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 4d057db20..309deae41 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -4,8 +4,6 @@ from os import path from ..language import Language -LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data') - # improved list from Stone, Denis, Kwantes (2010) STOPWORDS = """ @@ -35,10 +33,6 @@ your yours yourself yourselves STOPWORDS = set(w for w in STOPWORDS.split() if w) class English(Language): - @classmethod - def default_data_dir(cls): - return LOCAL_DATA_DIR - @staticmethod def is_stop(string): return 1 if string.lower() in STOPWORDS else 0 diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py index 8e7173767..adba064a5 100644 --- a/spacy/fi/__init__.py +++ b/spacy/fi/__init__.py @@ -6,6 +6,4 @@ from ..language import Language class Finnish(Language): - @classmethod - def default_data_dir(cls): - return path.join(path.dirname(__file__), 'data') + pass diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py index a494de41f..6a824fe82 100644 --- a/spacy/it/__init__.py +++ b/spacy/it/__init__.py @@ -6,6 +6,4 @@ from ..language import Language class Italian(Language): - @classmethod - def default_data_dir(cls): - return path.join(path.dirname(__file__), 'data') + pass diff --git a/spacy/language.py b/spacy/language.py index f598518e2..2dce0ea0d 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,6 +20,7 @@ from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD +from .util import default_package class Language(object): @@ -100,7 +101,7 @@ class Language(object): return 0 
@classmethod - def default_lex_attrs(cls, data_dir=None): + def default_lex_attrs(cls): return { attrs.LOWER: cls.lower, attrs.NORM: cls.norm, @@ -134,73 +135,42 @@ class Language(object): return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} @classmethod - def default_data_dir(cls): - return path.join(path.dirname(__file__), 'data') - - @classmethod - def default_vocab(cls, data_dir=None, get_lex_attr=None): - if data_dir is None: - data_dir = cls.default_data_dir() + def default_vocab(cls, package=None, get_lex_attr=None): + if package is None: + package = default_package() if get_lex_attr is None: - get_lex_attr = cls.default_lex_attrs(data_dir) - return Vocab.from_dir( - path.join(data_dir, 'vocab'), - get_lex_attr=get_lex_attr) + get_lex_attr = cls.default_lex_attrs() + return Vocab.from_package(package, get_lex_attr=get_lex_attr) @classmethod - def default_tokenizer(cls, vocab, data_dir): - if path.exists(data_dir): - return Tokenizer.from_dir(vocab, data_dir) - else: - return Tokenizer(vocab, {}, None, None, None) + def default_parser(cls, package, vocab): + data_dir = package.dir_path('data', 'deps') + return Parser.from_dir(data_dir, vocab.strings, ArcEager) @classmethod - def default_tagger(cls, vocab, data_dir): - if path.exists(data_dir): - return Tagger.from_dir(data_dir, vocab) - else: - return None + def default_entity(cls, package, vocab): + data_dir = package.dir_path('data', 'ner') + return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) - @classmethod - def default_parser(cls, vocab, data_dir): - if path.exists(data_dir): - return Parser.from_dir(data_dir, vocab.strings, ArcEager) - else: - return None - - @classmethod - def default_entity(cls, vocab, data_dir): - if path.exists(data_dir): - return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) - else: - return None - - @classmethod - def default_matcher(cls, vocab, data_dir): - if path.exists(data_dir): - return Matcher.from_dir(data_dir, vocab) - else: - return None - - def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None, + def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None, serializer=None, load_vectors=True): if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) - if data_dir in (None, True): - data_dir = self.default_data_dir() + if package in (None, True): + package = default_package() if vocab in (None, True): - vocab = self.default_vocab(data_dir) + vocab = self.default_vocab(package) if tokenizer in (None, True): - tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer')) + tokenizer = Tokenizer.from_package(package, vocab) if tagger in (None, True): - tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos')) + tagger = Tagger.from_package(package, vocab) if entity in (None, True): - entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner')) + entity = self.default_entity(package, vocab) if parser in (None, True): - parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps')) + parser = self.default_parser(package, vocab) if matcher in (None, True): - matcher = self.default_matcher(vocab, data_dir=data_dir) + matcher = Matcher.from_package(package, vocab) self.vocab = vocab self.tokenizer = tokenizer self.tagger = tagger diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 08e511f68..d7fdcf76b 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -12,16 +12,21 @@ from .parts_of_speech 
import NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @classmethod - def from_dir(cls, data_dir): + def from_package(cls, package): index = {} exc = {} for pos in ['adj', 'noun', 'verb']: - index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos)) - exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos)) - if path.exists(path.join(data_dir, 'vocab', 'lemma_rules.json')): - rules = json.load(codecs.open(path.join(data_dir, 'vocab', 'lemma_rules.json'), encoding='utf_8')) - else: - rules = {} + index[pos] = package.load_utf8(read_index, + 'data', 'wordnet', 'index.%s' % pos, + default=set()) # TODO: really optional? + exc[pos] = package.load_utf8(read_exc, + 'data', 'wordnet', '%s.exc' % pos, + default={}) # TODO: really optional? + + rules = package.load_utf8(json.load, + 'data', 'vocab', 'lemma_rules.json', + default={}) # TODO: really optional? + return cls(index, exc, rules) def __init__(self, index, exceptions, rules): @@ -70,11 +75,9 @@ def lemmatize(string, index, exceptions, rules): return set(forms) -def read_index(loc): +def read_index(fileobj): index = set() - if not path.exists(loc): - return index - for line in codecs.open(loc, 'r', 'utf8'): + for line in fileobj: if line.startswith(' '): continue pieces = line.split() @@ -84,11 +87,9 @@ def read_index(loc): return index -def read_exc(loc): +def read_exc(fileobj): exceptions = {} - if not path.exists(loc): - return exceptions - for line in codecs.open(loc, 'r', 'utf8'): + for line in fileobj: if line.startswith(' '): continue pieces = line.split() diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 1fa91fab1..4319d593b 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -169,14 +169,11 @@ cdef class Matcher: cdef object _patterns @classmethod - def from_dir(cls, data_dir, Vocab vocab): - patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json') - if path.exists(patterns_loc): - patterns_data = open(patterns_loc).read() - patterns = json.loads(patterns_data) - return cls(vocab, patterns) - else: - return cls(vocab, {}) + def from_package(cls, package, Vocab vocab): + patterns = package.load_utf8(json.load, + 'data', 'vocab', 'gazetteer.json', + default={}) # TODO: really optional? + return cls(vocab, patterns) def __init__(self, vocab, patterns): self.vocab = vocab diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 421d97357..91f574348 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -146,15 +146,17 @@ cdef class Tagger: return cls(vocab, model) @classmethod - def from_dir(cls, data_dir, vocab): - if path.exists(path.join(data_dir, 'templates.json')): - templates = json.loads(open(path.join(data_dir, 'templates.json'))) - else: - templates = cls.default_templates() + def from_package(cls, package, vocab): + # TODO: templates.json deprecated? not present in latest package + templates = package.load_utf8(json.load, + 'data', 'pos', 'templates.json', + default=cls.default_templates()) + model = TaggerModel(vocab.morphology.n_tags, ConjunctionExtracter(N_CONTEXT_FIELDS, templates)) - if path.exists(path.join(data_dir, 'model')): - model.load(path.join(data_dir, 'model')) + + model.load(package.file_path('data', 'pos', 'model', require=False)) # TODO: really optional? 
+ return cls(vocab, model) def __init__(self, Vocab vocab, TaggerModel model): diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 6aee2c31e..03e728a12 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -1,12 +1,11 @@ +from spacy.en import English + import pytest -from spacy.en import English, LOCAL_DATA_DIR -import os @pytest.fixture(scope="session") def EN(): - data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) - return English(data_dir=data_dir) + return English() def pytest_addoption(parser): diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index e0d24208a..0e13b2de5 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -10,7 +10,6 @@ from spacy.en import English from spacy.vocab import Vocab from spacy.tokens.doc import Doc from spacy.tokenizer import Tokenizer -from spacy.en import LOCAL_DATA_DIR from os import path from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD diff --git a/spacy/tests/spans/conftest.py b/spacy/tests/spans/conftest.py index f63816e05..d8a0a4cb9 100644 --- a/spacy/tests/spans/conftest.py +++ b/spacy/tests/spans/conftest.py @@ -1,9 +1,8 @@ import pytest -from spacy.en import English, LOCAL_DATA_DIR +from spacy.en import English import os @pytest.fixture(scope="session") def en_nlp(): - data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) - return English(data_dir=data_dir) + return English() diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 91aa8ee65..708594299 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -4,31 +4,33 @@ import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.en import LOCAL_DATA_DIR -from os import path +from spacy.util import default_package import pytest -def test_read_index(): - wn = path.join(LOCAL_DATA_DIR, 'wordnet') - index = read_index(path.join(wn, 'index.noun')) +@pytest.fixture +def package(): + return default_package() + + +@pytest.fixture +def lemmatizer(package): + return Lemmatizer.from_package(package) + + +def test_read_index(package): + index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun') assert 'man' in index assert 'plantes' not in index assert 'plant' in index -def test_read_exc(): - wn = path.join(LOCAL_DATA_DIR, 'wordnet') - exc = read_exc(path.join(wn, 'verb.exc')) +def test_read_exc(package): + exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc') assert exc['was'] == ('be',) -@pytest.fixture -def lemmatizer(): - return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR)) - - def test_noun_lemmas(lemmatizer): do = lemmatizer.noun diff --git a/spacy/tests/tokens/test_token_references.py b/spacy/tests/tokens/test_token_references.py index e8dbff6fe..24639f141 100644 --- a/spacy/tests/tokens/test_token_references.py +++ b/spacy/tests/tokens/test_token_references.py @@ -2,16 +2,15 @@ from __future__ import unicode_literals import pytest import gc -from spacy.en import English, LOCAL_DATA_DIR +from spacy.en import English import os -data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) # Let this have its own instances, as we have to be careful about memory here # that's the point, after all @pytest.mark.models def get_orphan_token(text, i): - nlp = English(data_dir=data_dir) + nlp = English() tokens = nlp(text) gc.collect() token = tokens[i] @@ -41,7 +40,7 @@ def _orphan_from_list(toks): @pytest.mark.models def test_list_orphans(): # Test 
case from NSchrading - nlp = English(data_dir=data_dir) + nlp = English() samples = ["a", "test blah wat okay"] lst = [] for sample in samples: diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py index 35c38d845..b4934d20b 100644 --- a/spacy/tests/website/conftest.py +++ b/spacy/tests/website/conftest.py @@ -5,9 +5,8 @@ import os @pytest.fixture(scope='session') def nlp(): - from spacy.en import English, LOCAL_DATA_DIR - data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) - return English(data_dir=data_dir) + from spacy.en import English + return English() @pytest.fixture() diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py index d03acf855..3d9c8aba9 100644 --- a/spacy/tests/website/test_home.py +++ b/spacy/tests/website/test_home.py @@ -10,9 +10,8 @@ def token(doc): def test_load_resources_and_process_text(): - from spacy.en import English, LOCAL_DATA_DIR - data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR) - nlp = English(data_dir=data_dir) + from spacy.en import English + nlp = English() doc = nlp('Hello, world. Here are two sentences.') diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 1cde1e76e..345734682 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -41,8 +41,8 @@ cdef class Tokenizer: return (self.__class__, args, None, None) @classmethod - def from_dir(cls, Vocab vocab, data_dir): - rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir) + def from_package(cls, package, Vocab vocab): + rules, prefix_re, suffix_re, infix_re = read_lang_data(package) prefix_re = re.compile(prefix_re) suffix_re = re.compile(suffix_re) infix_re = re.compile(infix_re) diff --git a/spacy/util.py b/spacy/util.py index 849a3e219..5592e64eb 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1,10 +1,23 @@ -from os import path +import os import io import json import re + +from sputnik import Sputnik + from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -DATA_DIR = path.join(path.dirname(__file__), '..', 'data') + +def default_package(): + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data')) + + sputnik = Sputnik('spacy', '0.99.0') # TODO: retrieve version + pool = sputnik.pool(data_path) + return pool.get('en_default') def normalize_slice(length, start, stop, step=None): @@ -31,67 +44,63 @@ def utf8open(loc, mode='r'): return io.open(loc, mode, encoding='utf8') -def read_lang_data(data_dir): - with open(path.join(data_dir, 'specials.json')) as file_: - tokenization = json.load(file_) - prefix = read_prefix(data_dir) - suffix = read_suffix(data_dir) - infix = read_infix(data_dir) +def read_lang_data(package): + tokenization = package.load_utf8(json.load, 'data', 'tokenizer', 'specials.json') + prefix = package.load_utf8(read_prefix, 'data', 'tokenizer', 'prefix.txt') + suffix = package.load_utf8(read_suffix, 'data', 'tokenizer', 'suffix.txt') + infix = package.load_utf8(read_infix, 'data', 'tokenizer', 'infix.txt') return tokenization, prefix, suffix, infix -def read_prefix(data_dir): - with utf8open(path.join(data_dir, 'prefix.txt')) as file_: - entries = file_.read().split('\n') - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) +def read_prefix(fileobj): + entries = fileobj.read().split('\n') + expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) return expression -def read_suffix(data_dir): - with utf8open(path.join(data_dir, 
'suffix.txt')) as file_: - entries = file_.read().split('\n') - expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) +def read_suffix(fileobj): + entries = fileobj.read().split('\n') + expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) return expression -def read_infix(data_dir): - with utf8open(path.join(data_dir, 'infix.txt')) as file_: - entries = file_.read().split('\n') - expression = '|'.join([piece for piece in entries if piece.strip()]) +def read_infix(fileobj): + entries = fileobj.read().split('\n') + expression = '|'.join([piece for piece in entries if piece.strip()]) return expression -def read_tokenization(lang): - loc = path.join(DATA_DIR, lang, 'tokenization') - entries = [] - seen = set() - with utf8open(loc) as file_: - for line in file_: - line = line.strip() - if line.startswith('#'): - continue - if not line: - continue - pieces = line.split() - chunk = pieces.pop(0) - assert chunk not in seen, chunk - seen.add(chunk) - entries.append((chunk, list(pieces))) - if chunk[0].isalpha() and chunk[0].islower(): - chunk = chunk[0].title() + chunk[1:] - pieces[0] = pieces[0][0].title() + pieces[0][1:] - seen.add(chunk) - entries.append((chunk, pieces)) - return entries +# def read_tokenization(lang): +# loc = path.join(DATA_DIR, lang, 'tokenization') +# entries = [] +# seen = set() +# with utf8open(loc) as file_: +# for line in file_: +# line = line.strip() +# if line.startswith('#'): +# continue +# if not line: +# continue +# pieces = line.split() +# chunk = pieces.pop(0) +# assert chunk not in seen, chunk +# seen.add(chunk) +# entries.append((chunk, list(pieces))) +# if chunk[0].isalpha() and chunk[0].islower(): +# chunk = chunk[0].title() + chunk[1:] +# pieces[0] = pieces[0][0].title() + pieces[0][1:] +# seen.add(chunk) +# entries.append((chunk, pieces)) +# return entries -def read_detoken_rules(lang): # Deprecated? - loc = path.join(DATA_DIR, lang, 'detokenize') - entries = [] - with utf8open(loc) as file_: - for line in file_: - entries.append(line.strip()) - return entries +# def read_detoken_rules(lang): # Deprecated? +# loc = path.join(DATA_DIR, lang, 'detokenize') +# entries = [] +# with utf8open(loc) as file_: +# for line in file_: +# entries.append(line.strip()) +# return entries def align_tokens(ref, indices): # Deprecated, surely? diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index 3817e7127..ac083d9bc 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -47,28 +47,27 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' @classmethod - def from_dir(cls, data_dir, get_lex_attr=None): - if not path.exists(data_dir): - raise IOError("Directory %s not found -- cannot load Vocab." % data_dir) - if not path.isdir(data_dir): - raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir) + def from_package(cls, package, get_lex_attr=None): + tag_map = package.load_utf8(json.load, + 'data', 'vocab', 'tag_map.json') + + lemmatizer = Lemmatizer.from_package(package) + + serializer_freqs = package.load_utf8(json.load, + 'data', 'vocab', 'serializer.json', + require=False) # TODO: really optional? 
- tag_map = json.load(open(path.join(data_dir, 'tag_map.json'))) - lemmatizer = Lemmatizer.from_dir(path.join(data_dir, '..')) - if path.exists(path.join(data_dir, 'serializer.json')): - serializer_freqs = json.load(open(path.join(data_dir, 'serializer.json'))) - else: - serializer_freqs = None cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - if path.exists(path.join(data_dir, 'strings.json')): - with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_: - self.strings.load(file_) - self.load_lexemes(path.join(data_dir, 'lexemes.bin')) - - if path.exists(path.join(data_dir, 'vec.bin')): - self.vectors_length = self.load_vectors_from_bin_loc(path.join(data_dir, 'vec.bin')) + if package.has_file('data', 'vocab', 'strings.json'): # TODO: really optional? + package.load_utf8(self.strings.load, 'data', 'vocab', 'strings.json') + self.load_lexemes(package.file_path('data', 'vocab', 'lexemes.bin')) + + if package.has_file('data', 'vocab', 'vec.bin'): # TODO: really optional? + self.vectors_length = self.load_vectors_from_bin_loc( + package.file_path('data', 'vocab', 'vec.bin')) + return self def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None): From 345dda6f53709ac15a5cb2f659fce1b5d1c13906 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Mon, 7 Dec 2015 06:50:26 +0100 Subject: [PATCH 02/13] small fixes, add package build step --- .travis.yml | 4 +++- package.json | 10 ++++++++++ spacy/language.py | 10 ++++++---- spacy/tagger.pyx | 3 ++- 4 files changed, 21 insertions(+), 6 deletions(-) create mode 100644 package.json diff --git a/.travis.yml b/.travis.yml index c90da84d2..2a0295156 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,7 +21,9 @@ install: - "mv WordNet-3.0 wordnet" - "cd ../../" - "export PYTHONPATH=`pwd`" - - "python bin/init_model.py en lang_data/ corpora/ spacy/en/data" + - "python bin/init_model.py en lang_data/ corpora/ data" + - "sputnik build ." 
+ - "sputnik install en_default-*.sputnik" # run tests script: diff --git a/package.json b/package.json new file mode 100644 index 000000000..563009699 --- /dev/null +++ b/package.json @@ -0,0 +1,10 @@ +{ + "name": "en_default", + "version": "0.99.0", + "description": "english default model", + "license": "public domain", + "include": ["data/**/*"], + "compatibility": { + "spacy": "==0.99.0" + } +} diff --git a/spacy/language.py b/spacy/language.py index 2dce0ea0d..83b91cdca 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -144,13 +144,15 @@ class Language(object): @classmethod def default_parser(cls, package, vocab): - data_dir = package.dir_path('data', 'deps') - return Parser.from_dir(data_dir, vocab.strings, ArcEager) + data_dir = package.dir_path('data', 'deps', require=False) + if data_dir and path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, ArcEager) @classmethod def default_entity(cls, package, vocab): - data_dir = package.dir_path('data', 'ner') - return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) + data_dir = package.dir_path('data', 'ner', require=False) + if data_dir and path.exists(data_dir): + return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None, parser=None, entity=None, matcher=None, serializer=None, diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 91f574348..1c345c6e8 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -155,7 +155,8 @@ cdef class Tagger: model = TaggerModel(vocab.morphology.n_tags, ConjunctionExtracter(N_CONTEXT_FIELDS, templates)) - model.load(package.file_path('data', 'pos', 'model', require=False)) # TODO: really optional? + if package.has_file('data', 'pos', 'model'): # TODO: really optional? + model.load(package.file_path('data', 'pos', 'model')) return cls(vocab, model) From 7e0757bc5ff174fb40fc5974d1bda50add7b0c19 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Mon, 7 Dec 2015 07:02:42 +0100 Subject: [PATCH 03/13] require newer sputnik version --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7565025f2..a66baefce 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik == 0.6.0 +sputnik == 0.6.1 diff --git a/setup.py b/setup.py index 6a38ef918..468d31c66 100644 --- a/setup.py +++ b/setup.py @@ -179,7 +179,7 @@ def run_setup(exts): license="MIT", install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44', 'thinc == 4.0.0', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik == 0.6.0'], + 'ujson', 'cloudpickle', 'sputnik == 0.6.1'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, ) From 3597804c7a4c29e5f40f118f737b073d8bae408a Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Wed, 9 Dec 2015 14:22:26 +0100 Subject: [PATCH 04/13] Update .appveyor.yml --- .appveyor.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 5415b8f4a..739f89819 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -60,8 +60,9 @@ build_script: - "%CMD_IN_ENV% python setup.py build_ext --inplace" - ps: appveyor\download.ps1 - "tar -xzf corpora/en/wordnet.tar.gz" - - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ spacy/en/data" - + - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data" + - "%CMD_IN_ENV% sputnik build ." 
+ - "%CMD_IN_ENV% sputnik install en_default-*.sputnik" test_script: # Run the project tests From 9cde3f37bf2c5371c482cd8b8c91cb1201b65e9d Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 15 Dec 2015 12:23:32 +0100 Subject: [PATCH 05/13] add newer sputnik version --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index a66baefce..ccf78f379 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik == 0.6.1 +sputnik == 0.6.2 diff --git a/setup.py b/setup.py index 468d31c66..3941a80ca 100644 --- a/setup.py +++ b/setup.py @@ -179,7 +179,7 @@ def run_setup(exts): license="MIT", install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44', 'thinc == 4.0.0', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik == 0.6.1'], + 'ujson', 'cloudpickle', 'sputnik == 0.6.2'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, ) From 55bd1469dc75da1ea1ed3e584967780faa7ebf6b Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Tue, 15 Dec 2015 15:34:27 +0100 Subject: [PATCH 06/13] add newer sputnik version --- .appveyor.yml | 4 ++-- .travis.yml | 4 ++-- requirements.txt | 2 +- setup.py | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index 739f89819..09beb72fc 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -61,8 +61,8 @@ build_script: - ps: appveyor\download.ps1 - "tar -xzf corpora/en/wordnet.tar.gz" - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data" - - "%CMD_IN_ENV% sputnik build ." - - "%CMD_IN_ENV% sputnik install en_default-*.sputnik" + - "%CMD_IN_ENV% sputnik build . en_default.sputnik" + - "%CMD_IN_ENV% sputnik install en_default.sputnik" test_script: # Run the project tests diff --git a/.travis.yml b/.travis.yml index 2a0295156..65171f169 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,8 +22,8 @@ install: - "cd ../../" - "export PYTHONPATH=`pwd`" - "python bin/init_model.py en lang_data/ corpora/ data" - - "sputnik build ." - - "sputnik install en_default-*.sputnik" + - "sputnik build . 
en_default.sputnik" + - "sputnik install en_default.sputnik" # run tests script: diff --git a/requirements.txt b/requirements.txt index ccf78f379..ef73a19d3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,4 +10,4 @@ plac six ujson cloudpickle -sputnik == 0.6.2 +sputnik == 0.6.3 diff --git a/setup.py b/setup.py index 3941a80ca..72ee3d28b 100644 --- a/setup.py +++ b/setup.py @@ -179,7 +179,7 @@ def run_setup(exts): license="MIT", install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44', 'thinc == 4.0.0', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle', 'sputnik == 0.6.2'], + 'ujson', 'cloudpickle', 'sputnik == 0.6.3'], setup_requires=["headers_workaround"], cmdclass = {'build_ext': build_ext_subclass }, ) From 970278a3d6420eb700b22aac6425688d691d2c8a Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 09:49:45 +0100 Subject: [PATCH 07/13] no need to link data dir anymore --- spacy/en/download.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/spacy/en/download.py b/spacy/en/download.py index 26d2b44be..8d3040311 100644 --- a/spacy/en/download.py +++ b/spacy/en/download.py @@ -8,19 +8,16 @@ from sputnik import Sputnik def migrate(path): data_path = os.path.join(path, 'data') - if os.path.isdir(data_path) and not os.path.islink(data_path): - shutil.rmtree(data_path) + if os.path.isdir(data_path): + if os.path.islink(data_path): + os.unlink(data_path) + else: + shutil.rmtree(data_path) for filename in os.listdir(path): if filename.endswith('.tgz'): os.unlink(os.path.join(path, filename)) -def link(package, path): - if os.path.exists(path): - os.unlink(path) - os.symlink(package.dir_path('data'), path) - - @plac.annotations( force=("Force overwrite", "flag", "f", bool), ) @@ -30,8 +27,12 @@ def main(data_size='all', force=False): path = os.path.dirname(os.path.abspath(__file__)) - command = sputnik.make_command( - data_path=os.path.abspath(os.path.join(path, '..', 'data')), + data_path = os.path.abspath(os.path.join(path, '..', 'data')) + if not os.path.isdir(data_path): + os.mkdir(data_path) + + command = sputnik.command( + data_path=data_path, repository_url='https://index.spacy.io') if force: @@ -42,9 +43,6 @@ def main(data_size='all', force=False): # FIXME clean up old-style packages migrate(path) - # FIXME supply spacy with an old-style data dir - link(package, os.path.join(path, 'data')) - if __name__ == '__main__': plac.call(main) From 8359bd4d93f7b9d1bf5b76adb45aa36bfbfbb499 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 09:52:55 +0100 Subject: [PATCH 08/13] strip data/ from package, friendlier Language invocation, make data_dir backward/forward-compatible --- spacy/en/__init__.py | 4 ++ spacy/language.py | 94 +++++++++++++++++++++------ spacy/lemmatizer.py | 6 +- spacy/matcher.pyx | 2 +- spacy/tagger.pyx | 11 ++-- spacy/tests/tagger/test_lemmatizer.py | 4 +- spacy/util.py | 28 ++++---- spacy/vocab.pyx | 14 ++-- 8 files changed, 112 insertions(+), 51 deletions(-) diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index 309deae41..17af520df 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -33,6 +33,10 @@ your yours yourself yourselves STOPWORDS = set(w for w in STOPWORDS.split() if w) class English(Language): + def __init__(self, **kwargs): + kwargs['lang'] = 'en' + super(English, self).__init__(**kwargs) + @staticmethod def is_stop(string): return 1 if string.lower() in STOPWORDS else 0 diff --git a/spacy/language.py b/spacy/language.py 
index 83b91cdca..7a96e12ea 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -20,7 +20,7 @@ from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD -from .util import default_package +from .util import get_package class Language(object): @@ -137,48 +137,100 @@ class Language(object): @classmethod def default_vocab(cls, package=None, get_lex_attr=None): if package is None: - package = default_package() + package = get_package() if get_lex_attr is None: get_lex_attr = cls.default_lex_attrs() return Vocab.from_package(package, get_lex_attr=get_lex_attr) @classmethod def default_parser(cls, package, vocab): - data_dir = package.dir_path('data', 'deps', require=False) + data_dir = package.dir_path('deps', require=False) if data_dir and path.exists(data_dir): return Parser.from_dir(data_dir, vocab.strings, ArcEager) @classmethod def default_entity(cls, package, vocab): - data_dir = package.dir_path('data', 'ner', require=False) + data_dir = package.dir_path('ner', require=False) if data_dir and path.exists(data_dir): return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown) - def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None, - parser=None, entity=None, matcher=None, serializer=None, - load_vectors=True): + def __init__(self, **kwargs): + """ + a model can be specified: + + 1) by a path to the model directory (DEPRECATED) + - Language(data_dir='path/to/data') + + 2) by a language identifier (and optionally a package root dir) + - Language(lang='en') + - Language(lang='en', data_dir='spacy/data') + + 3) by a model name/version (and optionally a package root dir) + - Language(model='en_default') + - Language(model='en_default', version='1.0.0') + - Language(model='en_default', version='1.0.0', data_dir='spacy/data') + """ + + data_dir = kwargs.pop('data_dir', None) + + lang = kwargs.pop('lang', None) + model = kwargs.pop('model', None) + version = kwargs.pop('version', None) + + vocab = kwargs.pop('vocab', None) + tokenizer = kwargs.pop('tokenizer', None) + tagger = kwargs.pop('tagger', None) + parser = kwargs.pop('parser', None) + entity = kwargs.pop('entity', None) + matcher = kwargs.pop('matcher', None) + serializer = kwargs.pop('serializer', None) + + load_vectors = kwargs.pop('load_vectors', True) + + # support non-package data dirs + if data_dir and path.exists(path.join(data_dir, 'vocab')): + class Package(object): + def __init__(self, root): + self.root = root + + def has_file(self, *path_parts): + return path.exists(path.join(self.root, *path_parts)) + + def file_path(self, *path_parts, **kwargs): + return path.join(self.root, *path_parts) + + def dir_path(self, *path_parts, **kwargs): + return path.join(self.root, *path_parts) + + def load_utf8(self, func, *path_parts, **kwargs): + with io.open(self.file_path(path.join(*path_parts)), + mode='r', encoding='utf8') as f: + return func(f) + + warn("using non-package data_dir", DeprecationWarning) + package = Package(data_dir) + else: + if model is None: + model = '%s_default' % (lang or 'en') + version = None + print(model, version) + package = get_package(name=model, version=version, + data_path=data_dir) + if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) - if package in (None, True): - package = default_package() if vocab in (None, True): - vocab = self.default_vocab(package) + self.vocab = self.default_vocab(package) if tokenizer in (None, True): - tokenizer = Tokenizer.from_package(package, 
vocab) + self.tokenizer = Tokenizer.from_package(package, self.vocab) if tagger in (None, True): - tagger = Tagger.from_package(package, vocab) + self.tagger = Tagger.from_package(package, self.vocab) if entity in (None, True): - entity = self.default_entity(package, vocab) + self.entity = self.default_entity(package, self.vocab) if parser in (None, True): - parser = self.default_parser(package, vocab) + self.parser = self.default_parser(package, self.vocab) if matcher in (None, True): - matcher = Matcher.from_package(package, vocab) - self.vocab = vocab - self.tokenizer = tokenizer - self.tagger = tagger - self.parser = parser - self.entity = entity - self.matcher = matcher + self.matcher = Matcher.from_package(package, self.vocab) def __reduce__(self): return (self.__class__, diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index d7fdcf76b..c5b9c1c50 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -17,14 +17,14 @@ class Lemmatizer(object): exc = {} for pos in ['adj', 'noun', 'verb']: index[pos] = package.load_utf8(read_index, - 'data', 'wordnet', 'index.%s' % pos, + 'wordnet', 'index.%s' % pos, default=set()) # TODO: really optional? exc[pos] = package.load_utf8(read_exc, - 'data', 'wordnet', '%s.exc' % pos, + 'wordnet', '%s.exc' % pos, default={}) # TODO: really optional? rules = package.load_utf8(json.load, - 'data', 'vocab', 'lemma_rules.json', + 'vocab', 'lemma_rules.json', default={}) # TODO: really optional? return cls(index, exc, rules) diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index 4319d593b..4d36b7742 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -171,7 +171,7 @@ cdef class Matcher: @classmethod def from_package(cls, package, Vocab vocab): patterns = package.load_utf8(json.load, - 'data', 'vocab', 'gazetteer.json', + 'vocab', 'gazetteer.json', default={}) # TODO: really optional? return cls(vocab, patterns) diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index 1c345c6e8..2c05b4a84 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -148,15 +148,16 @@ cdef class Tagger: @classmethod def from_package(cls, package, vocab): # TODO: templates.json deprecated? not present in latest package - templates = package.load_utf8(json.load, - 'data', 'pos', 'templates.json', - default=cls.default_templates()) + templates = cls.default_templates() + # templates = package.load_utf8(json.load, + # 'pos', 'templates.json', + # default=cls.default_templates()) model = TaggerModel(vocab.morphology.n_tags, ConjunctionExtracter(N_CONTEXT_FIELDS, templates)) - if package.has_file('data', 'pos', 'model'): # TODO: really optional? - model.load(package.file_path('data', 'pos', 'model')) + if package.has_file('pos', 'model'): # TODO: really optional? 
+ model.load(package.file_path('pos', 'model')) return cls(vocab, model) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 708594299..6950f010f 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -20,14 +20,14 @@ def lemmatizer(package): def test_read_index(package): - index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun') + index = package.load_utf8(read_index, 'wordnet', 'index.noun') assert 'man' in index assert 'plantes' not in index assert 'plant' in index def test_read_exc(package): - exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc') + exc = package.load_utf8(read_exc, 'wordnet', 'verb.exc') assert exc['was'] == ('be',) diff --git a/spacy/util.py b/spacy/util.py index 5592e64eb..69e3ba237 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,16 +8,20 @@ from sputnik import Sputnik from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def default_package(): - if os.environ.get('SPACY_DATA'): - data_path = os.environ.get('SPACY_DATA') - else: - data_path = os.path.abspath( - os.path.join(os.path.dirname(__file__), 'data')) +def get_package(name=None, version=None, data_path=None): + if data_path is None: + if os.environ.get('SPACY_DATA'): + data_path = os.environ.get('SPACY_DATA') + else: + data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), 'data')) - sputnik = Sputnik('spacy', '0.99.0') # TODO: retrieve version + sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version pool = sputnik.pool(data_path) - return pool.get('en_default') + + if version: + name += ' ==%s' % version + return pool.get(name) def normalize_slice(length, start, stop, step=None): @@ -45,10 +49,10 @@ def utf8open(loc, mode='r'): def read_lang_data(package): - tokenization = package.load_utf8(json.load, 'data', 'tokenizer', 'specials.json') - prefix = package.load_utf8(read_prefix, 'data', 'tokenizer', 'prefix.txt') - suffix = package.load_utf8(read_suffix, 'data', 'tokenizer', 'suffix.txt') - infix = package.load_utf8(read_infix, 'data', 'tokenizer', 'infix.txt') + tokenization = package.load_utf8(json.load, 'tokenizer', 'specials.json') + prefix = package.load_utf8(read_prefix, 'tokenizer', 'prefix.txt') + suffix = package.load_utf8(read_suffix, 'tokenizer', 'suffix.txt') + infix = package.load_utf8(read_infix, 'tokenizer', 'infix.txt') return tokenization, prefix, suffix, infix diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index ac083d9bc..bb0ae6173 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -49,24 +49,24 @@ cdef class Vocab: @classmethod def from_package(cls, package, get_lex_attr=None): tag_map = package.load_utf8(json.load, - 'data', 'vocab', 'tag_map.json') + 'vocab', 'tag_map.json') lemmatizer = Lemmatizer.from_package(package) serializer_freqs = package.load_utf8(json.load, - 'data', 'vocab', 'serializer.json', + 'vocab', 'serializer.json', require=False) # TODO: really optional? cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - if package.has_file('data', 'vocab', 'strings.json'): # TODO: really optional? - package.load_utf8(self.strings.load, 'data', 'vocab', 'strings.json') - self.load_lexemes(package.file_path('data', 'vocab', 'lexemes.bin')) + if package.has_file('vocab', 'strings.json'): # TODO: really optional? 
+ package.load_utf8(self.strings.load, 'vocab', 'strings.json') + self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) - if package.has_file('data', 'vocab', 'vec.bin'): # TODO: really optional? + if package.has_file('vocab', 'vec.bin'): # TODO: really optional? self.vectors_length = self.load_vectors_from_bin_loc( - package.file_path('data', 'vocab', 'vec.bin')) + package.file_path('vocab', 'vec.bin')) return self From cfa187aaf0a10fe124663b877d3551643631a660 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 10:58:02 +0100 Subject: [PATCH 09/13] fix tests --- spacy/language.py | 1 - spacy/tests/tagger/test_lemmatizer.py | 4 ++-- spacy/util.py | 2 ++ 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 7a96e12ea..cbbd942b5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -213,7 +213,6 @@ class Language(object): if model is None: model = '%s_default' % (lang or 'en') version = None - print(model, version) package = get_package(name=model, version=version, data_path=data_dir) diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index 6950f010f..e25fbe199 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -4,14 +4,14 @@ import io import pickle from spacy.lemmatizer import Lemmatizer, read_index, read_exc -from spacy.util import default_package +from spacy.util import get_package import pytest @pytest.fixture def package(): - return default_package() + return get_package() @pytest.fixture diff --git a/spacy/util.py b/spacy/util.py index 69e3ba237..be0b2f433 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -19,6 +19,8 @@ def get_package(name=None, version=None, data_path=None): sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version pool = sputnik.pool(data_path) + if name is None: + name = 'en_default' if version: name += ' ==%s' % version return pool.get(name) From 0434d9a085cb04b09e136c26d39c098efa84cf59 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 11:25:21 +0100 Subject: [PATCH 10/13] fix tests --- package.json | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/package.json b/package.json index 563009699..0710031ed 100644 --- a/package.json +++ b/package.json @@ -1,10 +1,17 @@ { "name": "en_default", - "version": "0.99.0", + "version": "0.100.0", "description": "english default model", "license": "public domain", - "include": ["data/**/*"], + "include": [ + "deps/*", + "ner/*", + "pos/*", + "tokenizer/*", + "vocab/*", + "wordnet/*" + ], "compatibility": { - "spacy": "==0.99.0" + "spacy": "==0.100.0" } } From 93bb6131fbd35d140063c1f8ec0c8966852146ed Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 11:34:50 +0100 Subject: [PATCH 11/13] fix tests --- .travis.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 65171f169..08ce5763f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -22,7 +22,8 @@ install: - "cd ../../" - "export PYTHONPATH=`pwd`" - "python bin/init_model.py en lang_data/ corpora/ data" - - "sputnik build . 
en_default.sputnik" + - "cp package.json data" + - "sputnik build data en_default.sputnik" - "sputnik install en_default.sputnik" # run tests From d1f46528ca94b62cbd70c6c19f1b1e2f6738dde0 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 11:42:23 +0100 Subject: [PATCH 12/13] fix appveyor setup --- .appveyor.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 09beb72fc..397e27b90 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -61,7 +61,8 @@ build_script: - ps: appveyor\download.ps1 - "tar -xzf corpora/en/wordnet.tar.gz" - "%CMD_IN_ENV% python bin/init_model.py en lang_data/ corpora/ data" - - "%CMD_IN_ENV% sputnik build . en_default.sputnik" + - "cp package.json data" + - "%CMD_IN_ENV% sputnik build data en_default.sputnik" - "%CMD_IN_ENV% sputnik install en_default.sputnik" test_script: From d8d348bb5586a2053d6d6ff89801b144a5ca6360 Mon Sep 17 00:00:00 2001 From: Henning Peters Date: Fri, 18 Dec 2015 19:12:08 +0100 Subject: [PATCH 13/13] allow to specify version constraint within model name --- spacy/language.py | 11 +++-------- spacy/util.py | 9 ++------- 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index cbbd942b5..2ea5eb162 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -167,15 +167,14 @@ class Language(object): 3) by a model name/version (and optionally a package root dir) - Language(model='en_default') - - Language(model='en_default', version='1.0.0') - - Language(model='en_default', version='1.0.0', data_dir='spacy/data') + - Language(model='en_default ==1.0.0') + - Language(model='en_default <1.1.0, data_dir='spacy/data') """ data_dir = kwargs.pop('data_dir', None) lang = kwargs.pop('lang', None) model = kwargs.pop('model', None) - version = kwargs.pop('version', None) vocab = kwargs.pop('vocab', None) tokenizer = kwargs.pop('tokenizer', None) @@ -210,11 +209,7 @@ class Language(object): warn("using non-package data_dir", DeprecationWarning) package = Package(data_dir) else: - if model is None: - model = '%s_default' % (lang or 'en') - version = None - package = get_package(name=model, version=version, - data_path=data_dir) + package = get_package(name=model, data_path=data_dir) if load_vectors is not True: warn("load_vectors is deprecated", DeprecationWarning) diff --git a/spacy/util.py b/spacy/util.py index be0b2f433..8c9ea319c 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -8,7 +8,7 @@ from sputnik import Sputnik from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE -def get_package(name=None, version=None, data_path=None): +def get_package(name=None, data_path=None): if data_path is None: if os.environ.get('SPACY_DATA'): data_path = os.environ.get('SPACY_DATA') @@ -18,12 +18,7 @@ def get_package(name=None, version=None, data_path=None): sputnik = Sputnik('spacy', '0.100.0') # TODO: retrieve version pool = sputnik.pool(data_path) - - if name is None: - name = 'en_default' - if version: - name += ' ==%s' % version - return pool.get(name) + return pool.get(name or 'en_default') def normalize_slice(length, start, stop, step=None):