diff --git a/requirements.txt b/requirements.txt
index 344cc7665..7565025f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,4 @@ plac
 six
 ujson
 cloudpickle
-sputnik == 0.5.2
+sputnik == 0.6.0
diff --git a/setup.py b/setup.py
index 8e3e72087..6a38ef918 100644
--- a/setup.py
+++ b/setup.py
@@ -179,7 +179,7 @@ def run_setup(exts):
         license="MIT",
         install_requires=['numpy', 'murmurhash == 0.24', 'cymem == 1.30', 'preshed == 0.44',
                           'thinc == 4.0.0', "text_unidecode", 'plac', 'six',
-                          'ujson', 'cloudpickle', 'sputnik == 0.5.2'],
+                          'ujson', 'cloudpickle', 'sputnik == 0.6.0'],
         setup_requires=["headers_workaround"],
         cmdclass = {'build_ext': build_ext_subclass },
     )
diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py
index 262fbe289..d7cc3dc65 100644
--- a/spacy/de/__init__.py
+++ b/spacy/de/__init__.py
@@ -6,6 +6,4 @@ from ..language import Language
 
 
 class German(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
+    pass
diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 4d057db20..309deae41 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -4,8 +4,6 @@ from os import path
 
 from ..language import Language
 
-LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
-
 # improved list from Stone, Denis, Kwantes (2010)
 STOPWORDS = """
@@ -35,10 +33,6 @@ your yours yourself yourselves
 STOPWORDS = set(w for w in STOPWORDS.split() if w)
 
 class English(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return LOCAL_DATA_DIR
-
     @staticmethod
     def is_stop(string):
         return 1 if string.lower() in STOPWORDS else 0
diff --git a/spacy/fi/__init__.py b/spacy/fi/__init__.py
index 8e7173767..adba064a5 100644
--- a/spacy/fi/__init__.py
+++ b/spacy/fi/__init__.py
@@ -6,6 +6,4 @@ from ..language import Language
 
 
 class Finnish(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
+    pass
diff --git a/spacy/it/__init__.py b/spacy/it/__init__.py
index a494de41f..6a824fe82 100644
--- a/spacy/it/__init__.py
+++ b/spacy/it/__init__.py
@@ -6,6 +6,4 @@ from ..language import Language
 
 
 class Italian(Language):
-    @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
+    pass
diff --git a/spacy/language.py b/spacy/language.py
index f598518e2..2dce0ea0d 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -20,6 +20,7 @@ from .syntax.ner import BiluoPushDown
 from .syntax.arc_eager import ArcEager
 
 from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD
+from .util import default_package
 
 
 class Language(object):
@@ -100,7 +101,7 @@
         return 0
 
     @classmethod
-    def default_lex_attrs(cls, data_dir=None):
+    def default_lex_attrs(cls):
         return {
             attrs.LOWER: cls.lower,
             attrs.NORM: cls.norm,
@@ -134,73 +135,42 @@
         return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
 
     @classmethod
-    def default_data_dir(cls):
-        return path.join(path.dirname(__file__), 'data')
-
-    @classmethod
-    def default_vocab(cls, data_dir=None, get_lex_attr=None):
-        if data_dir is None:
-            data_dir = cls.default_data_dir()
+    def default_vocab(cls, package=None, get_lex_attr=None):
+        if package is None:
+            package = default_package()
         if get_lex_attr is None:
-            get_lex_attr = cls.default_lex_attrs(data_dir)
-        return Vocab.from_dir(
-            path.join(data_dir, 'vocab'),
-            get_lex_attr=get_lex_attr)
+            get_lex_attr = cls.default_lex_attrs()
+        return Vocab.from_package(package, get_lex_attr=get_lex_attr)
 
     @classmethod
-    def default_tokenizer(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Tokenizer.from_dir(vocab, data_dir)
-        else:
-            return Tokenizer(vocab, {}, None, None, None)
+    def default_parser(cls, package, vocab):
+        data_dir = package.dir_path('data', 'deps')
+        return Parser.from_dir(data_dir, vocab.strings, ArcEager)
 
     @classmethod
-    def default_tagger(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Tagger.from_dir(data_dir, vocab)
-        else:
-            return None
+    def default_entity(cls, package, vocab):
+        data_dir = package.dir_path('data', 'ner')
+        return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
 
-    @classmethod
-    def default_parser(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Parser.from_dir(data_dir, vocab.strings, ArcEager)
-        else:
-            return None
-
-    @classmethod
-    def default_entity(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Parser.from_dir(data_dir, vocab.strings, BiluoPushDown)
-        else:
-            return None
-
-    @classmethod
-    def default_matcher(cls, vocab, data_dir):
-        if path.exists(data_dir):
-            return Matcher.from_dir(data_dir, vocab)
-        else:
-            return None
-
-    def __init__(self, data_dir=None, vocab=None, tokenizer=None, tagger=None,
+    def __init__(self, package=None, vocab=None, tokenizer=None, tagger=None,
                  parser=None, entity=None, matcher=None, serializer=None,
                  load_vectors=True):
         if load_vectors is not True:
             warn("load_vectors is deprecated", DeprecationWarning)
-        if data_dir in (None, True):
-            data_dir = self.default_data_dir()
+        if package in (None, True):
+            package = default_package()
         if vocab in (None, True):
-            vocab = self.default_vocab(data_dir)
+            vocab = self.default_vocab(package)
         if tokenizer in (None, True):
-            tokenizer = self.default_tokenizer(vocab, data_dir=path.join(data_dir, 'tokenizer'))
+            tokenizer = Tokenizer.from_package(package, vocab)
         if tagger in (None, True):
-            tagger = self.default_tagger(vocab, data_dir=path.join(data_dir, 'pos'))
+            tagger = Tagger.from_package(package, vocab)
         if entity in (None, True):
-            entity = self.default_entity(vocab, data_dir=path.join(data_dir, 'ner'))
+            entity = self.default_entity(package, vocab)
         if parser in (None, True):
-            parser = self.default_parser(vocab, data_dir=path.join(data_dir, 'deps'))
+            parser = self.default_parser(package, vocab)
         if matcher in (None, True):
-            matcher = self.default_matcher(vocab, data_dir=data_dir)
+            matcher = Matcher.from_package(package, vocab)
         self.vocab = vocab
         self.tokenizer = tokenizer
         self.tagger = tagger
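
Not part of the patch: a rough sketch of how the reworked Language defaults above fit together, assuming an en_default model package is already installed where default_package() looks for it.

    from spacy.en import English
    from spacy.util import default_package

    package = default_package()                      # sputnik package handle
    vocab = English.default_vocab(package)           # wraps Vocab.from_package()
    parser = English.default_parser(package, vocab)  # reads data/deps from the package

    nlp = English()   # __init__ performs the same resolution automatically
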
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 08e511f68..d7fdcf76b 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -12,16 +12,21 @@ from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
 
 class Lemmatizer(object):
     @classmethod
-    def from_dir(cls, data_dir):
+    def from_package(cls, package):
         index = {}
         exc = {}
         for pos in ['adj', 'noun', 'verb']:
-            index[pos] = read_index(path.join(data_dir, 'wordnet', 'index.%s' % pos))
-            exc[pos] = read_exc(path.join(data_dir, 'wordnet', '%s.exc' % pos))
-        if path.exists(path.join(data_dir, 'vocab', 'lemma_rules.json')):
-            rules = json.load(codecs.open(path.join(data_dir, 'vocab', 'lemma_rules.json'), encoding='utf_8'))
-        else:
-            rules = {}
+            index[pos] = package.load_utf8(read_index,
+                'data', 'wordnet', 'index.%s' % pos,
+                default=set())  # TODO: really optional?
+            exc[pos] = package.load_utf8(read_exc,
+                'data', 'wordnet', '%s.exc' % pos,
+                default={})  # TODO: really optional?
+
+        rules = package.load_utf8(json.load,
+            'data', 'vocab', 'lemma_rules.json',
+            default={})  # TODO: really optional?
+
         return cls(index, exc, rules)
 
     def __init__(self, index, exceptions, rules):
@@ -70,11 +75,9 @@ def lemmatize(string, index, exceptions, rules):
     return set(forms)
 
 
-def read_index(loc):
+def read_index(fileobj):
     index = set()
-    if not path.exists(loc):
-        return index
-    for line in codecs.open(loc, 'r', 'utf8'):
+    for line in fileobj:
         if line.startswith(' '):
             continue
         pieces = line.split()
@@ -84,11 +87,9 @@
     return index
 
 
-def read_exc(loc):
+def read_exc(fileobj):
     exceptions = {}
-    if not path.exists(loc):
-        return exceptions
-    for line in codecs.open(loc, 'r', 'utf8'):
+    for line in fileobj:
         if line.startswith(' '):
             continue
         pieces = line.split()
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 1fa91fab1..4319d593b 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -169,14 +169,11 @@ cdef class Matcher:
     cdef object _patterns
 
     @classmethod
-    def from_dir(cls, data_dir, Vocab vocab):
-        patterns_loc = path.join(data_dir, 'vocab', 'gazetteer.json')
-        if path.exists(patterns_loc):
-            patterns_data = open(patterns_loc).read()
-            patterns = json.loads(patterns_data)
-            return cls(vocab, patterns)
-        else:
-            return cls(vocab, {})
+    def from_package(cls, package, Vocab vocab):
+        patterns = package.load_utf8(json.load,
+            'data', 'vocab', 'gazetteer.json',
+            default={})  # TODO: really optional?
+        return cls(vocab, patterns)
 
     def __init__(self, vocab, patterns):
         self.vocab = vocab
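
Not part of the patch: the load_utf8 calls above follow a loader-callback pattern. The first argument is a parse function that receives an open UTF-8 file object for the named path inside the package, and default= is presumably returned when the file is missing. A sketch mirroring the updated lemmatizer tests, assuming the en_default package is installed:

    from spacy.lemmatizer import Lemmatizer, read_index, read_exc
    from spacy.util import default_package

    package = default_package()
    index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun')  # callback gets the open file
    exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc')
    lemmatizer = Lemmatizer.from_package(package)
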
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 421d97357..91f574348 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -146,15 +146,17 @@ cdef class Tagger:
         return cls(vocab, model)
 
     @classmethod
-    def from_dir(cls, data_dir, vocab):
-        if path.exists(path.join(data_dir, 'templates.json')):
-            templates = json.loads(open(path.join(data_dir, 'templates.json')))
-        else:
-            templates = cls.default_templates()
+    def from_package(cls, package, vocab):
+        # TODO: templates.json deprecated? not present in latest package
+        templates = package.load_utf8(json.load,
+            'data', 'pos', 'templates.json',
+            default=cls.default_templates())
+
         model = TaggerModel(vocab.morphology.n_tags,
                             ConjunctionExtracter(N_CONTEXT_FIELDS, templates))
-        if path.exists(path.join(data_dir, 'model')):
-            model.load(path.join(data_dir, 'model'))
+
+        model.load(package.file_path('data', 'pos', 'model', require=False))  # TODO: really optional?
+
         return cls(vocab, model)
 
     def __init__(self, Vocab vocab, TaggerModel model):
diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py
index 6aee2c31e..03e728a12 100644
--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -1,12 +1,11 @@
+from spacy.en import English
+
 import pytest
-from spacy.en import English, LOCAL_DATA_DIR
-import os
 
 
 @pytest.fixture(scope="session")
 def EN():
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    return English(data_dir=data_dir)
+    return English()
 
 
 def pytest_addoption(parser):
diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py
index e0d24208a..0e13b2de5 100644
--- a/spacy/tests/serialize/test_packer.py
+++ b/spacy/tests/serialize/test_packer.py
@@ -10,7 +10,6 @@ from spacy.en import English
 from spacy.vocab import Vocab
 from spacy.tokens.doc import Doc
 from spacy.tokenizer import Tokenizer
-from spacy.en import LOCAL_DATA_DIR
 from os import path
 
 from spacy.attrs import ORTH, SPACY, TAG, DEP, HEAD
diff --git a/spacy/tests/spans/conftest.py b/spacy/tests/spans/conftest.py
index f63816e05..d8a0a4cb9 100644
--- a/spacy/tests/spans/conftest.py
+++ b/spacy/tests/spans/conftest.py
@@ -1,9 +1,8 @@
 import pytest
-from spacy.en import English, LOCAL_DATA_DIR
+from spacy.en import English
 import os
 
 
 @pytest.fixture(scope="session")
 def en_nlp():
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    return English(data_dir=data_dir)
+    return English()
diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py
index 91aa8ee65..708594299 100644
--- a/spacy/tests/tagger/test_lemmatizer.py
+++ b/spacy/tests/tagger/test_lemmatizer.py
@@ -4,31 +4,33 @@ import io
 import pickle
 
 from spacy.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import LOCAL_DATA_DIR
-from os import path
+from spacy.util import default_package
 
 import pytest
 
 
-def test_read_index():
-    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
-    index = read_index(path.join(wn, 'index.noun'))
+@pytest.fixture
+def package():
+    return default_package()
+
+
+@pytest.fixture
+def lemmatizer(package):
+    return Lemmatizer.from_package(package)
+
+
+def test_read_index(package):
+    index = package.load_utf8(read_index, 'data', 'wordnet', 'index.noun')
     assert 'man' in index
     assert 'plantes' not in index
    assert 'plant' in index
 
 
-def test_read_exc():
-    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
-    exc = read_exc(path.join(wn, 'verb.exc'))
+def test_read_exc(package):
+    exc = package.load_utf8(read_exc, 'data', 'wordnet', 'verb.exc')
     assert exc['was'] == ('be',)
 
 
-@pytest.fixture
-def lemmatizer():
-    return Lemmatizer.from_dir(path.join(LOCAL_DATA_DIR))
-
-
 def test_noun_lemmas(lemmatizer):
     do = lemmatizer.noun
 
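
Not part of the patch: the tagger change above treats templates.json and the model file as optional (default= and require=False). A hedged sketch of loading a Tagger through the same path, assuming the compiled spacy.tagger module and an installed en_default package:

    from spacy.util import default_package
    from spacy.vocab import Vocab
    from spacy.tagger import Tagger

    package = default_package()
    vocab = Vocab.from_package(package)
    tagger = Tagger.from_package(package, vocab)   # falls back to default_templates() if templates.json is absent
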
diff --git a/spacy/tests/tokens/test_token_references.py b/spacy/tests/tokens/test_token_references.py
index e8dbff6fe..24639f141 100644
--- a/spacy/tests/tokens/test_token_references.py
+++ b/spacy/tests/tokens/test_token_references.py
@@ -2,16 +2,15 @@ from __future__ import unicode_literals
 import pytest
 import gc
-from spacy.en import English, LOCAL_DATA_DIR
+from spacy.en import English
 import os
 
-data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
 
 # Let this have its own instances, as we have to be careful about memory here
 # that's the point, after all
 
 @pytest.mark.models
 def get_orphan_token(text, i):
-    nlp = English(data_dir=data_dir)
+    nlp = English()
     tokens = nlp(text)
     gc.collect()
     token = tokens[i]
@@ -41,7 +40,7 @@ def _orphan_from_list(toks):
 @pytest.mark.models
 def test_list_orphans():
     # Test case from NSchrading
-    nlp = English(data_dir=data_dir)
+    nlp = English()
     samples = ["a", "test blah wat okay"]
     lst = []
     for sample in samples:
diff --git a/spacy/tests/website/conftest.py b/spacy/tests/website/conftest.py
index 35c38d845..b4934d20b 100644
--- a/spacy/tests/website/conftest.py
+++ b/spacy/tests/website/conftest.py
@@ -5,9 +5,8 @@ import os
 
 @pytest.fixture(scope='session')
 def nlp():
-    from spacy.en import English, LOCAL_DATA_DIR
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    return English(data_dir=data_dir)
+    from spacy.en import English
+    return English()
 
 
 @pytest.fixture()
diff --git a/spacy/tests/website/test_home.py b/spacy/tests/website/test_home.py
index d03acf855..3d9c8aba9 100644
--- a/spacy/tests/website/test_home.py
+++ b/spacy/tests/website/test_home.py
@@ -10,9 +10,8 @@ def token(doc):
 
 
 def test_load_resources_and_process_text():
-    from spacy.en import English, LOCAL_DATA_DIR
-    data_dir = os.environ.get('SPACY_DATA', LOCAL_DATA_DIR)
-    nlp = English(data_dir=data_dir)
+    from spacy.en import English
+    nlp = English()
 
     doc = nlp('Hello, world. Here are two sentences.')
 
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 1cde1e76e..345734682 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -41,8 +41,8 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)
 
     @classmethod
-    def from_dir(cls, Vocab vocab, data_dir):
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(data_dir)
+    def from_package(cls, package, Vocab vocab):
+        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
         prefix_re = re.compile(prefix_re)
         suffix_re = re.compile(suffix_re)
         infix_re = re.compile(infix_re)
diff --git a/spacy/util.py b/spacy/util.py
index 849a3e219..5592e64eb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1,10 +1,23 @@
-from os import path
+import os
 import io
 import json
 import re
+
+from sputnik import Sputnik
+
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
-DATA_DIR = path.join(path.dirname(__file__), '..', 'data')
+
+def default_package():
+    if os.environ.get('SPACY_DATA'):
+        data_path = os.environ.get('SPACY_DATA')
+    else:
+        data_path = os.path.abspath(
+            os.path.join(os.path.dirname(__file__), 'data'))
+
+    sputnik = Sputnik('spacy', '0.99.0')  # TODO: retrieve version
+    pool = sputnik.pool(data_path)
+    return pool.get('en_default')
 
 
 def normalize_slice(length, start, stop, step=None):
@@ -31,67 +44,63 @@ def utf8open(loc, mode='r'):
     return io.open(loc, mode, encoding='utf8')
 
 
-def read_lang_data(data_dir):
-    with open(path.join(data_dir, 'specials.json')) as file_:
-        tokenization = json.load(file_)
-    prefix = read_prefix(data_dir)
-    suffix = read_suffix(data_dir)
-    infix = read_infix(data_dir)
+def read_lang_data(package):
+    tokenization = package.load_utf8(json.load, 'data', 'tokenizer', 'specials.json')
+    prefix = package.load_utf8(read_prefix, 'data', 'tokenizer', 'prefix.txt')
+    suffix = package.load_utf8(read_suffix, 'data', 'tokenizer', 'suffix.txt')
+    infix = package.load_utf8(read_infix, 'data', 'tokenizer', 'infix.txt')
     return tokenization, prefix, suffix, infix
 
 
-def read_prefix(data_dir):
-    with utf8open(path.join(data_dir, 'prefix.txt')) as file_:
-        entries = file_.read().split('\n')
-        expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
+def read_prefix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
     return expression
 
 
-def read_suffix(data_dir):
-    with utf8open(path.join(data_dir, 'suffix.txt')) as file_:
-        entries = file_.read().split('\n')
-        expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
+def read_suffix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
     return expression
 
 
-def read_infix(data_dir):
-    with utf8open(path.join(data_dir, 'infix.txt')) as file_:
-        entries = file_.read().split('\n')
-        expression = '|'.join([piece for piece in entries if piece.strip()])
+def read_infix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join([piece for piece in entries if piece.strip()])
     return expression
 
 
-def read_tokenization(lang):
-    loc = path.join(DATA_DIR, lang, 'tokenization')
-    entries = []
-    seen = set()
-    with utf8open(loc) as file_:
-        for line in file_:
-            line = line.strip()
-            if line.startswith('#'):
-                continue
-            if not line:
-                continue
-            pieces = line.split()
-            chunk = pieces.pop(0)
-            assert chunk not in seen, chunk
-            seen.add(chunk)
-            entries.append((chunk, list(pieces)))
-            if chunk[0].isalpha() and chunk[0].islower():
-                chunk = chunk[0].title() + chunk[1:]
-                pieces[0] = pieces[0][0].title() + pieces[0][1:]
-                seen.add(chunk)
-                entries.append((chunk, pieces))
-    return entries
+# def read_tokenization(lang):
+#     loc = path.join(DATA_DIR, lang, 'tokenization')
+#     entries = []
+#     seen = set()
+#     with utf8open(loc) as file_:
+#         for line in file_:
+#             line = line.strip()
+#             if line.startswith('#'):
+#                 continue
+#             if not line:
+#                 continue
+#             pieces = line.split()
+#             chunk = pieces.pop(0)
+#             assert chunk not in seen, chunk
+#             seen.add(chunk)
+#             entries.append((chunk, list(pieces)))
+#             if chunk[0].isalpha() and chunk[0].islower():
+#                 chunk = chunk[0].title() + chunk[1:]
+#                 pieces[0] = pieces[0][0].title() + pieces[0][1:]
+#                 seen.add(chunk)
+#                 entries.append((chunk, pieces))
+#     return entries
 
 
-def read_detoken_rules(lang): # Deprecated?
-    loc = path.join(DATA_DIR, lang, 'detokenize')
-    entries = []
-    with utf8open(loc) as file_:
-        for line in file_:
-            entries.append(line.strip())
-    return entries
+# def read_detoken_rules(lang): # Deprecated?
+#     loc = path.join(DATA_DIR, lang, 'detokenize')
+#     entries = []
+#     with utf8open(loc) as file_:
+#         for line in file_:
+#             entries.append(line.strip())
+#     return entries
 
 
 def align_tokens(ref, indices): # Deprecated, surely?
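
Not part of the patch: default_package() above resolves the data location from SPACY_DATA when set, otherwise from the spacy/data directory, and then asks a sputnik pool for the en_default package. A sketch of overriding the location; the path is hypothetical and an en_default package must already be installed there:

    import os
    os.environ['SPACY_DATA'] = '/tmp/spacy_data'        # hypothetical install location

    from spacy.util import default_package, read_lang_data
    package = default_package()                         # pool rooted at /tmp/spacy_data
    rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
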
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3817e7127..ac083d9bc 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -47,28 +47,27 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
     @classmethod
-    def from_dir(cls, data_dir, get_lex_attr=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+    def from_package(cls, package, get_lex_attr=None):
+        tag_map = package.load_utf8(json.load,
+            'data', 'vocab', 'tag_map.json')
+
+        lemmatizer = Lemmatizer.from_package(package)
+
+        serializer_freqs = package.load_utf8(json.load,
+            'data', 'vocab', 'serializer.json',
+            require=False)  # TODO: really optional?
 
-        tag_map = json.load(open(path.join(data_dir, 'tag_map.json')))
-        lemmatizer = Lemmatizer.from_dir(path.join(data_dir, '..'))
-        if path.exists(path.join(data_dir, 'serializer.json')):
-            serializer_freqs = json.load(open(path.join(data_dir, 'serializer.json')))
-        else:
-            serializer_freqs = None
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
-        if path.exists(path.join(data_dir, 'strings.json')):
-            with io.open(path.join(data_dir, 'strings.json'), 'r', encoding='utf8') as file_:
-                self.strings.load(file_)
-        self.load_lexemes(path.join(data_dir, 'lexemes.bin'))
-
-        if path.exists(path.join(data_dir, 'vec.bin')):
-            self.vectors_length = self.load_vectors_from_bin_loc(path.join(data_dir, 'vec.bin'))
+        if package.has_file('data', 'vocab', 'strings.json'):  # TODO: really optional?
+            package.load_utf8(self.strings.load, 'data', 'vocab', 'strings.json')
+        self.load_lexemes(package.file_path('data', 'vocab', 'lexemes.bin'))
+
+        if package.has_file('data', 'vocab', 'vec.bin'):  # TODO: really optional?
+            self.vectors_length = self.load_vectors_from_bin_loc(
+                package.file_path('data', 'vocab', 'vec.bin'))
+
         return self
 
     def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
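
Not part of the patch: an end-to-end sketch of the new loading path, mirroring the updated signatures and the test_home.py example; it assumes the en_default data package has been installed:

    from spacy.en import English
    from spacy.util import default_package
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer

    package = default_package()
    vocab = Vocab.from_package(package)                # tag map, lemmatizer, strings, lexemes, vectors
    tokenizer = Tokenizer.from_package(package, vocab)

    nlp = English()                                    # wires up vocab, tokenizer, tagger, entity, parser, matcher
    doc = nlp('Hello, world. Here are two sentences.')
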