From 83e364188c1bbacb440f398c03e620f94441e402 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Sat, 24 Sep 2016 15:42:01 +0200
Subject: [PATCH] Mostly finished loading refactoring. Design is in place, but
 doesn't work yet.

---
 spacy/__init__.py                    |  25 ++++--
 spacy/deprecated.py                  |  99 +++++++++++++++
 spacy/language.py                    | 113 ++++++++----------
 spacy/lemmatizer.py                  |   1 -
 spacy/matcher.pyx                    |  13 ++-
 spacy/syntax/parser.pxd              |   3 +-
 spacy/syntax/parser.pyx              |  38 +++-----
 spacy/tagger.pyx                     |  21 ++---
 spacy/tests/munge/test_align.py      |   2 +-
 spacy/tests/munge/test_detokenize.py |   2 +-
 spacy/tests/test_matcher.py          |   2 +-
 spacy/tokenizer.pyx                  |  75 +++++++++++-----
 spacy/util.py                        | 127 ---------------------------
 spacy/vocab.pyx                      |  39 ++++----
 14 files changed, 265 insertions(+), 295 deletions(-)
 create mode 100644 spacy/deprecated.py

diff --git a/spacy/__init__.py b/spacy/__init__.py
index 4c8a73797..e8ad5dde3 100644
--- a/spacy/__init__.py
+++ b/spacy/__init__.py
@@ -1,23 +1,38 @@
-from .util import set_lang_class, get_lang_class, get_package, get_package_by_name
+import pathlib
+
+from .util import set_lang_class, get_lang_class
 
 from . import en
 from . import de
 from . import zh
 
+_data_path = pathlib.Path(__file__).parent / 'data'
+
 set_lang_class(en.English.lang, en.English)
 set_lang_class(de.German.lang, de.German)
 set_lang_class(zh.Chinese.lang, zh.Chinese)
 
 
+def get_data_path():
+    return _data_path
+
+
+def set_data_path(path):
+    global _data_path
+    if isinstance(path, basestring):
+        path = pathlib.Path(path)
+    _data_path = path
+
+
 def load(name, vocab=None, tokenizer=None, parser=None, tagger=None, entity=None,
          matcher=None, serializer=None, vectors=None, via=None):
-    package = get_package_by_name(name, via=via)
-    vectors_package = get_package_by_name(vectors, via=via)
+    if via is None:
+        via = get_data_path()
     cls = get_lang_class(name)
     return cls(
-        package=package,
-        vectors_package=vectors_package,
+        via,
+        vectors=vectors,
         vocab=vocab,
         tokenizer=tokenizer,
         tagger=tagger,
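The new top-level API resolves models from a plain data directory (spacy/data by
default) instead of a sputnik package lookup. A rough sketch of the intended
usage -- the 'en' model name and the override path are illustrative, and per the
commit message the wiring underneath is not functional yet:

    import spacy

    spacy.set_data_path('/usr/local/share/spacy_data')   # optional override
    nlp = spacy.load('en')    # same as spacy.load('en', via=spacy.get_data_path())
    doc = nlp(u'This is a sentence.')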
diff --git a/spacy/deprecated.py b/spacy/deprecated.py
new file mode 100644
index 000000000..2f9109772
--- /dev/null
+++ b/spacy/deprecated.py
@@ -0,0 +1,99 @@
+from sputnik.dir_package import DirPackage
+from sputnik.package_list import (PackageNotFoundException,
+                                  CompatiblePackageNotFoundException)
+
+import re
+import six
+import sputnik
+
+from . import about
+from .util import get_lang_class
+
+
+def get_package(data_dir):
+    if not isinstance(data_dir, six.string_types):
+        raise RuntimeError('data_dir must be a string')
+    return DirPackage(data_dir)
+
+
+def get_package_by_name(name=None, via=None):
+    if name is None:
+        return
+    lang = get_lang_class(name)
+    try:
+        return sputnik.package(about.__title__, about.__version__,
+                               name, data_path=via)
+    except PackageNotFoundException as e:
+        raise RuntimeError("Model '%s' not installed. Please run 'python -m "
+                           "%s.download' to install latest compatible "
+                           "model." % (name, lang.__module__))
+    except CompatiblePackageNotFoundException as e:
+        raise RuntimeError("Installed model is not compatible with spaCy "
+                           "version. Please run 'python -m %s.download "
+                           "--force' to install latest compatible model." %
+                           (lang.__module__))
+
+
+def read_lang_data(package):
+    tokenization = package.load_json(('tokenizer', 'specials.json'))
+    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
+        prefix = read_prefix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
+        suffix = read_suffix(file_) if file_ is not None else None
+    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
+        infix = read_infix(file_) if file_ is not None else None
+    return tokenization, prefix, suffix, infix
+
+
+def read_prefix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
+    return expression
+
+
+def read_suffix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
+    return expression
+
+
+def read_infix(fileobj):
+    entries = fileobj.read().split('\n')
+    expression = '|'.join([piece for piece in entries if piece.strip()])
+    return expression
+
+
+def align_tokens(ref, indices): # Deprecated, surely?
+    start = 0
+    queue = list(indices)
+    for token in ref:
+        end = start + len(token)
+        emit = []
+        while queue and queue[0][1] <= end:
+            emit.append(queue.pop(0))
+        yield token, emit
+        start = end
+    assert not queue
+
+
+def detokenize(token_rules, words): # Deprecated?
+    """To align with treebanks, return a list of "chunks", where a chunk is a
+    sequence of tokens that are separated by whitespace in actual strings. Each
+    chunk should be a tuple of token indices, e.g.
+
+    >>> detokenize(["ca<SEP>n't", '!'], ["I", "ca", "n't", "!"])
+    [(0,), (1, 2, 3)]
+    """
+    string = ' '.join(words)
+    for subtoks in token_rules:
+        # Algorithmically this is dumb, but writing a little list-based match
+        # machine? Ain't nobody got time for that.
+        string = string.replace(subtoks.replace('<SEP>', ' '), subtoks)
+    positions = []
+    i = 0
+    for chunk in string.split():
+        subtoks = chunk.split('<SEP>')
+        positions.append(tuple(range(i, i+len(subtoks))))
+        i += len(subtoks)
+    return positions
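For reference, the tokenizer rule files these helpers read are one entry per
line: read_prefix() escapes each entry and anchors it at the start of the
string, while read_suffix() and read_infix() treat entries as regex fragments
(suffixes just get a trailing '$'). A small illustration with made-up entries:

    import re

    prefix = '|'.join(['^' + re.escape(piece) for piece in ['(', '"', "'"]])
    suffix = '|'.join([piece + '$' for piece in ['!', ',', r'\.']])
    assert re.compile(prefix).search('"Hello') is not None   # leading quote
    assert re.compile(suffix).search('Hello!') is not None   # trailing bang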
diff --git a/spacy/language.py b/spacy/language.py
index 9ba0caf23..2e0a3f022 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -25,13 +25,19 @@ class Defaults(object):
     def __init__(self, lang, path):
         self.lang = lang
         self.path = path
+        self.lex_attr_getters = dict(self.__class__.lex_attr_getters)
+        if (self.path / 'vocab' / 'oov_prob').exists():
+            with (self.path / 'vocab' / 'oov_prob').open() as file_:
+                oov_prob = float(file_.read().strip())
+            self.lex_attr_getters[attrs.PROB] = lambda string: oov_prob
+        self.lex_attr_getters[attrs.LANG] = lambda string: self.lang
 
     def Vectors(self):
         pass
 
-    def Vocab(self, vectors=None, get_lex_attr=None):
-        if get_lex_attr is None:
-            get_lex_attr = self.lex_attrs()
+    def Vocab(self, vectors=None, lex_attr_getters=None):
+        if lex_attr_getters is None:
+            lex_attr_getters = dict(self.lex_attr_getters)
         if vectors is None:
             vectors = self.Vectors()
-        return Vocab.load(self.path, get_lex_attr=get_lex_attr, vectors=vectors)
+        return Vocab.load(self.path, get_lex_attr=lex_attr_getters, vectors=vectors)
@@ -64,84 +70,42 @@ class Defaults(object):
                 nlp.parser,
                 nlp.entity]
 
-    def dep_labels(self):
-        return {0: {'ROOT': True}}
+    dep_labels = {0: {'ROOT': True}}
 
-    def ner_labels(self):
-        return {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
+    ner_labels = {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}}
 
-    def lex_attrs(self, *args, **kwargs):
-        if 'oov_prob' in kwargs:
-            oov_prob = kwargs.get('oov_prob', -20)
-        else:
-            with (self.path / 'vocab' / 'oov_prob').open() as file_:
-                oov_prob = file_.read().strip()
-        return {
-            attrs.LOWER: self.lower,
-            attrs.NORM: self.norm,
-            attrs.SHAPE: orth.word_shape,
-            attrs.PREFIX: self.prefix,
-            attrs.SUFFIX: self.suffix,
-            attrs.CLUSTER: self.cluster,
-            attrs.PROB: lambda string: oov_prob,
-            attrs.LANG: lambda string: self.lang,
-            attrs.IS_ALPHA: orth.is_alpha,
-            attrs.IS_ASCII: orth.is_ascii,
-            attrs.IS_DIGIT: self.is_digit,
-            attrs.IS_LOWER: orth.is_lower,
-            attrs.IS_PUNCT: orth.is_punct,
-            attrs.IS_SPACE: self.is_space,
-            attrs.IS_TITLE: orth.is_title,
-            attrs.IS_UPPER: orth.is_upper,
-            attrs.IS_BRACKET: orth.is_bracket,
-            attrs.IS_QUOTE: orth.is_quote,
-            attrs.IS_LEFT_PUNCT: orth.is_left_punct,
-            attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
-            attrs.LIKE_URL: orth.like_url,
-            attrs.LIKE_NUM: orth.like_number,
-            attrs.LIKE_EMAIL: orth.like_email,
-            attrs.IS_STOP: self.is_stop,
-            attrs.IS_OOV: lambda string: True
-        }
-
-    @staticmethod
-    def lower(string):
-        return string.lower()
-
-    @staticmethod
-    def norm(string):
-        return string
-
-    @staticmethod
-    def prefix(string):
-        return string[0]
-
-    @staticmethod
-    def suffix(string):
-        return string[-3:]
-
-    @staticmethod
-    def cluster(string):
-        return 0
-
-    @staticmethod
-    def is_digit(string):
-        return string.isdigit()
-
-    @staticmethod
-    def is_space(string):
-        return string.isspace()
-
-    @staticmethod
-    def is_stop(string):
-        return 0
+    lex_attr_getters = {
+        attrs.LOWER: lambda string: string.lower(),
+        attrs.NORM: lambda string: string,
+        attrs.SHAPE: orth.word_shape,
+        attrs.PREFIX: lambda string: string[0],
+        attrs.SUFFIX: lambda string: string[-3:],
+        attrs.CLUSTER: lambda string: 0,
+        attrs.IS_ALPHA: orth.is_alpha,
+        attrs.IS_ASCII: orth.is_ascii,
+        attrs.IS_DIGIT: lambda string: string.isdigit(),
+        attrs.IS_LOWER: orth.is_lower,
+        attrs.IS_PUNCT: orth.is_punct,
+        attrs.IS_SPACE: lambda string: string.isspace(),
+        attrs.IS_TITLE: orth.is_title,
+        attrs.IS_UPPER: orth.is_upper,
+        attrs.IS_BRACKET: orth.is_bracket,
+        attrs.IS_QUOTE: orth.is_quote,
+        attrs.IS_LEFT_PUNCT: orth.is_left_punct,
+        attrs.IS_RIGHT_PUNCT: orth.is_right_punct,
+        attrs.LIKE_URL: orth.like_url,
+        attrs.LIKE_NUM: orth.like_number,
+        attrs.LIKE_EMAIL: orth.like_email,
+        attrs.IS_STOP: lambda string: False,
+        attrs.IS_OOV: lambda string: True
+    }
 
 
 class Language(object):
     '''A text-processing pipeline.  Usually you'll load this once per process, and
     pass the instance around your program.
     '''
-
+    Defaults = Defaults
     lang = None
 
     def __init__(self,
@@ -180,6 +144,7 @@ class Language(object):
             path = data_dir
         if isinstance(path, basestring):
            path = pathlib.Path(path)
+        self.path = path
         defaults = defaults if defaults is not True else self.get_defaults(self.path)
         self.vocab = vocab if vocab is not True else defaults.Vocab(vectors=vectors)
@@ -291,4 +256,4 @@ class Language(object):
 
     def get_defaults(self, path):
-        return Defaults(path)
+        return Defaults(self.lang, path)
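Defaults.lex_attr_getters is now a plain class-level table mapping attribute
IDs to callables over the word string; the Vocab consults it whenever it builds
a new lexeme, and Defaults.__init__ only overrides the PROB and LANG entries
per instance. A quick sketch of what the shared getters compute (the values
follow directly from the lambdas above):

    from spacy import attrs
    from spacy.language import Defaults

    getters = Defaults.lex_attr_getters
    assert getters[attrs.LOWER](u'Google') == u'google'
    assert getters[attrs.PREFIX](u'Google') == u'G'
    assert getters[attrs.SUFFIX](u'Google') == u'gle'
    assert getters[attrs.IS_DIGIT](u'1999') is True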
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index a05ca49c0..486fa8c7f 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -8,7 +8,6 @@ except ImportError:
     import json
 
 from .parts_of_speech import NOUN, VERB, ADJ, PUNCT
-from .util import get_package
 
 
 class Lemmatizer(object):
diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx
index 089dddf95..d4d695379 100644
--- a/spacy/matcher.pyx
+++ b/spacy/matcher.pyx
@@ -23,7 +23,6 @@
 from .tokens.doc cimport Doc
 from .vocab cimport Vocab
 
 from .attrs import FLAG61 as U_ENT
-from .util import get_package
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 from .attrs import FLAG60 as B2_ENT
 from .attrs import FLAG59 as B3_ENT
@@ -195,14 +194,12 @@ cdef class Matcher:
     cdef vector[TokenPatternC*] patterns
     cdef readonly Vocab vocab
     cdef public object _patterns
-
+
     @classmethod
-    def load(cls, data_dir, Vocab vocab):
-        return cls.from_package(get_package(data_dir), vocab=vocab)
-
-    @classmethod
-    def from_package(cls, package, Vocab vocab):
-        patterns = package.load_json(('vocab', 'gazetteer.json'))
+    def load(cls, path, vocab):
+        if (path / 'patterns.json').exists():
+            with (path / 'patterns.json').open() as file_:
+                patterns = json.load(file_)
+        else:
+            patterns = {}
         return cls(vocab, patterns)
 
     def __init__(self, vocab, patterns={}):
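Matcher.load now reads its patterns from a patterns.json file in the model
directory rather than from vocab/gazetteer.json, but the in-memory format is
unchanged -- the same entity-keyed dict used by the test fixture further down
in this patch. A hedged sketch (vocab and doc are assumed to come from an
already-loaded pipeline):

    from spacy.matcher import Matcher

    patterns = {
        'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
        'Java':      ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
    }
    matcher = Matcher(vocab, patterns)   # or Matcher.load(model_path, vocab)
    matches = matcher(doc)               # match tuples over the Doc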
diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd
index e10049fb6..2856cccc9 100644
--- a/spacy/syntax/parser.pxd
+++ b/spacy/syntax/parser.pxd
@@ -4,6 +4,7 @@ from thinc.structs cimport ExampleC
 
 from .stateclass cimport StateClass
 from .arc_eager cimport TransitionSystem
+from ..vocab cimport Vocab
 from ..tokens.doc cimport Doc
 from ..structs cimport TokenC
 from ._state cimport StateC
@@ -13,8 +14,8 @@ cdef class ParserModel(AveragedPerceptron):
     cdef void set_featuresC(self, ExampleC* eg, const StateC* state) nogil
 
 cdef class Parser:
+    cdef readonly Vocab vocab
     cdef readonly ParserModel model
     cdef readonly TransitionSystem moves
-    cdef int _projectivize
 
     cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil
diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx
index 41baa0505..2eaadbc86 100644
--- a/spacy/syntax/parser.pyx
+++ b/spacy/syntax/parser.pyx
@@ -78,34 +78,24 @@ cdef class ParserModel(AveragedPerceptron):
 
 
 cdef class Parser:
-    def __init__(self, StringStore strings, transition_system, ParserModel model, int projectivize = 0):
+    @classmethod
+    def load(cls, path, Vocab vocab, moves_class):
+        with (path / 'config.json').open() as file_:
+            cfg = json.load(file_)
+        moves = moves_class(vocab.strings, cfg['labels'])
+        templates = get_templates(cfg['features'])
+        model = ParserModel(templates)
+        if (path / 'model').exists():
+            model.load(path / 'model')
+        return cls(vocab, moves, model, **cfg)
+
+    def __init__(self, Vocab vocab, transition_system, ParserModel model, **cfg):
+        self.vocab = vocab
         self.moves = transition_system
         self.model = model
-        self._projectivize = projectivize
-
-    @classmethod
-    def from_dir(cls, model_dir, strings, transition_system):
-        if not os.path.exists(model_dir):
-            print >> sys.stderr, "Warning: No model found at", model_dir
-        elif not os.path.isdir(model_dir):
-            print >> sys.stderr, "Warning: model path:", model_dir, "is not a directory"
-        cfg = Config.read(model_dir, 'config')
-        moves = transition_system(strings, cfg.labels)
-        templates = get_templates(cfg.features)
-        model = ParserModel(templates)
-        project = cfg.projectivize if hasattr(cfg,'projectivize') else False
-        if path.exists(path.join(model_dir, 'model')):
-            model.load(path.join(model_dir, 'model'))
-        return cls(strings, moves, model, project)
-
-    @classmethod
-    def load(cls, pkg_or_str_or_file, vocab):
-        # TODO
-        raise NotImplementedError(
-                "This should be here, but isn't yet =/. Use Parser.from_dir")
+        self.cfg = cfg
 
     def __reduce__(self):
-        return (Parser, (self.moves.strings, self.moves, self.model), None, None)
+        return (Parser, (self.vocab, self.moves, self.model), None, None)
 
     def __call__(self, Doc tokens):
         cdef int nr_class = self.moves.n_moves
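Parser.load expects the directory it is given to hold a config.json providing
at least the 'labels' and 'features' entries that the old Config object used to
supply, plus an optional 'model' weights file. A hedged usage sketch -- the
deps/ subdirectory and the ArcEager transition system are illustrative choices,
and nlp/doc are assumed to come from an already-loaded pipeline:

    import pathlib
    from spacy.syntax.arc_eager import ArcEager
    from spacy.syntax.parser import Parser

    parser = Parser.load(pathlib.Path('data/en/deps'), nlp.vocab, ArcEager)
    parser(doc)    # sets heads and dependency labels on the Doc in place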
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index 9e4c8ac43..a8d4cf37a 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -18,8 +18,6 @@ from .parts_of_speech cimport VERB, X, PUNCT, EOL, SPACE
 
 from .attrs cimport *
 
-from .util import get_package
-
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 cpdef enum:
     P2_orth
@@ -147,24 +145,21 @@ cdef class Tagger:
         return cls(vocab, model)
 
     @classmethod
-    def load(cls, data_dir, vocab):
-        return cls.from_package(get_package(data_dir), vocab=vocab)
-
-    @classmethod
-    def from_package(cls, pkg, vocab):
-        # TODO: templates.json deprecated? not present in latest package
-        # templates = cls.default_templates()
-        templates = pkg.load_json(('pos', 'templates.json'), default=cls.default_templates())
+    def load(cls, path, vocab):
+        if (path / 'pos' / 'templates.json').exists():
+            with (path / 'pos' / 'templates.json').open() as file_:
+                templates = json.load(file_)
+        else:
+            templates = cls.default_templates()
         model = TaggerModel(templates)
-        if pkg.has_file('pos', 'model'):
-            model.load(pkg.file_path('pos', 'model'))
+        if (path / 'pos' / 'model').exists():
+            model.load(path / 'pos' / 'model')
         return cls(vocab, model)
 
     def __init__(self, Vocab vocab, TaggerModel model):
         self.vocab = vocab
         self.model = model
-        # TODO: Move this to tag map
         self.freqs = {TAG: defaultdict(int)}
         for tag in self.tag_names:
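Tagger.load follows the same pattern, looking for pos/templates.json (falling
back to the built-in default templates) and an optional pos/model weights file
under the given path. A short sketch, again with an illustrative path and an
already-loaded vocab and doc:

    import pathlib
    from spacy.tagger import Tagger

    tagger = Tagger.load(pathlib.Path('data/en'), nlp.vocab)
    tagger(doc)    # writes part-of-speech tags onto the Doc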
diff --git a/spacy/tests/munge/test_align.py b/spacy/tests/munge/test_align.py
index a603c4a74..e3e0a78dc 100644
--- a/spacy/tests/munge/test_align.py
+++ b/spacy/tests/munge/test_align.py
@@ -1,4 +1,4 @@
-from spacy.util import align_tokens
+from spacy.deprecated import align_tokens
 
 
 def test_perfect_align():
diff --git a/spacy/tests/munge/test_detokenize.py b/spacy/tests/munge/test_detokenize.py
index ffc9e2582..5d8a234c6 100644
--- a/spacy/tests/munge/test_detokenize.py
+++ b/spacy/tests/munge/test_detokenize.py
@@ -1,4 +1,4 @@
-from spacy.util import detokenize
+from spacy.deprecated import detokenize
 
 def test_punct():
     tokens = 'Pierre Vinken , 61 years old .'.split()
diff --git a/spacy/tests/test_matcher.py b/spacy/tests/test_matcher.py
index aade59918..5d3b7b916 100644
--- a/spacy/tests/test_matcher.py
+++ b/spacy/tests/test_matcher.py
@@ -16,7 +16,7 @@ def matcher():
         'GoogleNow': ['PRODUCT', {}, [[{'ORTH': 'Google'}, {'ORTH': 'Now'}]]],
         'Java': ['PRODUCT', {}, [[{'LOWER': 'java'}]]],
     }
-    return Matcher(Vocab(get_lex_attr=English.default_lex_attrs()), patterns)
+    return Matcher(Vocab(get_lex_attr=English.Defaults.lex_attr_getters), patterns)
 
 
 def test_compile(matcher):
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 0a2df1bcb..25b592aef 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -1,13 +1,20 @@
 # cython: embedsignature=True
 from __future__ import unicode_literals
 
-from os import path
 import re
+import pathlib
 
 from cython.operator cimport dereference as deref
 from cython.operator cimport preincrement as preinc
 from cpython cimport Py_UNICODE_ISSPACE
+
+try:
+    import ujson as json
+except ImportError:
+    import json
+
+
 from cymem.cymem cimport Pool
 from preshed.maps cimport PreshMap
 
@@ -16,17 +23,53 @@ cimport cython
 
 from . import util
 from .tokens.doc cimport Doc
-from .util import read_lang_data, get_package
 
 
 cdef class Tokenizer:
-    def __init__(self, Vocab vocab, rules, prefix_re, suffix_re, infix_re):
+    @classmethod
+    def load(cls, path, Vocab vocab, rules=None, prefix_search=None, suffix_search=None,
+             infix_finditer=None):
+        '''Load a Tokenizer, reading unsupplied components from the path.
+
+        Arguments:
+            path pathlib.Path (or string, or Path-like)
+            vocab Vocab
+            rules dict
+            prefix_search callable -- Signature of re.compile(string).search
+            suffix_search callable -- Signature of re.compile(string).search
+            infix_finditer callable -- Signature of re.compile(string).finditer
+        '''
+        if isinstance(path, basestring):
+            path = pathlib.Path(path)
+
+        if rules is None:
+            with (path / 'tokenizer' / 'specials.json').open() as file_:
+                rules = json.load(file_)
+        if prefix_search is None:
+            prefix_search = util.read_regex(path / 'tokenizer' / 'prefix.txt').search
+        if suffix_search is None:
+            suffix_search = util.read_regex(path / 'tokenizer' / 'suffix.txt').search
+        if infix_finditer is None:
+            infix_finditer = util.read_regex(path / 'tokenizer' / 'infix.txt').finditer
+        return cls(vocab, rules, prefix_search, suffix_search, infix_finditer)
+
+    def __init__(self, Vocab vocab, rules, prefix_search, suffix_search, infix_finditer):
+        '''Create a Tokenizer, to create Doc objects given unicode text.
+
+        Arguments:
+            vocab Vocab
+            rules dict
+            prefix_search callable -- Signature of re.compile(string).search
+            suffix_search callable -- Signature of re.compile(string).search
+            infix_finditer callable -- Signature of re.compile(string).finditer
+        '''
         self.mem = Pool()
         self._cache = PreshMap()
         self._specials = PreshMap()
-        self._prefix_re = prefix_re
-        self._suffix_re = suffix_re
-        self._infix_re = infix_re
+        self.prefix_search = prefix_search
+        self.suffix_search = suffix_search
+        self.infix_finditer = infix_finditer
         self.vocab = vocab
         self._rules = {}
         for chunk, substrings in sorted(rules.items()):
@@ -40,19 +83,7 @@ cdef class Tokenizer:
                 self._infix_re)
         return (self.__class__, args, None, None)
 
-
-    @classmethod
-    def load(cls, data_dir, Vocab vocab):
-        return cls.from_package(get_package(data_dir), vocab=vocab)
-
-    @classmethod
-    def from_package(cls, package, Vocab vocab):
-        rules, prefix_re, suffix_re, infix_re = read_lang_data(package)
-        prefix_re = re.compile(prefix_re)
-        suffix_re = re.compile(suffix_re)
-        infix_re = re.compile(infix_re)
-        return cls(vocab, rules, prefix_re, suffix_re, infix_re)
-
+
     cpdef Doc tokens_from_list(self, list strings):
         cdef Doc tokens = Doc(self.vocab)
         if sum([len(s) for s in strings]) == 0:
@@ -258,14 +289,14 @@ cdef class Tokenizer:
             self._cache.set(key, cached)
 
     def find_infix(self, unicode string):
-        return list(self._infix_re.finditer(string))
+        return list(self.infix_finditer(string))
 
     def find_prefix(self, unicode string):
-        match = self._prefix_re.search(string)
+        match = self.prefix_search(string)
         return (match.end() - match.start()) if match is not None else 0
 
     def find_suffix(self, unicode string):
-        match = self._suffix_re.search(string)
+        match = self.suffix_search(string)
         return (match.end() - match.start()) if match is not None else 0
 
     def _load_special_tokenization(self, special_cases):
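Because every component of Tokenizer.load can be overridden by keyword, a
caller can swap in a single rule set without touching the rest; anything not
supplied is read from the tokenizer/ subdirectory via util.read_regex (which
this patch still has to provide -- see the sketch after the util.py changes
below). An illustrative override, assuming an already-loaded vocab:

    import re
    from spacy.tokenizer import Tokenizer

    infix_re = re.compile(r'(?<=[a-z])~(?=[a-z])')    # made-up infix rule
    tokenizer = Tokenizer.load('data/en', nlp.vocab, infix_finditer=infix_re.finditer)
    doc = tokenizer(u'spam~eggs on toast')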
diff --git a/spacy/util.py b/spacy/util.py
index 53a584715..5c7480326 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -5,12 +5,6 @@ import re
 import os.path
 import six
 
-import sputnik
-from sputnik.dir_package import DirPackage
-from sputnik.package_list import (PackageNotFoundException,
-                                  CompatiblePackageNotFoundException)
-
-from . import about
 from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE
 
@@ -29,30 +23,6 @@ def get_lang_class(name):
     return LANGUAGES[lang]
 
 
-def get_package(data_dir):
-    if not isinstance(data_dir, six.string_types):
-        raise RuntimeError('data_dir must be a string')
-    return DirPackage(data_dir)
-
-
-def get_package_by_name(name=None, via=None):
-    if name is None:
-        return
-    lang = get_lang_class(name)
-    try:
-        return sputnik.package(about.__title__, about.__version__,
-                               name, data_path=via)
-    except PackageNotFoundException as e:
-        raise RuntimeError("Model '%s' not installed. Please run 'python -m "
-                           "%s.download' to install latest compatible "
-                           "model." % (name, lang.__module__))
-    except CompatiblePackageNotFoundException as e:
-        raise RuntimeError("Installed model is not compatible with spaCy "
-                           "version. Please run 'python -m %s.download "
-                           "--force' to install latest compatible model." %
-                           (lang.__module__))
-
-
 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
         raise ValueError("Stepped slices not supported in Span objects."
@@ -75,100 +45,3 @@ def normalize_slice(length, start, stop, step=None):
 
 def utf8open(loc, mode='r'):
     return io.open(loc, mode, encoding='utf8')
-
-
-def read_lang_data(package):
-    tokenization = package.load_json(('tokenizer', 'specials.json'))
-    with package.open(('tokenizer', 'prefix.txt'), default=None) as file_:
-        prefix = read_prefix(file_) if file_ is not None else None
-    with package.open(('tokenizer', 'suffix.txt'), default=None) as file_:
-        suffix = read_suffix(file_) if file_ is not None else None
-    with package.open(('tokenizer', 'infix.txt'), default=None) as file_:
-        infix = read_infix(file_) if file_ is not None else None
-    return tokenization, prefix, suffix, infix
-
-
-def read_prefix(fileobj):
-    entries = fileobj.read().split('\n')
-    expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()])
-    return expression
-
-
-def read_suffix(fileobj):
-    entries = fileobj.read().split('\n')
-    expression = '|'.join([piece + '$' for piece in entries if piece.strip()])
-    return expression
-
-
-def read_infix(fileobj):
-    entries = fileobj.read().split('\n')
-    expression = '|'.join([piece for piece in entries if piece.strip()])
-    return expression
-
-
-# def read_tokenization(lang):
-#     loc = path.join(DATA_DIR, lang, 'tokenization')
-#     entries = []
-#     seen = set()
-#     with utf8open(loc) as file_:
-#         for line in file_:
-#             line = line.strip()
-#             if line.startswith('#'):
-#                 continue
-#             if not line:
-#                 continue
-#             pieces = line.split()
-#             chunk = pieces.pop(0)
-#             assert chunk not in seen, chunk
-#             seen.add(chunk)
-#             entries.append((chunk, list(pieces)))
-#             if chunk[0].isalpha() and chunk[0].islower():
-#                 chunk = chunk[0].title() + chunk[1:]
-#                 pieces[0] = pieces[0][0].title() + pieces[0][1:]
-#                 seen.add(chunk)
-#                 entries.append((chunk, pieces))
-#     return entries
-
-
-# def read_detoken_rules(lang): # Deprecated?
-#     loc = path.join(DATA_DIR, lang, 'detokenize')
-#     entries = []
-#     with utf8open(loc) as file_:
-#         for line in file_:
-#             entries.append(line.strip())
-#     return entries
-
-
-def align_tokens(ref, indices): # Deprecated, surely?
-    start = 0
-    queue = list(indices)
-    for token in ref:
-        end = start + len(token)
-        emit = []
-        while queue and queue[0][1] <= end:
-            emit.append(queue.pop(0))
-        yield token, emit
-        start = end
-    assert not queue
-
-
-def detokenize(token_rules, words): # Deprecated?
- """To align with treebanks, return a list of "chunks", where a chunk is a - sequence of tokens that are separated by whitespace in actual strings. Each - chunk should be a tuple of token indices, e.g. - - >>> detokenize(["can't", '!'], ["I", "ca", "n't", "!"]) - [(0,), (1, 2, 3)] - """ - string = ' '.join(words) - for subtoks in token_rules: - # Algorithmically this is dumb, but writing a little list-based match - # machine? Ain't nobody got time for that. - string = string.replace(subtoks.replace('', ' '), subtoks) - positions = [] - i = 0 - for chunk in string.split(): - subtoks = chunk.split('') - positions.append(tuple(range(i, i+len(subtoks)))) - i += len(subtoks) - return positions diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index b78861639..a7a9ffe9d 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -19,7 +19,6 @@ from .orth cimport word_shape from .typedefs cimport attr_t from .cfile cimport CFile from .lemmatizer import Lemmatizer -from .util import get_package from . import attrs from . import symbols @@ -28,6 +27,7 @@ from cymem.cymem cimport Address from .serialize.packer cimport Packer from .attrs cimport PROB, LANG + try: import copy_reg except ImportError: @@ -47,30 +47,32 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' @classmethod - def load(cls, data_dir, get_lex_attr=None): - return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr) + def load(cls, path, get_lex_attr=None, vectors=True, lemmatizer=None): + if (path / 'vocab' / 'tag_map.json').exists(): + with (path / 'vocab' / 'tag_map.json').open() as file_: + tag_map = json.loads(file_) + else: + tag_map = {} - @classmethod - def from_package(cls, package, get_lex_attr=None, vectors_package=None): - tag_map = package.load_json(('vocab', 'tag_map.json'), default={}) + if lemmatizer is None: + lemmatizer = Lemmatizer.load(path) - lemmatizer = Lemmatizer.from_package(package) - - serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={}) + if (path / 'vocab' / 'serializer.json').exists(): + with (path / 'vocab' / 'serializer.json').open() as file_: + serializer_freqs = json.loads(file_) + else: + serializer_freqs = {} cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map, lemmatizer=lemmatizer, serializer_freqs=serializer_freqs) - with package.open(('vocab', 'strings.json')) as file_: + with (path / 'vocab' / 'strings.json').open() as file_: self.strings.load(file_) - self.load_lexemes(package.file_path('vocab', 'lexemes.bin')) + self.load_lexemes(path / 'vocab' / 'lexemes.bin') - if vectors_package and vectors_package.has_file('vocab', 'vec.bin'): - self.vectors_length = self.load_vectors_from_bin_loc( - vectors_package.file_path('vocab', 'vec.bin')) - elif package.has_file('vocab', 'vec.bin'): - self.vectors_length = self.load_vectors_from_bin_loc( - package.file_path('vocab', 'vec.bin')) + if vectors is True: + vectors = lambda self_: self_.load_vectors_from_bin_loc(path / 'vocab' / 'vec.bin') + self.vectors_length = vectors(self) return self def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None): @@ -87,6 +89,9 @@ cdef class Vocab: # is the frequency rank of the word, plus a certain offset. The structural # strings are loaded first, because the vocab is open-class, and these # symbols are closed class. + # TODO: Actually this has turned out to be a pain in the ass... + # It means the data is invalidated when we add a symbol :( + # Need to rethink this. 
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index b78861639..a7a9ffe9d 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -19,7 +19,6 @@ from .orth cimport word_shape
 from .typedefs cimport attr_t
 from .cfile cimport CFile
 from .lemmatizer import Lemmatizer
-from .util import get_package
+try:
+    import ujson as json
+except ImportError:
+    import json
 
 from . import attrs
 from . import symbols
@@ -28,6 +27,7 @@ from cymem.cymem cimport Address
 from .serialize.packer cimport Packer
 from .attrs cimport PROB, LANG
 
+
 try:
     import copy_reg
 except ImportError:
@@ -47,30 +47,32 @@ cdef class Vocab:
     '''A map container for a language's LexemeC structs.
     '''
     @classmethod
-    def load(cls, data_dir, get_lex_attr=None):
-        return cls.from_package(get_package(data_dir), get_lex_attr=get_lex_attr)
+    def load(cls, path, get_lex_attr=None, vectors=True, lemmatizer=None):
+        if (path / 'vocab' / 'tag_map.json').exists():
+            with (path / 'vocab' / 'tag_map.json').open() as file_:
+                tag_map = json.load(file_)
+        else:
+            tag_map = {}
 
-    @classmethod
-    def from_package(cls, package, get_lex_attr=None, vectors_package=None):
-        tag_map = package.load_json(('vocab', 'tag_map.json'), default={})
+        if lemmatizer is None:
+            lemmatizer = Lemmatizer.load(path)
 
-        lemmatizer = Lemmatizer.from_package(package)
-
-        serializer_freqs = package.load_json(('vocab', 'serializer.json'), default={})
+        if (path / 'vocab' / 'serializer.json').exists():
+            with (path / 'vocab' / 'serializer.json').open() as file_:
+                serializer_freqs = json.load(file_)
+        else:
+            serializer_freqs = {}
 
         cdef Vocab self = cls(get_lex_attr=get_lex_attr, tag_map=tag_map,
                               lemmatizer=lemmatizer, serializer_freqs=serializer_freqs)
 
-        with package.open(('vocab', 'strings.json')) as file_:
+        with (path / 'vocab' / 'strings.json').open() as file_:
             self.strings.load(file_)
-        self.load_lexemes(package.file_path('vocab', 'lexemes.bin'))
+        self.load_lexemes(path / 'vocab' / 'lexemes.bin')
 
-        if vectors_package and vectors_package.has_file('vocab', 'vec.bin'):
-            self.vectors_length = self.load_vectors_from_bin_loc(
-                vectors_package.file_path('vocab', 'vec.bin'))
-        elif package.has_file('vocab', 'vec.bin'):
-            self.vectors_length = self.load_vectors_from_bin_loc(
-                package.file_path('vocab', 'vec.bin'))
+        if vectors is True:
+            vectors = lambda self_: self_.load_vectors_from_bin_loc(path / 'vocab' / 'vec.bin')
+        self.vectors_length = vectors(self)
         return self
 
     def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None):
@@ -87,6 +89,9 @@ cdef class Vocab:
         # is the frequency rank of the word, plus a certain offset. The structural
         # strings are loaded first, because the vocab is open-class, and these
         # symbols are closed class.
+        # TODO: Actually this has turned out to be a pain in the ass...
+        # It means the data is invalidated when we add a symbol :(
+        # Need to rethink this.
         for name in symbols.NAMES + list(sorted(tag_map.keys())):
             if name:
                 _ = self.strings[name]
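Taken together, the load() classmethods above define a per-model directory with
vocab/, tokenizer/ and pos/ subdirectories, plus patterns.json for the matcher
and a config.json/model pair for the parser wherever its Defaults factory
points. A rough end-to-end sketch of wiring the pieces up by hand, with an
illustrative path:

    import pathlib
    from spacy.en import English
    from spacy.vocab import Vocab
    from spacy.tokenizer import Tokenizer
    from spacy.tagger import Tagger
    from spacy.matcher import Matcher

    path = pathlib.Path('data/en')
    vocab = Vocab.load(path, get_lex_attr=English.Defaults.lex_attr_getters)
    tokenizer = Tokenizer.load(path, vocab)
    tagger = Tagger.load(path, vocab)
    matcher = Matcher.load(path, vocab)

    doc = tokenizer(u'This is a sentence.')
    tagger(doc)
    matches = matcher(doc)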