From fd65cf6cbb620f2a7c7d6b456e695df9807abaf3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sat, 24 Sep 2016 20:26:17 +0200 Subject: [PATCH] Finish refactoring data loading --- spacy/__init__.py | 31 +++++------ spacy/cfile.pyx | 8 ++- spacy/de/__init__.py | 21 +++++--- spacy/deprecated.py | 17 ------ spacy/en/__init__.py | 58 ++++++++++---------- spacy/language.py | 29 ++++++---- spacy/lemmatizer.py | 27 ++++++---- spacy/matcher.pyx | 6 ++- spacy/syntax/parser.pxd | 1 + spacy/syntax/parser.pyx | 4 +- spacy/tagger.pyx | 15 +++--- spacy/tests/serialize/test_packer.py | 12 ++--- spacy/tests/tagger/test_lemmatizer.py | 21 ++++---- spacy/tokenizer.pxd | 6 +-- spacy/tokenizer.pyx | 7 +-- spacy/util.py | 77 +++++++++++++++++++++++++++ spacy/vocab.pyx | 21 ++++---- 17 files changed, 220 insertions(+), 141 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index e8ad5dde3..da038a9fe 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -7,31 +7,26 @@ from . import de from . import zh -_data_path = pathlib.Path(__file__).parent / 'data' - set_lang_class(en.English.lang, en.English) set_lang_class(de.German.lang, de.German) set_lang_class(zh.Chinese.lang, zh.Chinese) -def get_data_path(): - return _data_path - - -def set_data_path(path): - global _data_path - if isinstance(path, basestring): - path = pathlib.Path(path) - _data_path = path - - -def load(name, vocab=None, tokenizer=None, parser=None, tagger=None, entity=None, - matcher=None, serializer=None, vectors=None, via=None): +def load(name, vocab=True, tokenizer=True, parser=True, tagger=True, entity=True, + matcher=True, serializer=True, vectors=True, via=None): if via is None: - via = get_data_path() - cls = get_lang_class(name) + via = util.get_data_path() + + target_name, target_version = util.split_data_name(name) + path = util.match_best_version(target_name, target_version, via) + + if isinstance(vectors, basestring): + vectors_name, vectors_version = util.split_data_name(vectors) + vectors = util.match_best_version(vectors_name, vectors_version, via) + + cls = get_lang_class(target_name) return cls( - via, + path, vectors=vectors, vocab=vocab, tokenizer=tokenizer, diff --git a/spacy/cfile.pyx b/spacy/cfile.pyx index 781759466..5a9c3850e 100644 --- a/spacy/cfile.pyx +++ b/spacy/cfile.pyx @@ -2,15 +2,19 @@ from libc.stdio cimport fopen, fclose, fread, fwrite, FILE cdef class CFile: - def __init__(self, loc, mode): + def __init__(self, loc, mode, on_open_error=None): if isinstance(mode, unicode): mode_str = mode.encode('ascii') else: mode_str = mode + loc = str(loc) cdef bytes bytes_loc = loc.encode('utf8') if type(loc) == unicode else loc self.fp = fopen(bytes_loc, mode_str) if self.fp == NULL: - raise IOError("Could not open binary file %s" % bytes_loc) + if on_open_error is not None: + on_open_error() + else: + raise IOError("Could not open binary file %s" % bytes_loc) self.is_open = True def __dealloc__(self): diff --git a/spacy/de/__init__.py b/spacy/de/__init__.py index 8a2f809ff..5a3ad953c 100644 --- a/spacy/de/__init__.py +++ b/spacy/de/__init__.py @@ -3,15 +3,22 @@ from __future__ import unicode_literals, print_function from os import path from ..language import Language +from ..vocab import Vocab +from ..attrs import LANG class German(Language): lang = 'de' + + class Defaults(Language.Defaults): + def Vocab(self, vectors=None, lex_attr_getters=None): + if lex_attr_getters is None: + lex_attr_getters = dict(self.lex_attr_getters) + if vectors is None: + vectors = self.Vectors() + # set a dummy lemmatizer 
for now that simply returns the same string + # until the morphology is done for German + return Vocab.load(self.path, get_lex_attr=lex_attr_getters, vectors=vectors, + lemmatizer=False) - @classmethod - def default_vocab(cls, package, get_lex_attr=None, vectors_package=None): - vocab = super(German,cls).default_vocab(package,get_lex_attr,vectors_package) - # set a dummy lemmatizer for now that simply returns the same string - # until the morphology is done for German - vocab.morphology.lemmatizer = lambda string,pos: set([string]) - return vocab + stop_words = set() diff --git a/spacy/deprecated.py b/spacy/deprecated.py index 2f9109772..d75354f9c 100644 --- a/spacy/deprecated.py +++ b/spacy/deprecated.py @@ -43,23 +43,6 @@ def read_lang_data(package): return tokenization, prefix, suffix, infix -def read_prefix(fileobj): - entries = fileobj.read().split('\n') - expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) - return expression - - -def read_suffix(fileobj): - entries = fileobj.read().split('\n') - expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) - return expression - - -def read_infix(fileobj): - entries = fileobj.read().split('\n') - expression = '|'.join([piece for piece in entries if piece.strip()]) - return expression - def align_tokens(ref, indices): # Deprecated, surely? start = 0 diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py index b261b6c20..4ffed67b4 100644 --- a/spacy/en/__init__.py +++ b/spacy/en/__init__.py @@ -5,37 +5,35 @@ from os import path from ..language import Language -# improved list from Stone, Denis, Kwantes (2010) -STOPWORDS = """ -a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be -became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can -cannot cant co computer con could couldnt cry de describe -detail did didn do does doesn doing don done down due during -each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen -fify fill find fire first five for former formerly forty found four from front full further get give go -had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie -if in inc indeed interest into is it its itself keep last latter latterly least less ltd -just -kg km -made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely -neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off -often on once one only onto or other others otherwise our ours ourselves out over own part per -perhaps please put rather re -quite -rather really regarding -same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten -than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under -until up unless upon us used using -various very very via -was we well were what whatever when whence whenever where whereafter whereas whereby 
wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you -your yours yourself yourselves -""" -STOPWORDS = set(w for w in STOPWORDS.split() if w) - class English(Language): lang = 'en' - @staticmethod - def is_stop(string): - return 1 if string.lower() in STOPWORDS else 0 + class Defaults(Language.Defaults): + lex_attr_getters = dict(Language.Defaults.lex_attr_getters) + + # improved list from Stone, Denis, Kwantes (2010) + stop_words = set(""" + a about above across after afterwards again against all almost alone along already also although always am among amongst amoungst amount an and another any anyhow anyone anything anyway anywhere are around as at back be + became because become becomes becoming been before beforehand behind being below beside besides between beyond bill both bottom but by call can + cannot cant co computer con could couldnt cry de describe + detail did didn do does doesn doing don done down due during + each eg eight either eleven else elsewhere empty enough etc even ever every everyone everything everywhere except few fifteen + fify fill find fire first five for former formerly forty found four from front full further get give go + had has hasnt have he hence her here hereafter hereby herein hereupon hers herself him himself his how however hundred i ie + if in inc indeed interest into is it its itself keep last latter latterly least less ltd + just + kg km + made make many may me meanwhile might mill mine more moreover most mostly move much must my myself name namely + neither never nevertheless next nine no nobody none noone nor not nothing now nowhere of off + often on once one only onto or other others otherwise our ours ourselves out over own part per + perhaps please put rather re + quite + rather really regarding + same say see seem seemed seeming seems serious several she should show side since sincere six sixty so some somehow someone something sometime sometimes somewhere still such system take ten + than that the their them themselves then thence there thereafter thereby therefore therein thereupon these they thick thin third this those though three through throughout thru thus to together too top toward towards twelve twenty two un under + until up unless upon us used using + various very very via + was we well were what whatever when whence whenever where whereafter whereas whereby wherein whereupon wherever whether which while whither who whoever whole whom whose why will with within without would yet you + your yours yourself yourselves + """.split()) diff --git a/spacy/language.py b/spacy/language.py index 2e0a3f022..2c7c496b8 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1,4 +1,5 @@ from __future__ import absolute_import +from __future__ import unicode_literals from warnings import warn import pathlib @@ -17,36 +18,38 @@ from . import attrs from . import orth from .syntax.ner import BiluoPushDown from .syntax.arc_eager import ArcEager +from . 
import util -from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD +from .attrs import TAG, DEP, ENT_IOB, ENT_TYPE, HEAD, PROB, LANG, IS_STOP -class Defaults(object): +class BaseDefaults(object): def __init__(self, lang, path): - self.lang = lang self.path = path + self.lang = lang self.lex_attr_getters = dict(self.__class__.lex_attr_getters) if (self.path / 'vocab' / 'oov_prob').exists(): with (self.path / 'vocab' / 'oov_prob').open() as file_: oov_prob = file_.read().strip() - self.lex_attr_getters['PROB'] = lambda string: oov_prob - self.lex_attr_getters['LANG'] = lambda string: self.lang, + self.lex_attr_getters[PROB] = lambda string: oov_prob + self.lex_attr_getters[LANG] = lambda string: lang + self.lex_attr_getters[IS_STOP] = lambda string: string in self.stop_words def Vectors(self): - pass + return True def Vocab(self, vectors=None, lex_attr_getters=None): if lex_attr_getters is None: lex_attr_getters = dict(self.lex_attr_getters) if vectors is None: vectors = self.Vectors() - return Vocab.load(self.path, get_lex_attr=get_lex_attr, vectors=vectors) + return Vocab.load(self.path, get_lex_attr=self.lex_attr_getters, vectors=vectors) def Tokenizer(self, vocab): return Tokenizer.load(self.path, vocab) def Tagger(self, vocab): - return Tagger.load(self.path, self.vocab) + return Tagger.load(self.path / 'pos', vocab) def Parser(self, vocab): if (self.path / 'deps').exists(): @@ -74,6 +77,9 @@ class Defaults(object): ner_labels = {0: {'PER': True, 'LOC': True, 'ORG': True, 'MISC': True}} + + stop_words = set() + lex_attr_getters = { attrs.LOWER: lambda string: string.lower(), attrs.NORM: lambda string: string, @@ -101,11 +107,12 @@ class Defaults(object): } + class Language(object): '''A text-processing pipeline. Usually you'll load this once per process, and pass the instance around your program. 
''' - Defaults = Defaults + Defaults = BaseDefaults lang = None def __init__(self, @@ -144,6 +151,8 @@ class Language(object): path = data_dir if isinstance(path, basestring): path = pathlib.Path(path) + if path is None: + path = util.match_best_version(self.lang, '', util.get_data_path()) self.path = path defaults = defaults if defaults is not True else self.get_defaults(self.path) @@ -256,4 +265,4 @@ class Language(object): def get_defaults(self, path): - return Defaults(self.lang, path) + return self.Defaults(self.lang, path) diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py index 486fa8c7f..210f41c9d 100644 --- a/spacy/lemmatizer.py +++ b/spacy/lemmatizer.py @@ -1,6 +1,6 @@ from __future__ import unicode_literals, print_function -from os import path import codecs +import pathlib try: import ujson as json @@ -12,19 +12,24 @@ from .parts_of_speech import NOUN, VERB, ADJ, PUNCT class Lemmatizer(object): @classmethod - def load(cls, via): - return cls.from_package(get_package(via)) - - @classmethod - def from_package(cls, pkg): + def load(cls, path): index = {} exc = {} for pos in ['adj', 'noun', 'verb']: - with pkg.open(('wordnet', 'index.%s' % pos), default=None) as file_: - index[pos] = read_index(file_) if file_ is not None else set() - with pkg.open(('wordnet', '%s.exc' % pos), default=None) as file_: - exc[pos] = read_exc(file_) if file_ is not None else {} - rules = pkg.load_json(('vocab', 'lemma_rules.json'), default={}) + pos_index_path = path / 'wordnet' / 'index.{pos}'.format(pos=pos) + if pos_index_path.exists(): + with pos_index_path.open() as file_: + index[pos] = read_index(file_) + else: + index[pos] = set() + pos_exc_path = path / 'wordnet' / '{pos}.exc'.format(pos=pos) + if pos_exc_path.exists(): + with pos_exc_path.open() as file_: + exc[pos] = read_exc(file_) + else: + exc[pos] = {} + with (path / 'vocab' / 'lemma_rules.json').open() as file_: + rules = json.load(file_) return cls(index, exc, rules) def __init__(self, index, exceptions, rules): diff --git a/spacy/matcher.pyx b/spacy/matcher.pyx index d4d695379..3307eb864 100644 --- a/spacy/matcher.pyx +++ b/spacy/matcher.pyx @@ -197,9 +197,11 @@ cdef class Matcher: @classmethod def load(cls, path, vocab): - if (path / 'patterns.json').exists(): - with (path / 'patterns.json').open() as file_: + if (path / 'gazetteer.json').exists(): + with (path / 'gazetteer.json').open() as file_: patterns = json.load(file_) + else: + patterns = {} return cls(vocab, patterns) def __init__(self, vocab, patterns={}): diff --git a/spacy/syntax/parser.pxd b/spacy/syntax/parser.pxd index 2856cccc9..2c3a106b3 100644 --- a/spacy/syntax/parser.pxd +++ b/spacy/syntax/parser.pxd @@ -17,5 +17,6 @@ cdef class Parser: cdef readonly Vocab vocab cdef readonly ParserModel model cdef readonly TransitionSystem moves + cdef readonly object cfg cdef int parseC(self, TokenC* tokens, int length, int nr_feat, int nr_class) nogil diff --git a/spacy/syntax/parser.pyx b/spacy/syntax/parser.pyx index 2eaadbc86..0419c339d 100644 --- a/spacy/syntax/parser.pyx +++ b/spacy/syntax/parser.pyx @@ -81,12 +81,12 @@ cdef class Parser: @classmethod def load(cls, path, Vocab vocab, moves_class): with (path / 'config.json').open() as file_: - cfg = json.loads(file_) + cfg = json.load(file_) moves = moves_class(vocab.strings, cfg['labels']) templates = get_templates(cfg['features']) model = ParserModel(templates) if (path / 'model').exists(): - model.load(path / 'model') + model.load(str(path / 'model')) return cls(vocab, moves, model, **cfg) def __init__(self, Vocab 
vocab, transition_system, ParserModel model, **cfg): diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx index a8d4cf37a..64b3e9cc2 100644 --- a/spacy/tagger.pyx +++ b/spacy/tagger.pyx @@ -1,5 +1,5 @@ import json -from os import path +import pathlib from collections import defaultdict from libc.string cimport memset @@ -102,10 +102,6 @@ cdef inline void _fill_from_token(atom_t* context, const TokenC* t) nogil: cdef class Tagger: """A part-of-speech tagger for English""" - @classmethod - def read_config(cls, data_dir): - return json.load(open(path.join(data_dir, 'pos', 'config.json'))) - @classmethod def default_templates(cls): return ( @@ -146,15 +142,16 @@ cdef class Tagger: @classmethod def load(cls, path, vocab): - if (path / 'pos' / 'templates.json').exists(): - with (path / 'pos' / 'templates.json').open() as file_: + path = path if not isinstance(path, basestring) else pathlib.Path(path) + if (path / 'templates.json').exists(): + with (path / 'templates.json').open() as file_: templates = json.load(file_) else: templates = cls.default_templates() model = TaggerModel(templates) - if (path / 'pos' / 'model').exists(): - model.load(path / 'pos' / 'model') + if (path / 'model').exists(): + model.load(str(path / 'model')) return cls(vocab, model) def __init__(self, Vocab vocab, TaggerModel model): diff --git a/spacy/tests/serialize/test_packer.py b/spacy/tests/serialize/test_packer.py index 1f78da974..0eb24a08d 100644 --- a/spacy/tests/serialize/test_packer.py +++ b/spacy/tests/serialize/test_packer.py @@ -22,13 +22,13 @@ from spacy.serialize.bits import BitArray @pytest.fixture def vocab(): - data_dir = os.environ.get('SPACY_DATA') - if data_dir is None: - package = util.get_package_by_name('en') + path = os.environ.get('SPACY_DATA') + if path is None: + path = util.match_best_version('en', None, util.get_data_path()) else: - package = util.get_package(data_dir) + path = util.match_best_version('en', None, path) - vocab = English.default_vocab(package=package) + vocab = English.Defaults('en', path).Vocab() lex = vocab['dog'] assert vocab[vocab.strings['dog']].orth_ == 'dog' lex = vocab['the'] @@ -40,7 +40,7 @@ def vocab(): @pytest.fixture def tokenizer(vocab): null_re = re.compile(r'!!!!!!!!!') - tokenizer = Tokenizer(vocab, {}, null_re, null_re, null_re) + tokenizer = Tokenizer(vocab, {}, null_re.search, null_re.search, null_re.finditer) return tokenizer diff --git a/spacy/tests/tagger/test_lemmatizer.py b/spacy/tests/tagger/test_lemmatizer.py index aaf698fa3..8cde74052 100644 --- a/spacy/tests/tagger/test_lemmatizer.py +++ b/spacy/tests/tagger/test_lemmatizer.py @@ -11,29 +11,26 @@ import pytest @pytest.fixture -def package(): - data_dir = os.environ.get('SPACY_DATA') - if data_dir is None: - return util.get_package_by_name('en') - else: - return util.get_package(data_dir) +def path(): + return util.match_best_version('en', None, + os.environ.get('SPACY_DATA', util.get_data_path())) @pytest.fixture -def lemmatizer(package): - return Lemmatizer.from_package(package) +def lemmatizer(path): + return Lemmatizer.load(path) -def test_read_index(package): - with package.open(('wordnet', 'index.noun')) as file_: +def test_read_index(path): + with (path / 'wordnet' / 'index.noun').open() as file_: index = read_index(file_) assert 'man' in index assert 'plantes' not in index assert 'plant' in index -def test_read_exc(package): - with package.open(('wordnet', 'verb.exc')) as file_: +def test_read_exc(path): + with (path / 'wordnet' / 'verb.exc').open() as file_: exc = read_exc(file_) assert 
exc['was'] == ('be',) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 2fc192d12..e53b7dbd1 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -16,9 +16,9 @@ cdef class Tokenizer: cdef PreshMap _specials cpdef readonly Vocab vocab - cdef object _prefix_re - cdef object _suffix_re - cdef object _infix_re + cdef public object prefix_search + cdef public object suffix_search + cdef public object infix_finditer cdef object _rules cpdef Doc tokens_from_list(self, list strings) diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx index 25b592aef..fcf1a2cbd 100644 --- a/spacy/tokenizer.pyx +++ b/spacy/tokenizer.pyx @@ -46,11 +46,11 @@ cdef class Tokenizer: with (path / 'tokenizer' / 'specials.json').open() as file_: rules = json.load(file_) if prefix_search is None: - prefix_search = util.read_regex(path / 'tokenizer' / 'prefix.txt').search + prefix_search = util.read_prefix_regex(path / 'tokenizer' / 'prefix.txt').search if suffix_search is None: - suffix_search = util.read_regex(path / 'tokenizer' / 'suffix.txt').search + suffix_search = util.read_suffix_regex(path / 'tokenizer' / 'suffix.txt').search if infix_finditer is None: - infix_finditer = util.read_regex(path / 'tokenizer' / 'infix.txt').finditer + infix_finditer = util.read_infix_regex(path / 'tokenizer' / 'infix.txt').finditer return cls(vocab, rules, prefix_search, suffix_search, infix_finditer) @@ -297,6 +297,7 @@ cdef class Tokenizer: def find_suffix(self, unicode string): match = self.suffix_search(string) + print("Suffix", match, string) return (match.end() - match.start()) if match is not None else 0 def _load_special_tokenization(self, special_cases): diff --git a/spacy/util.py b/spacy/util.py index 5c7480326..1e48e5840 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -3,12 +3,14 @@ import io import json import re import os.path +import pathlib import six from .attrs import TAG, HEAD, DEP, ENT_IOB, ENT_TYPE LANGUAGES = {} +_data_path = pathlib.Path(__file__).parent / 'data' def set_lang_class(name, cls): @@ -23,6 +25,81 @@ def get_lang_class(name): return LANGUAGES[lang] +def get_data_path(): + return _data_path + + +def set_data_path(path): + global _data_path + if isinstance(path, basestring): + path = pathlib.Path(path) + _data_path = path + + + +def match_best_version(target_name, target_version, path): + path = path if not isinstance(path, basestring) else pathlib.Path(path) + matches = [] + for data_name in path.iterdir(): + name, version = split_data_name(data_name.parts[-1]) + if name == target_name and constraint_match(target_version, version): + matches.append((tuple(float(v) for v in version.split('.')), data_name)) + if matches: + return pathlib.Path(max(matches)[1]) + else: + return None + + +def split_data_name(name): + return name.split('-', 1) if '-' in name else (name, '') + + +def constraint_match(constraint_string, version): + # From http://github.com/spacy-io/sputnik + if not constraint_string: + return True + + constraints = [c.strip() for c in constraint_string.split(',') if c.strip()] + + for c in constraints: + if not re.match(r'[><=][=]?\d+(\.\d+)*', c): + raise ValueError('invalid constraint: %s' % c) + + return all(semver.match(version, c) for c in constraints) + + +def read_regex(path): + path = path if not isinstance(path, basestring) else pathlib.Path(path) + with path.open() as file_: + entries = file_.read().split('\n') + expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + return re.compile(expression) + + +def read_prefix_regex(path): 
+ path = path if not isinstance(path, basestring) else pathlib.Path(path) + with path.open() as file_: + entries = file_.read().split('\n') + expression = '|'.join(['^' + re.escape(piece) for piece in entries if piece.strip()]) + return re.compile(expression) + + +def read_suffix_regex(path): + path = path if not isinstance(path, basestring) else pathlib.Path(path) + with path.open() as file_: + entries = file_.read().split('\n') + expression = '|'.join([piece + '$' for piece in entries if piece.strip()]) + return re.compile(expression) + + +def read_infix_regex(path): + path = path if not isinstance(path, basestring) else pathlib.Path(path) + with path.open() as file_: + entries = file_.read().split('\n') + expression = '|'.join([piece for piece in entries if piece.strip()]) + return re.compile(expression) + + def normalize_slice(length, start, stop, step=None): if not (step is None or step == 1): raise ValueError("Stepped slices not supported in Span objects." diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx index a7a9ffe9d..f061dd233 100644 --- a/spacy/vocab.pyx +++ b/spacy/vocab.pyx @@ -47,19 +47,21 @@ cdef class Vocab: '''A map container for a language's LexemeC structs. ''' @classmethod - def load(cls, path, get_lex_attr=None, vectors=True, lemmatizer=None): + def load(cls, path, get_lex_attr=None, vectors=True, lemmatizer=True): if (path / 'vocab' / 'tag_map.json').exists(): with (path / 'vocab' / 'tag_map.json').open() as file_: - tag_map = json.loads(file_) + tag_map = json.load(file_) else: tag_map = {} - if lemmatizer is None: + if lemmatizer is True: lemmatizer = Lemmatizer.load(path) + elif not lemmatizer: + lemmatizer = lambda string, pos: set((string,)) if (path / 'vocab' / 'serializer.json').exists(): with (path / 'vocab' / 'serializer.json').open() as file_: - serializer_freqs = json.loads(file_) + serializer_freqs = json.load(file_) else: serializer_freqs = {} @@ -72,7 +74,8 @@ cdef class Vocab: if vectors is True: vectors = lambda self_: self_.load_vectors_from_bin_loc(path / 'vocab' / 'vec.bin') - self.vectors_length = vectors(self) + if vectors: + self.vectors_length = vectors(self) return self def __init__(self, get_lex_attr=None, tag_map=None, lemmatizer=None, serializer_freqs=None): @@ -101,6 +104,7 @@ cdef class Vocab: self.length = 1 self._serializer = None + print("Vocab lang", self.lang) property serializer: def __get__(self): @@ -113,7 +117,7 @@ cdef class Vocab: def __get__(self): langfunc = None if self.get_lex_attr: - langfunc = self.get_lex_attr.get(LANG,None) + langfunc = self.get_lex_attr.get(LANG, None) return langfunc('_') if langfunc else '' def __len__(self): @@ -261,9 +265,8 @@ cdef class Vocab: fp.close() def load_lexemes(self, loc): - if not path.exists(loc): - raise IOError('LexemeCs file not found at %s' % loc) - fp = CFile(loc, 'rb') + fp = CFile(loc, 'rb', + on_open_error=lambda: IOError('LexemeCs file not found at %s' % loc)) cdef LexemeC* lexeme cdef hash_t key cdef unicode py_str
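How the new data loading resolves a model directory, in rough Python. This is a paraphrase of the helpers this patch adds to spacy/util.py, not part of the patch itself: the example directory names (data/en-1.0.0, data/en-1.1.0) are illustrative, and the prefix check stands in for the semver-style operators that the patched constraint_match actually applies.

    import pathlib

    def split_data_name(name):
        # "en-1.1.0" -> ("en", "1.1.0"); a bare "en" keeps an empty version
        return name.split('-', 1) if '-' in name else (name, '')

    def match_best_version(target_name, target_version, data_path):
        # pick the highest-versioned directory whose name matches the target
        matches = []
        for entry in pathlib.Path(data_path).iterdir():
            name, version = split_data_name(entry.parts[-1])
            if name == target_name and (not target_version or version.startswith(target_version)):
                matches.append((tuple(float(v) for v in version.split('.') if v), entry))
        return max(matches)[1] if matches else None

    # e.g. with data/en-1.0.0 and data/en-1.1.0 on disk:
    #   path = match_best_version('en', '', 'data')   # -> data/en-1.1.0
    # spacy.load('en') passes the resolved path to the Language subclass.

With the path resolved this way, each Language subclass carries a Defaults class (BaseDefaults) whose Vocab, Tokenizer, Tagger, and Parser methods build the pipeline components from that directory, so German can override Vocab to disable the lemmatizer while English only overrides stop_words.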