From 73f200436f577d3c0b4ef73139e48c2b043d9381 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 23 Dec 2014 11:40:32 +1100
Subject: [PATCH] * Tests passing except for morphology/lemmatization stuff

---
 spacy/en/__init__.py           | 26 +++++-----------
 spacy/en/pos.pxd               |  3 ++
 spacy/en/pos.pyx               | 15 ++++++++--
 spacy/morphology.pyx           | 16 ++++------
 spacy/strings.pxd              |  7 -----
 spacy/strings.pyx              |  6 ++--
 spacy/tagger.pyx               |  3 +-
 spacy/tokens.pxd               |  1 +
 spacy/tokens.pyx               |  5 ++--
 spacy/vocab.pyx                | 20 ++++++-------
 tests/test_add_lemmas.py       | 10 +++++--
 tests/test_contractions.py     |  2 +-
 tests/test_emoticons.py        |  2 +-
 tests/test_infix.py            |  7 +++--
 tests/test_iter_lexicon.py     |  2 +-
 tests/test_lexeme_flags.py     |  4 +--
 tests/test_morph_exceptions.py |  6 ++--
 tests/test_only_punct.py       |  8 +++--
 tests/test_post_punct.py       | 25 +++++++++-------
 tests/test_pre_punct.py        | 23 ++++++++------
 tests/test_special_affix.py    | 55 ++++++++++++++++++----------------
 tests/test_string_loading.py   | 13 +++++---
 tests/test_surround_punct.py   | 15 ++++++----
 tests/test_tokenizer.py        |  2 +-
 tests/test_tokens_from_list.py | 13 ++++++--
 tests/test_vocab.py            | 32 ++++++++++++--------
 tests/test_whitespace.py       | 31 +++++++++++--------
 27 files changed, 197 insertions(+), 155 deletions(-)

diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index d17eaf61b..563d0b1b7 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):
 
 
 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None
 
-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd
index 99c83d795..63cd906fc 100644
--- a/spacy/en/pos.pxd
+++ b/spacy/en/pos.pxd
@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore
 
 
 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 6b002f9a8..4484207ae 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -1,10 +1,13 @@
 from os import path
+import json
+
 from thinc.typedefs cimport atom_t
 
 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer
 
 
 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (
 
 
 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                           Lemmatizer(path.join(data_dir, 'wordnet'),
+                                                      NOUN, VERB, ADJ))
 
     def __call__(self, Tokens tokens):
         cdef int i
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 7f9df80da..66495be04 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 00a5fbf66..9c16cfe1c 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, (s.n * sizeof(Py_UNICODE)), 0)
 
 
-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
-
-
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size
 
     cdef PreshMap _map
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 903256874..67d375ed7 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
 
 
-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
         self._string_to_id[string] = id_
         self._id_to_string.append(string)
         return id_
+"""
 
 
 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()
 
     property size:
         def __get__(self):
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index a4aae827f..a718d04ec 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats
 
 
-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 5ee5d01f7..35796d8e0 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
     cdef int lemma
     cdef public int head
+    cdef public int dep_tag
 
     cdef public atom_t id
     cdef public atom_t cluster
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 5e81c4a4e..16f9d3c20 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -158,7 +158,8 @@ cdef class Token:
 
     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]
+
     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3ab1005f6..a34d3560f 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -12,23 +12,21 @@ cdef class Vocab:
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
 
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props
 
-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))
 
     def __len__(self):
         return self.lexemes.size()
diff --git a/tests/test_add_lemmas.py b/tests/test_add_lemmas.py
index cf7cb1c7f..5e6fdca61 100644
--- a/tests/test_add_lemmas.py
+++ b/tests/test_add_lemmas.py
@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest
 
+
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 
 
 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index 78a56f67f..5c5e5c7c7 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -5,7 +5,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 
 
 def test_possess(EN):
diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py
index 10346b252..52b51906a 100644
--- a/tests/test_emoticons.py
+++ b/tests/test_emoticons.py
@@ -6,7 +6,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 
 
 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
diff --git a/tests/test_infix.py b/tests/test_infix.py
index a1eeadd65..d52996e33 100644
--- a/tests/test_infix.py
+++ b/tests/test_infix.py
@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN
 
 
 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1
diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py
index e36c645b4..c03292187 100644
--- a/tests/test_iter_lexicon.py
+++ b/tests/test_iter_lexicon.py
@@ -4,7 +4,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)
 
 
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py
index 3ab123039..b2fdee957 100644
--- a/tests/test_lexeme_flags.py
+++ b/tests/test_lexeme_flags.py
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest
 
 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *
 
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 
 
 def test_is_alpha(EN):
diff --git a/tests/test_morph_exceptions.py b/tests/test_morph_exceptions.py
index f60cb5683..71c6f59f1 100644
--- a/tests/test_morph_exceptions.py
+++ b/tests/test_morph_exceptions.py
@@ -8,7 +8,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 
 
 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
     }
 
 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'
diff --git a/tests/test_only_punct.py b/tests/test_only_punct.py
index f2c558cc7..384ad6332 100644
--- a/tests/test_only_punct.py
+++ b/tests/test_only_punct.py
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1
 
 
 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2
diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index da80f5636..f61759609 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1
diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index 557655330..2eb6fdd3d 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str
 
 
-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str
 
 
-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str
 
 
-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"
diff --git a/tests/test_special_affix.py b/tests/test_special_affix.py
index ec1765368..63ee3eb01 100644
--- a/tests/test_special_affix.py
+++ b/tests/test_special_affix.py
@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()
 
-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3
 
-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3
 
-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4
 
-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5
 
-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2
 
-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2
 
-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4
diff --git a/tests/test_string_loading.py b/tests/test_string_loading.py
index e2fa2429a..a5021856f 100644
--- a/tests/test_string_loading.py
+++ b/tests/test_string_loading.py
@@ -3,13 +3,18 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'
diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index b7be782f2..0c816ad8f 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_
 
 
-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index e3f4aff0e..f11d19f8f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -8,7 +8,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)
 
 
 def test_single_word(EN):
     tokens = EN(u'hello')
diff --git a/tests/test_tokens_from_list.py b/tests/test_tokens_from_list.py
index eef00f403..e72c355fa 100644
--- a/tests/test_tokens_from_list.py
+++ b/tests/test_tokens_from_list.py
@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
+import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
-def test1():
+
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index daaabd33d..f0640e633 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
+import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()
 
-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']
 
-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']
 
-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+
+
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']
diff --git a/tests/test_whitespace.py b/tests/test_whitespace.py
index dc943664e..f4246a53c 100644
--- a/tests/test_whitespace.py
+++ b/tests/test_whitespace.py
@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
 
-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+
+
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '
 
 
-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
    assert len(tokens) == 3
 
 
-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
     assert len(tokens) == 3
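
Usage note (not part of the commit): the sketch below is inferred from the calls the updated tests make, and assumes the bundled spacy/en/data models (including the pos and wordnet subdirectories) are installed. Names like nlp are placeholders.

    # Rough sketch of the API after this patch, based on the updated tests.
    from spacy.en import English

    # The module-level EN singleton and the pos_tag= keyword are gone:
    # build an instance and pass tag= / parse= instead.
    nlp = English(tag=True, parse=False)

    # Calling the instance tokenizes and, when enabled, POS-tags in one pass.
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True)

    # token.pos is now an integer id; the tagger's own StringStore maps it
    # back to the Treebank tag string.
    assert nlp.tagger.tags[tokens[0].pos] == 'NNP'

    # Pre-tokenized input now goes through the tokenizer object directly.
    tokens = nlp.tokenizer.tokens_from_list([u'JAPAN', u'GET', u'LUCKY'])
    assert len(tokens) == 3

The same pattern, an English instance built per test via a pytest fixture, replaces the old module-level EN import throughout the test suite, and Vocab(data_dir=...) replaces the removed Vocab.from_dir() classmethod.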