* Tests passing except for morphology/lemmatization stuff

parent cf8d26c3d2
commit 73f200436f
@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):


 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None

-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
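Note: a minimal usage sketch of the renamed keyword arguments this hunk
introduces (the sample sentence is borrowed from the test fixtures below;
everything else is taken directly from the code in this hunk):

    from spacy.en import English

    # Loads the vocab and tokenizer, plus the tagger/parser when requested.
    nlp = English(tag=True, parse=False)

    # __call__ tokenizes, then applies whichever components were loaded.
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True, parse=False)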
@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore


 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer
@@ -1,10 +1,13 @@
 from os import path
+import json

 from thinc.typedefs cimport atom_t

 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer

+
 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (


 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                 Lemmatizer(path.join(data_dir, 'wordnet'),
+                                            NOUN, VERB, ADJ))

     def __call__(self, Tokens tokens):
         cdef int i
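Note: the cfg loaded here is the config.json that setup_model_dir (in the
tagger hunk below) now writes. A sketch of its assumed shape; the concrete
values are illustrative, not taken from the repository:

    # {
    #     "templates": [...],                   # feature templates
    #     "tag_names": ["CC", "CD", ...],       # fine-grained tag strings
    #     "tag_map": {"NN": [..., {...}], ...}  # tag -> (coarse POS, props)
    # }
    cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))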
@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
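Note: the Lemmatizer is now invoked as a callable with the string and its
coarse POS. A hypothetical sketch of the __call__ this presumably adds,
mirroring the deleted noun/verb/adj branches (the attribute names here are
invented for illustration; pos.pyx above passes NOUN, VERB and ADJ into the
Lemmatizer constructor):

    # Hypothetical Lemmatizer.__call__, dispatching on the POS ids it was
    # constructed with:
    def __call__(self, string, pos):
        if pos == self.noun_id:
            return self.noun(string)
        elif pos == self.verb_id:
            return self.verb(string)
        else:
            assert pos == self.adj_id
            return self.adj(string)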
@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)


-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
-
-
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size

     cdef PreshMap _map
@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'


-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
                 self._string_to_id[string] = id_
                 self._id_to_string.append(string)
             return id_
+"""


 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()

     property size:
         def __get__(self):
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
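Note: a short sketch of calling the extended setup_model_dir signature. The
tag_map entries are illustrative, not repository values; the unpacking in
morphology.pyx above (pos, props = tag_map[tag]) is what fixes each value as
a (coarse POS, properties) pair:

    tag_map = {'NN': (NOUN, {}), 'VBZ': (VERB, {})}   # illustrative only
    setup_model_dir(sorted(tag_map), tag_map, POS_TEMPLATES, 'data/pos')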
@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
     cdef int lemma
     cdef public int head
+    cdef public int dep_tag

     cdef public atom_t id
     cdef public atom_t cluster
@@ -158,7 +158,8 @@ cdef class Token:
     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]

     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
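Note: with this hunk, Token.pos returns the raw integer tag id rather than a
string looked up in the removed pos_tags SymbolMap. Decoding now goes through
the tagger's StringStore of tags, as the updated tagger test below does:

    # EN is an English instance; tokens[0].pos is now an int tag id.
    tag_string = EN.tagger.tags[tokens[0].pos]   # e.g. 'NNP'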
@@ -12,23 +12,21 @@ cdef class Vocab:

     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props

-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))

     def __len__(self):
         return self.lexemes.size()
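Note: the Vocab constructor now subsumes the deleted from_dir classmethod; a
minimal sketch of both construction paths implied by this hunk:

    # In-memory vocab (no data files), as before:
    vocab = Vocab(get_lex_props=get_lex_props)

    # Loading from disk, replacing Vocab.from_dir(data_dir, get_lex_props=...):
    vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)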
@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest


 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)


 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens
+
@@ -5,7 +5,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)


 def test_possess(EN):
@@ -6,7 +6,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)

 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
@@ -2,7 +2,7 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import EN
+from spacy.en import English


 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN


 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1
@@ -4,7 +4,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)

 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest

 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *


 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)


 def test_is_alpha(EN):
@@ -8,7 +8,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)


 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
            }

 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1


 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']


-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str


-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"


-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p


-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']


-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str


-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str


-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str


-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"
@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()


-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3


-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3


-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4


-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5


-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2


-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2


-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4
@@ -3,13 +3,18 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import EN
+from spacy.en import English


-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')]


-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_


-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_
@@ -8,7 +8,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)

 def test_single_word(EN):
     tokens = EN(u'hello')
@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
+import pytest

-from spacy.en import EN
+from spacy.en import English
+
+
+@pytest.fixture
+def EN():
+    return English()

-def test1():
+
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'
@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
+import pytest

-from spacy.en import EN
+from spacy.en import English


-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()


-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']


-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']


-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+
+
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']
@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English
 import pytest


-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+
+
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2


-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '


-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3


-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
     assert len(tokens) == 3


-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3


-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
     assert len(tokens) == 3