* Tests passing except for morphology/lemmatization stuff

Matthew Honnibal 2014-12-23 11:40:32 +11:00
parent cf8d26c3d2
commit 73f200436f
27 changed files with 197 additions and 155 deletions

View File

@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):
 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None
-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
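Taken together, the two hunks above slim the English pipeline down to a constructor with tag/parse switches (replacing pos_tag) and conditional-expression assignments for the tagger and parser. A minimal usage sketch of the post-commit API, assuming the bundled data directory is installed at its default location; the sample sentence is purely illustrative:

    from spacy.en import English

    # tag/parse replace the old pos_tag/parse keyword arguments.
    nlp = English(tag=True, parse=False)

    # Calling the pipeline tokenizes, then tags; the parser stays off here.
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True, parse=False)
    for token in tokens:
        print(token.string)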

View File

@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore
 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer

View File

@@ -1,10 +1,13 @@
 from os import path
+import json
 from thinc.typedefs cimport atom_t
 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer
 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (
 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                           Lemmatizer(path.join(data_dir, 'wordnet'),
+                                                      NOUN, VERB, ADJ))
     def __call__(self, Tokens tokens):
         cdef int i
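The rewritten constructor pulls everything it needs from pos/config.json instead of receiving a pre-built Morphologizer: it interns the sorted tag names into its own StringStore and builds the Morphologizer (and its Lemmatizer) itself. A plain-Python paraphrase of the loading step, stdlib only; load_tagger_config is a hypothetical helper for illustration, not part of the diff:

    import json
    from os import path

    def load_tagger_config(data_dir):
        # pos/config.json now carries both the tag inventory and the
        # tag -> (coarse POS, morphological features) map written by setup_model_dir.
        with open(path.join(data_dir, 'pos', 'config.json')) as file_:
            cfg = json.load(file_)
        tag_names = sorted(cfg['tag_names'])  # sorted, matching the StringStore fill order
        tag_map = cfg['tag_map']
        return tag_names, tag_map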

View File

@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
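The three pos-specific branches collapse into a single self.lemmatizer(py_string, pos) call, so the dispatch on part of speech now lives inside the Lemmatizer itself. The lemmatizer module is not part of this diff, so the sketch below is only a guess at its shape: a __call__ that routes to per-POS methods keyed on the integer codes passed to its constructor, returning a set of candidate lemmas for the caller to sort:

    class Lemmatizer(object):
        def __init__(self, wn_dir, noun_code, verb_code, adj_code):
            # wn_dir would point at the WordNet data; the codes are the
            # NOUN/VERB/ADJ constants cimported from spacy.typedefs.
            self.wn_dir = wn_dir
            self.table = {noun_code: self.noun, verb_code: self.verb, adj_code: self.adj}

        def __call__(self, string, pos):
            return self.table[pos](string)

        def noun(self, string):
            return set([string])  # real WordNet/rule lookups elided

        def verb(self, string):
            return set([string])

        def adj(self, string):
            return set([string])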

View File

@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)
-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size
     cdef PreshMap _map

View File

@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
         self._string_to_id[string] = id_
         self._id_to_string.append(string)
         return id_
+"""
 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()
     property size:
         def __get__(self):

View File

@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats
-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
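setup_model_dir now serializes the tag_map next to tag_names and the feature templates, which is exactly what the new EnPosTagger.__init__ reads back. An illustrative shape for the resulting config.json; the tag names are real Penn tags, but the feature dicts are invented placeholders, and the real file presumably stores the integer POS codes used by the Cython layer rather than the readable names shown here:

    import json

    config = {
        'templates': [],  # feature templates, elided
        'tag_names': ['IN', 'NN', 'NNS', 'VBP'],
        'tag_map': {
            # tag -> [coarse POS, morphological properties]
            'NN':  ['NOUN', {}],
            'NNS': ['NOUN', {'number': 'plur'}],
            'VBP': ['VERB', {'tense': 'pres'}],
            'IN':  ['ADP',  {}],
        },
    }
    print(json.dumps(config, indent=4, sort_keys=True))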

View File

@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
     cdef int lemma
     cdef public int head
+    cdef public int dep_tag
     cdef public atom_t id
     cdef public atom_t cluster

View File

@@ -158,7 +158,8 @@ cdef class Token:
     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]
     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
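With pos_tags and dep_tags gone from StringStore, Token.pos now hands back the raw integer tag id; the human-readable tag is recovered through the tagger's own tag store, which is how the updated tagging test reads it. A short sketch, assuming the default English data is available:

    from spacy.en import English

    nlp = English(tag=True, parse=False)
    tokens = nlp(u'Bananas in pyjamas are geese.')

    # token.pos is an integer id after this commit; the tagger's StringStore
    # of tag names maps it back to the Penn tag string.
    for token in tokens:
        print('%s\t%s' % (token.string, nlp.tagger.tags[token.pos]))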

View File

@@ -12,23 +12,21 @@ cdef class Vocab:
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props
-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))
     def __len__(self):
         return self.lexemes.size()
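Vocab.from_dir is folded into the constructor: passing data_dir now loads the string table and lexemes in one step, while omitting it yields an empty vocabulary. A usage sketch; the path is illustrative, and get_lex_props is the lexeme-property function that English passes through:

    from os import path
    from spacy.vocab import Vocab
    from spacy.en import get_lex_props

    data_dir = path.join('spacy', 'en', 'data')  # illustrative location of the packaged data

    # New style: construction and loading in one call.
    vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
    print(len(vocab))

    # Old style, removed by this commit:
    # vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)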

View File

@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens

View File

@@ -5,7 +5,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 def test_possess(EN):

View File

@@ -6,7 +6,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""

View File

@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN
 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1

View File

@@ -4,7 +4,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):

View File

@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest
 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 def test_is_alpha(EN):

View File

@@ -8,7 +8,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
     }
 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'

View File

@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1
 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']
-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']
-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str
-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str
-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str
-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"

View File

@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()
-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3
-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3
-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4
-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5
-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2
-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2
-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4

View File

@@ -3,13 +3,18 @@ from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'

View File

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_
-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_

View File

@@ -8,7 +8,7 @@ from spacy.en import English
 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)
 def test_single_word(EN):
     tokens = EN(u'hello')

View File

@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test1():
+@pytest.fixture
+def EN():
+    return English()
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'

View File

@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
 import pytest
-from spacy.en import EN
+from spacy.en import English
-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()
-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']
-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']
-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']

View File

@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
-from spacy.en import EN
+from spacy.en import English
 import pytest
-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2
-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '
-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3
-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
     assert len(tokens) == 3
-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3
-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
    assert len(tokens) == 3