From 73f200436f577d3c0b4ef73139e48c2b043d9381 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Tue, 23 Dec 2014 11:40:32 +1100
Subject: [PATCH] * Tests passing except for morphology/lemmatization stuff

---
 spacy/en/__init__.py           | 26 +++++-----------
 spacy/en/pos.pxd               |  3 ++
 spacy/en/pos.pyx               | 15 ++++++++--
 spacy/morphology.pyx           | 16 ++++------
 spacy/strings.pxd              |  7 -----
 spacy/strings.pyx              |  6 ++--
 spacy/tagger.pyx               |  3 +-
 spacy/tokens.pxd               |  1 +
 spacy/tokens.pyx               |  5 ++--
 spacy/vocab.pyx                | 20 ++++++-------
 tests/test_add_lemmas.py       | 10 +++++--
 tests/test_contractions.py     |  2 +-
 tests/test_emoticons.py        |  2 +-
 tests/test_infix.py            |  7 +++--
 tests/test_iter_lexicon.py     |  2 +-
 tests/test_lexeme_flags.py     |  4 +--
 tests/test_morph_exceptions.py |  6 ++--
 tests/test_only_punct.py       |  8 +++--
 tests/test_post_punct.py       | 25 +++++++++-------
 tests/test_pre_punct.py        | 23 ++++++++------
 tests/test_special_affix.py    | 55 ++++++++++++++++++----------------
 tests/test_string_loading.py   | 13 +++++---
 tests/test_surround_punct.py   | 15 ++++++----
 tests/test_tokenizer.py        |  2 +-
 tests/test_tokens_from_list.py | 13 ++++++--
 tests/test_vocab.py            | 32 ++++++++++++--------
 tests/test_whitespace.py       | 31 +++++++++++--------
 27 files changed, 197 insertions(+), 155 deletions(-)

diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index d17eaf61b..563d0b1b7 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):
 
 
 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None
 
-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
diff --git a/spacy/en/pos.pxd b/spacy/en/pos.pxd
index 99c83d795..63cd906fc 100644
--- a/spacy/en/pos.pxd
+++ b/spacy/en/pos.pxd
@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore
 
 
 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer
diff --git a/spacy/en/pos.pyx b/spacy/en/pos.pyx
index 6b002f9a8..4484207ae 100644
--- a/spacy/en/pos.pyx
+++ b/spacy/en/pos.pyx
@@ -1,10 +1,13 @@
 from os import path
+import json
+
 from thinc.typedefs cimport atom_t
 
 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer
 
 
 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (
 
 
 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                           Lemmatizer(path.join(data_dir, 'wordnet'),
+                                                      NOUN, VERB, ADJ))
 
     def __call__(self, Tokens tokens):
         cdef int i
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index 7f9df80da..66495be04 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
diff --git a/spacy/strings.pxd b/spacy/strings.pxd
index 00a5fbf66..9c16cfe1c 100644
--- a/spacy/strings.pxd
+++ b/spacy/strings.pxd
@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, (s.n * sizeof(Py_UNICODE)), 0)
 
 
-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
-
-
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size
 
     cdef PreshMap _map
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index 903256874..67d375ed7 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'
 
 
-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
         self._string_to_id[string] = id_
         self._id_to_string.append(string)
         return id_
+"""
 
 
 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()
 
     property size:
         def __get__(self):
diff --git a/spacy/tagger.pyx b/spacy/tagger.pyx
index a4aae827f..a718d04ec 100644
--- a/spacy/tagger.pyx
+++ b/spacy/tagger.pyx
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats
 
 
-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 5ee5d01f7..35796d8e0 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
     cdef int lemma
     cdef public int head
+    cdef public int dep_tag
 
     cdef public atom_t id
     cdef public atom_t cluster
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 5e81c4a4e..16f9d3c20 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -158,7 +158,8 @@ cdef class Token:
 
     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]
+
     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 3ab1005f6..a34d3560f 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -12,23 +12,21 @@ cdef class Vocab:
     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
 
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props
 
-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))
 
     def __len__(self):
         return self.lexemes.size()
diff --git a/tests/test_add_lemmas.py b/tests/test_add_lemmas.py
index cf7cb1c7f..5e6fdca61 100644
--- a/tests/test_add_lemmas.py
+++ b/tests/test_add_lemmas.py
@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest
 
+
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 
 
 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens
diff --git a/tests/test_contractions.py b/tests/test_contractions.py
index 78a56f67f..5c5e5c7c7 100644
--- a/tests/test_contractions.py
+++ b/tests/test_contractions.py
@@ -5,7 +5,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 
 
 def test_possess(EN):
diff --git a/tests/test_emoticons.py b/tests/test_emoticons.py
index 10346b252..52b51906a 100644
--- a/tests/test_emoticons.py
+++ b/tests/test_emoticons.py
@@ -6,7 +6,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 
 
 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
diff --git a/tests/test_infix.py b/tests/test_infix.py
index a1eeadd65..d52996e33 100644
--- a/tests/test_infix.py
+++ b/tests/test_infix.py
@@ -2,7 +2,7 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN
 
 
 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1
diff --git a/tests/test_iter_lexicon.py b/tests/test_iter_lexicon.py
index e36c645b4..c03292187 100644
--- a/tests/test_iter_lexicon.py
+++ b/tests/test_iter_lexicon.py
@@ -4,7 +4,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)
 
 
 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py
index 3ab123039..b2fdee957 100644
--- a/tests/test_lexeme_flags.py
+++ b/tests/test_lexeme_flags.py
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest
 
 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *
 
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)
 
 
 def test_is_alpha(EN):
diff --git a/tests/test_morph_exceptions.py b/tests/test_morph_exceptions.py
index f60cb5683..71c6f59f1 100644
--- a/tests/test_morph_exceptions.py
+++ b/tests/test_morph_exceptions.py
@@ -8,7 +8,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)
 
 
 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
     }
 
 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'
diff --git a/tests/test_only_punct.py b/tests/test_only_punct.py
index f2c558cc7..384ad6332 100644
--- a/tests/test_only_punct.py
+++ b/tests/test_only_punct.py
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1
 
 
 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2
diff --git a/tests/test_post_punct.py b/tests/test_post_punct.py
index da80f5636..f61759609 100644
--- a/tests/test_post_punct.py
+++ b/tests/test_post_punct.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']
 
 
-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str
 
 
-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"
 
 
-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p
 
 
-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1
diff --git a/tests/test_pre_punct.py b/tests/test_pre_punct.py
index 557655330..2eb6fdd3d 100644
--- a/tests/test_pre_punct.py
+++ b/tests/test_pre_punct.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']
 
 
-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str
 
 
-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str
 
 
-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str
 
 
-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"
diff --git a/tests/test_special_affix.py b/tests/test_special_affix.py
index ec1765368..63ee3eb01 100644
--- a/tests/test_special_affix.py
+++ b/tests/test_special_affix.py
@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()
 
-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3
 
-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3
 
-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4
 
-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5
 
-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2
 
-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2
 
-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4
diff --git a/tests/test_string_loading.py b/tests/test_string_loading.py
index e2fa2429a..a5021856f 100644
--- a/tests/test_string_loading.py
+++ b/tests/test_string_loading.py
@@ -3,13 +3,18 @@ from __future__ import unicode_literals
 
 import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
 
-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'
diff --git a/tests/test_surround_punct.py b/tests/test_surround_punct.py
index b7be782f2..0c816ad8f 100644
--- a/tests/test_surround_punct.py
+++ b/tests/test_surround_punct.py
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]
 
 
-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_
 
 
-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index e3f4aff0e..f11d19f8f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -8,7 +8,7 @@ from spacy.en import English
 
 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)
 
 
 def test_single_word(EN):
     tokens = EN(u'hello')
diff --git a/tests/test_tokens_from_list.py b/tests/test_tokens_from_list.py
index eef00f403..e72c355fa 100644
--- a/tests/test_tokens_from_list.py
+++ b/tests/test_tokens_from_list.py
@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
+import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
-def test1():
+
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index daaabd33d..f0640e633 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
+import pytest
 
-from spacy.en import EN
+from spacy.en import English
 
-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()
 
-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']
 
-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']
 
-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+
+
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']
diff --git a/tests/test_whitespace.py b/tests/test_whitespace.py
index dc943664e..f4246a53c 100644
--- a/tests/test_whitespace.py
+++ b/tests/test_whitespace.py
@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals
 
-from spacy.en import EN
+from spacy.en import English
 
 import pytest
 
 
-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+
+
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2
 
 
-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '
 
 
-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
    assert len(tokens) == 3
 
 
-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3
 
 
-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
     assert len(tokens) == 3
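
Usage note (not part of the commit): the sketch below is inferred from the calls the updated tests make, and assumes the bundled spacy/en/data models (including the pos and wordnet subdirectories) are installed. Names like nlp are placeholders.

    # Rough sketch of the API after this patch, based on the updated tests.
    from spacy.en import English

    # The module-level EN singleton and the pos_tag= keyword are gone:
    # build an instance and pass tag= / parse= instead.
    nlp = English(tag=True, parse=False)

    # Calling the instance tokenizes and, when enabled, POS-tags in one pass.
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True)

    # token.pos is now an integer id; the tagger's own StringStore maps it
    # back to the Treebank tag string.
    assert nlp.tagger.tags[tokens[0].pos] == 'NNP'

    # Pre-tokenized input now goes through the tokenizer object directly.
    tokens = nlp.tokenizer.tokens_from_list([u'JAPAN', u'GET', u'LUCKY'])
    assert len(tokens) == 3

The same pattern, an English instance built per test via a pytest fixture, replaces the old module-level EN import throughout the test suite, and Vocab(data_dir=...) replaces the removed Vocab.from_dir() classmethod.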