Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-25 01:16:28 +03:00
* Tests passing except for morphology/lemmatization stuff
This commit is contained in:
parent
cf8d26c3d2
commit
73f200436f
@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):


 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None

-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
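For orientation, a minimal usage sketch of the renamed keyword arguments (pos_tag= becomes tag=); the sentence is borrowed from the test fixtures below, and the default data directory is whatever ships under spacy/en/data:

    from spacy.en import English

    nlp = English(tag=True, parse=False)   # tagger is built internally from the vocab's StringStore; no parser
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True)
    print(len(tokens), tokens[0].string)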
@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore


 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer
@@ -1,10 +1,13 @@
 from os import path
+import json

 from thinc.typedefs cimport atom_t

 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer


 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (


 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                           Lemmatizer(path.join(data_dir, 'wordnet'),
+                                                      NOUN, VERB, ADJ))

     def __call__(self, Tokens tokens):
         cdef int i
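A sketch of constructing the tagger directly, mirroring the call in English.__init__ above; the data_dir value is illustrative and assumes the package's bundled data layout:

    from os import path
    import spacy.en
    from spacy.vocab import Vocab
    from spacy.en.pos import EnPosTagger

    data_dir = path.join(path.dirname(spacy.en.__file__), 'data')
    vocab = Vocab(data_dir=data_dir, get_lex_props=spacy.en.get_lex_props)
    tagger = EnPosTagger(vocab.strings, data_dir)   # reads pos/config.json and builds its own Morphologizer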
@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
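The Morphologizer now hands the POS code straight to the lemmatizer instead of choosing between noun/verb/adj methods itself. spacy/en/lemmatizer.py is not part of this diff, so the __call__ sketched below is an assumption about the interface it must expose (placeholder rules, not the real WordNet logic):

    class Lemmatizer(object):
        """Illustrative stand-in for spacy/en/lemmatizer.py."""
        def __init__(self, wn_dir, noun, verb, adj):
            # wn_dir would point at the WordNet data; noun/verb/adj are the POS ids
            # passed in from pos.pyx (NOUN, VERB, ADJ)
            self.wn_dir = wn_dir
            self.noun_id, self.verb_id, self.adj_id = noun, verb, adj

        def noun(self, string):
            return set([string.rstrip('s')])   # placeholder rule

        def verb(self, string):
            return set([string])               # placeholder

        def adj(self, string):
            return set([string])               # placeholder

        def __call__(self, string, pos):
            # single entry point used by Morphologizer.lemmatize() above
            if pos == self.noun_id:
                return self.noun(string)
            elif pos == self.verb_id:
                return self.verb(string)
            else:
                return self.adj(string)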
@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)


-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
-
-
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size

     cdef PreshMap _map
@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'


-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
         self._string_to_id[string] = id_
         self._id_to_string.append(string)
         return id_
+"""


 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()

     property size:
         def __get__(self):
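With the _SymbolMap registries removed from StringStore, POS and dependency tag names no longer live on the string store; the tagger keeps its own StringStore of tag names instead (see EnPosTagger above). Per the updated tests, a tag string is recovered as, for example:

    tag_str = EN.tagger.tags[tokens[0].pos]   # 'NNP' in the tagged fixture below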
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
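setup_model_dir now persists the tag map alongside the tag names and templates; this is the pos/config.json that EnPosTagger.__init__ reads back above. A call sketch with placeholder values (the module housing setup_model_dir and the (pos_id, props) encoding of tag_map entries are assumptions inferred from `pos, props = tag_map[tag]` in Morphologizer.__init__):

    from spacy.tagger import setup_model_dir   # module path assumed
    from spacy.en.pos import POS_TEMPLATES     # defined in spacy/en/pos.pyx

    tag_names = ['NN', 'VBZ']                  # placeholder subset of the English tag set
    tag_map = {'NN': (0, {}), 'VBZ': (1, {})}  # placeholder (pos_id, props) pairs
    setup_model_dir(tag_names, tag_map, POS_TEMPLATES, 'data/en/pos')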
@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
+    cdef int lemma
     cdef public int head
     cdef public int dep_tag

     cdef public atom_t id
     cdef public atom_t cluster
@@ -158,7 +158,8 @@ cdef class Token:

     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]

     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
@@ -12,23 +12,21 @@ cdef class Vocab:

     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props

-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))

     def __len__(self):
         return self.lexemes.size()
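Vocab.from_dir folds into the constructor: passing data_dir loads strings and lexemes from disk, passing nothing yields an empty store. A minimal sketch (paths illustrative):

    from spacy.vocab import Vocab
    from spacy.en import get_lex_props

    empty = Vocab(get_lex_props=get_lex_props)                               # no data_dir: just EMPTY_LEXEME
    loaded = Vocab(data_dir='spacy/en/data', get_lex_props=get_lex_props)    # loads strings + lexemes
    print(len(empty), len(loaded))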
@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest


 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)

 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens

@@ -5,7 +5,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)


 def test_possess(EN):
@@ -6,7 +6,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)

 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
@@ -2,7 +2,7 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import EN
+from spacy.en import English


 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN


 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1
@@ -4,7 +4,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)

 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest

 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *


 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)


 def test_is_alpha(EN):
@@ -8,7 +8,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)


 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
     }

 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1


 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']


-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str


-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"


-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p


-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']


-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str


-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str


-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str


-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"
@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()


-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3


-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3


-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4


-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5


-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2


-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2


-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4
@@ -3,13 +3,18 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import EN
+from spacy.en import English


-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'), ('[', ']'), ('{', '}'), ('*', '*')]


-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_


-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_
@@ -8,7 +8,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)

 def test_single_word(EN):
     tokens = EN(u'hello')
@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

-def test1():
+
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'
@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English


-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()


-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']


-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']


-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+
+
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']
@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English
 import pytest


-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+
+
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2


-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '


-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3


-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
     assert len(tokens) == 3


-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3


-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
     assert len(tokens) == 3