* Tests passing except for morphology/lemmatization stuff

parent cf8d26c3d2
commit 73f200436f
@@ -6,8 +6,6 @@ from ..vocab import Vocab
 from ..tokenizer import Tokenizer
 from ..syntax.parser import GreedyParser
 from ..tokens import Tokens
-from ..morphology import Morphologizer
-from .lemmatizer import Lemmatizer
 from .pos import EnPosTagger
 from .pos import POS_TAGS
 from .attrs import get_flags
@@ -18,28 +16,18 @@ def get_lex_props(string):


 class English(object):
-    def __init__(self, data_dir=None, pos_tag=True, parse=False):
+    def __init__(self, data_dir=None, tag=True, parse=False):
         if data_dir is None:
             data_dir = path.join(path.dirname(__file__), 'data')
-        self.vocab = Vocab.from_dir(data_dir, get_lex_props=get_lex_props)
-        for pos_str in POS_TAGS:
-            _ = self.vocab.strings.pos_tags[pos_str]
+        self.vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)
         self.tokenizer = Tokenizer.from_dir(self.vocab, data_dir)
-        if pos_tag:
-            morph = Morphologizer(self.vocab.strings, POS_TAGS,
-                                  Lemmatizer(path.join(data_dir, 'wordnet')))
-            self.pos_tagger = EnPosTagger(data_dir, morph)
-        else:
-            self.pos_tagger = None
-        if parse:
-            self.parser = GreedyParser(data_dir)
-        else:
-            self.parser = None
+        self.tagger = EnPosTagger(self.vocab.strings, data_dir) if tag else None
+        self.parser = GreedyParser(data_dir) if parse else None

-    def __call__(self, text, pos_tag=True, parse=True):
+    def __call__(self, text, tag=True, parse=True):
         tokens = self.tokenizer.tokenize(text)
-        if self.pos_tagger and pos_tag:
-            self.pos_tagger(tokens)
+        if self.tagger and tag:
+            self.tagger(tokens)
         if self.parser and parse:
             self.parser.parse(tokens)
         return tokens
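Note: a minimal usage sketch of the renamed keyword arguments this hunk
introduces (the sample sentence is borrowed from the test fixtures below;
everything else is taken directly from the code in this hunk):

    from spacy.en import English

    # Loads the vocab and tokenizer, plus the tagger/parser when requested.
    nlp = English(tag=True, parse=False)

    # __call__ tokenizes, then applies whichever components were loaded.
    tokens = nlp(u'Bananas in pyjamas are geese.', tag=True, parse=False)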
@@ -1,6 +1,9 @@
 from ..tagger cimport Tagger
 from ..morphology cimport Morphologizer
+from ..strings cimport StringStore


 cdef class EnPosTagger(Tagger):
+    cdef readonly StringStore strings
+    cdef readonly StringStore tags
     cdef readonly Morphologizer morphologizer
@@ -1,10 +1,13 @@
 from os import path
+import json

 from thinc.typedefs cimport atom_t

 from ..typedefs cimport NO_TAG, ADJ, ADV, ADP, CONJ, DET, NOUN, NUM, PRON, PRT, VERB
 from ..typedefs cimport X, PUNCT, EOL
 from ..structs cimport TokenC, Morphology
 from ..tokens cimport Tokens
+from .lemmatizer import Lemmatizer

+
 cpdef enum en_person_t:
@@ -192,10 +195,18 @@ POS_TEMPLATES = (


 cdef class EnPosTagger(Tagger):
-    def __init__(self, data_dir, morphologizer):
+    def __init__(self, StringStore strings, data_dir):
         model_dir = path.join(data_dir, 'pos')
         Tagger.__init__(self, path.join(model_dir))
-        self.morphologizer = morphologizer
+        self.strings = strings
+        cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))
+        self.tags = StringStore()
+        for tag in sorted(cfg['tag_names']):
+            _ = self.tags[tag]
+        self.morphologizer = Morphologizer(self.strings, cfg['tag_names'],
+                                           cfg['tag_map'],
+                                 Lemmatizer(path.join(data_dir, 'wordnet'),
+                                            NOUN, VERB, ADJ))

     def __call__(self, Tokens tokens):
         cdef int i
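Note: the cfg loaded here is the config.json that setup_model_dir (in the
tagger hunk below) now writes. A sketch of its assumed shape; the concrete
values are illustrative, not taken from the repository:

    # {
    #     "templates": [...],                   # feature templates
    #     "tag_names": ["CC", "CD", ...],       # fine-grained tag strings
    #     "tag_map": {"NN": [..., {...}], ...}  # tag -> (coarse POS, props)
    # }
    cfg = json.load(open(path.join(data_dir, 'pos', 'config.json')))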
@@ -35,15 +35,15 @@ cdef struct _Cached:
 cdef class Morphologizer:
     """Given a POS tag and a Lexeme, find its lemma and morphological analysis.
     """
-    def __init__(self, StringStore strings, object tag_map, object lemmatizer,
-                 irregulars=None):
+    def __init__(self, StringStore strings, object tag_names, object tag_map,
+                 object lemmatizer, irregulars=None):
         self.mem = Pool()
         self.strings = strings
         self.lemmatizer = lemmatizer
-        cdef int n_tags = len(self.strings.pos_tags) + 1
+        cdef int n_tags = len(tag_names) + 1
         self._cache = PreshMapArray(n_tags)
         self.tags = <PosTag*>self.mem.alloc(n_tags, sizeof(PosTag))
-        for tag, i in self.strings.pos_tags:
+        for i, tag in enumerate(sorted(tag_names)):
             pos, props = tag_map[tag]
             self.tags[i].id = i
             self.tags[i].pos = pos
@@ -65,13 +65,7 @@ cdef class Morphologizer:
         cdef bytes py_string = self.strings[lex.sic]
         cdef set lemma_strings
         cdef bytes lemma_string
-        if pos == NOUN:
-            lemma_strings = self.lemmatizer.noun(py_string)
-        elif pos == VERB:
-            lemma_strings = self.lemmatizer.verb(py_string)
-        else:
-            assert pos == ADJ
-            lemma_strings = self.lemmatizer.adj(py_string)
+        lemma_strings = self.lemmatizer(py_string, pos)
         lemma_string = sorted(lemma_strings)[0]
         lemma = self.strings.intern(lemma_string, len(lemma_string)).i
         return lemma
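Note: the Lemmatizer is now invoked as a callable with the string and its
coarse POS. A hypothetical sketch of the __call__ this presumably adds,
mirroring the deleted noun/verb/adj branches (the attribute names here are
invented for illustration; pos.pyx above passes NOUN, VERB and ADJ into the
Lemmatizer constructor):

    # Hypothetical Lemmatizer.__call__, dispatching on the POS ids it was
    # constructed with:
    def __call__(self, string, pos):
        if pos == self.noun_id:
            return self.noun(string)
        elif pos == self.verb_id:
            return self.verb(string)
        else:
            assert pos == self.adj_id
            return self.adj(string)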
@@ -11,16 +11,9 @@ cdef inline void slice_unicode(UniStr* s, Py_UNICODE* chars, int start, int end)
     s.key = hash64(s.chars, <int>(s.n * sizeof(Py_UNICODE)), 0)


-cdef class _SymbolMap:
-    cdef dict _string_to_id
-    cdef list _id_to_string
-
-
 cdef class StringStore:
     cdef Pool mem
     cdef Utf8Str* strings
-    cdef readonly _SymbolMap pos_tags
-    cdef readonly _SymbolMap dep_tags
     cdef size_t size

     cdef PreshMap _map
@@ -9,7 +9,8 @@ from .typedefs cimport hash_t
 SEPARATOR = '\n|-SEP-|\n'


-cdef class _SymbolMap:
+"""
+cdef class SymbolMap:
     def __init__(self):
         self._string_to_id = {'': 0}
         self._id_to_string = ['']
@@ -38,6 +39,7 @@ cdef class _SymbolMap:
                 self._string_to_id[string] = id_
                 self._id_to_string.append(string)
             return id_
+"""


 cdef class StringStore:
@@ -47,8 +49,6 @@ cdef class StringStore:
         self._resize_at = 10000
         self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
-        self.pos_tags = _SymbolMap()
-        self.dep_tags = _SymbolMap()

     property size:
         def __get__(self):
@@ -12,13 +12,14 @@ import cython
 from thinc.features cimport Feature, count_feats


-def setup_model_dir(tag_names, templates, model_dir):
+def setup_model_dir(tag_names, tag_map, templates, model_dir):
     if path.exists(model_dir):
         shutil.rmtree(model_dir)
     os.mkdir(model_dir)
     config = {
         'templates': templates,
         'tag_names': tag_names,
+        'tag_map': tag_map
     }
     with open(path.join(model_dir, 'config.json'), 'w') as file_:
         json.dump(config, file_)
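Note: a short sketch of calling the extended setup_model_dir signature. The
tag_map entries are illustrative, not repository values; the unpacking in
morphology.pyx above (pos, props = tag_map[tag]) is what fixes each value as
a (coarse POS, properties) pair:

    tag_map = {'NN': (NOUN, {}), 'VBZ': (VERB, {})}   # illustrative only
    setup_model_dir(sorted(tag_map), tag_map, POS_TEMPLATES, 'data/pos')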
@@ -43,6 +43,7 @@ cdef class Token:
     cdef readonly int dep_id
     cdef int lemma
     cdef public int head
+    cdef public int dep_tag

     cdef public atom_t id
     cdef public atom_t cluster
@@ -158,7 +158,8 @@ cdef class Token:
     property dep:
         def __get__(self):
-            return self.string_store.dep_tags[self.dep]
+            return self.string_store.dep_tags[self.dep_id]

     property pos:
         def __get__(self):
-            return self.string_store.pos_tags[self.pos]
+            return self.pos_id
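Note: with this hunk, Token.pos returns the raw integer tag id rather than a
string looked up in the removed pos_tags SymbolMap. Decoding now goes through
the tagger's StringStore of tags, as the updated tagger test below does:

    # EN is an English instance; tokens[0].pos is now an int tag id.
    tag_string = EN.tagger.tags[tokens[0].pos]   # e.g. 'NNP'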
@@ -12,23 +12,21 @@ cdef class Vocab:

     Also interns UTF-8 strings, and maps them to consecutive integer IDs.
     '''
-    def __init__(self, object get_lex_props):
+    def __init__(self, data_dir=None, get_lex_props=None):
         self.mem = Pool()
         self._map = PreshMap(2 ** 20)
         self.strings = StringStore()
         self.lexemes.push_back(&EMPTY_LEXEME)
         self.get_lex_props = get_lex_props

-    @classmethod
-    def from_dir(cls, object data_dir, object get_lex_props=None):
-        if not path.exists(data_dir):
-            raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
-        if not path.isdir(data_dir):
-            raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
-        cdef Vocab self = cls(get_lex_props)
-        self.strings.load(path.join(data_dir, 'strings'))
-        self.load(path.join(data_dir, 'lexemes'))
-        return self
+        if data_dir is not None:
+            if not path.exists(data_dir):
+                raise IOError("Directory %s not found -- cannot load Vocab." % data_dir)
+        if data_dir is not None:
+            if not path.isdir(data_dir):
+                raise IOError("Path %s is a file, not a dir -- cannot load Vocab." % data_dir)
+            self.strings.load(path.join(data_dir, 'strings'))
+            self.load(path.join(data_dir, 'lexemes'))

     def __len__(self):
         return self.lexemes.size()
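Note: the Vocab constructor now subsumes the deleted from_dir classmethod; a
minimal sketch of both construction paths implied by this hunk:

    # In-memory vocab (no data files), as before:
    vocab = Vocab(get_lex_props=get_lex_props)

    # Loading from disk, replacing Vocab.from_dir(data_dir, get_lex_props=...):
    vocab = Vocab(data_dir=data_dir, get_lex_props=get_lex_props)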
@@ -1,14 +1,20 @@
 from spacy.en import English
 import pytest


 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)


 @pytest.fixture
 def tagged(EN):
     string = u'Bananas in pyjamas are geese.'
-    tokens = EN(string, pos_tag=True)
+    tokens = EN(string, tag=True)
+    assert EN.tagger.tags[tokens[0].pos] == 'NNP'
+    assert EN.tagger.tags[tokens[1].pos] == 'IN'
+    assert EN.tagger.tags[tokens[2].pos] == 'NNS'
+    assert EN.tagger.tags[tokens[3].pos] == 'VBP'
+    assert EN.tagger.tags[tokens[3].pos] == 'NNS'
     return tokens
+
@@ -5,7 +5,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)


 def test_possess(EN):
@@ -6,7 +6,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)

 def test_tweebo_challenge(EN):
     text = u""":o :/ :'( >:o (: :) >.< XD -__- o.O ;D :-) @_@ :P 8D :1 >:( :D =| ") :> ...."""
@@ -2,7 +2,7 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import EN
+from spacy.en import English


 #def test_hyphen():
@@ -11,7 +11,8 @@ from spacy.en import EN


 def test_period():
-    tokens = EN.tokenize('best.Known')
+    EN = English()
+    tokens = EN('best.Known')
     assert len(tokens) == 3
-    tokens = EN.tokenize('zombo.com')
+    tokens = EN('zombo.com')
     assert len(tokens) == 1
@@ -4,7 +4,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=True)
+    return English(tag=True)

 def test_range_iter(EN):
     for i in range(len(EN.vocab)):
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
 import pytest

 from spacy.en import English
-from spacy.en.attrs import IS_ALPHA, IS_DIGIT
+from spacy.en.attrs import *


 @pytest.fixture
 def EN():
-    return English(pos_tag=False)
+    return English(tag=False)


 def test_is_alpha(EN):
@@ -8,7 +8,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=True, parse=False)
+    return English(tag=True, parse=False)


 @pytest.fixture
@@ -18,8 +18,8 @@ def morph_exc():
            }

 def test_load_exc(EN, morph_exc):
-    EN.pos_tagger.morphologizer.load_exceptions(morph_exc)
-    tokens = EN('I like his style.', pos_tag=True)
+    EN.tagger.morphologizer.load_exceptions(morph_exc)
+    tokens = EN('I like his style.', tag=True)
     his = tokens[2]
     assert his.pos == 'PRP$'
     assert his.lemma == '-PRP-'
@@ -1,11 +1,13 @@
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

 def test_only_pre1():
-    assert len(EN.tokenize("(")) == 1
+    EN = English()
+    assert len(EN("(")) == 1


 def test_only_pre2():
-    assert len(EN.tokenize("((")) == 2
+    EN = English()
+    assert len(EN("((")) == 2
@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,38 +10,43 @@ def close_puncts():
     return [')', ']', '}', '*']


-def test_close(close_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[1].string == p
         assert tokens[0].string == word_str


-def test_two_different_close(close_puncts):
+def test_two_different_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == word_str
         assert tokens[1].string == p
         assert tokens[2].string == "'"


-def test_three_same_close(close_puncts):
+def test_three_same_close(close_puncts, EN):
     word_str = 'Hello'
     for p in close_puncts:
         string = word_str + p + p + p
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == word_str
         assert tokens[1].string == p


-def test_double_end_quote():
-    assert len(EN.tokenize("Hello''")) == 2
-    assert len(EN.tokenize("''")) == 1
+def test_double_end_quote(EN):
+    assert len(EN("Hello''")) == 2
+    assert len(EN("''")) == 1

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,39 +10,44 @@ def open_puncts():
     return ['(', '[', '{', '*']


-def test_open(open_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 2
         assert tokens[0].string == p
         assert tokens[1].string == word_str


-def test_two_different_open(open_puncts):
+def test_two_different_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + "`" + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == p
         assert tokens[1].string == "`"
         assert tokens[2].string == word_str


-def test_three_same_open(open_puncts):
+def test_three_same_open(open_puncts, EN):
     word_str = 'Hello'
     for p in open_puncts:
         string = p + p + p + word_str
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 4
         assert tokens[0].string == p
         assert tokens[3].string == word_str


-def test_open_appostrophe():
+def test_open_appostrophe(EN):
     string = "'The"
-    tokens = EN.tokenize(string)
+    tokens = EN(string)
     assert len(tokens) == 2
     assert tokens[0].string == "'"
@@ -3,43 +3,48 @@ and suffix punctuation."""
 from __future__ import unicode_literals
 import pytest

-from spacy.en import EN
+from spacy.en import English

-def test_no_special():
-    assert len(EN.tokenize("(can)")) == 3
-
-def test_no_punct():
-    assert len(EN.tokenize("can't")) == 2
-
-def test_prefix():
-    assert len(EN.tokenize("(can't")) == 3
+@pytest.fixture
+def EN():
+    return English()


-def test_suffix():
-    assert len(EN.tokenize("can't)")) == 3
+def test_no_special(EN):
+    assert len(EN("(can)")) == 3
+
+def test_no_punct(EN):
+    assert len(EN("can't")) == 2
+
+def test_prefix(EN):
+    assert len(EN("(can't")) == 3


-def test_wrap():
-    assert len(EN.tokenize("(can't)")) == 4
+def test_suffix(EN):
+    assert len(EN("can't)")) == 3


-def test_uneven_wrap():
-    assert len(EN.tokenize("(can't?)")) == 5
+def test_wrap(EN):
+    assert len(EN("(can't)")) == 4


-def test_prefix_interact():
-    assert len(EN.tokenize("U.S.")) == 1
-    assert len(EN.tokenize("us.")) == 2
-    assert len(EN.tokenize("(U.S.")) == 2
+def test_uneven_wrap(EN):
+    assert len(EN("(can't?)")) == 5


-def test_suffix_interact():
-    assert len(EN.tokenize("U.S.)")) == 2
+def test_prefix_interact(EN):
+    assert len(EN("U.S.")) == 1
+    assert len(EN("us.")) == 2
+    assert len(EN("(U.S.")) == 2


-def test_even_wrap_interact():
-    assert len(EN.tokenize("(U.S.)")) == 3
+def test_suffix_interact(EN):
+    assert len(EN("U.S.)")) == 2


-def test_uneven_wrap_interact():
-    assert len(EN.tokenize("(U.S.?)")) == 4
+def test_even_wrap_interact(EN):
+    assert len(EN("(U.S.)")) == 3
+
+
+def test_uneven_wrap_interact(EN):
+    assert len(EN("(U.S.?)")) == 4
@@ -3,13 +3,18 @@ from __future__ import unicode_literals

 import pytest

-from spacy.en import EN
+from spacy.en import English


-def test_one():
-    tokens = EN.tokenize('Betty Botter bought a pound of butter.')
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_one(EN):
+    tokens = EN('Betty Botter bought a pound of butter.')
     assert tokens[0].string == 'Betty'
-    tokens2 = EN.tokenize('Betty also bought a pound of butter.')
+    tokens2 = EN('Betty also bought a pound of butter.')
     assert tokens2[0].string == 'Betty'

@@ -1,6 +1,6 @@
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English

 import pytest

@@ -10,22 +10,27 @@ def paired_puncts():
     return [('(', ')'),  ('[', ']'), ('{', '}'), ('*', '*')]


-def test_token(paired_puncts):
+@pytest.fixture
+def EN():
+    return English()
+
+
+def test_token(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = open_ + word_str + close_
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 3
         assert tokens[0].string == open_
         assert tokens[1].string == word_str
         assert tokens[2].string == close_


-def test_two_different(paired_puncts):
+def test_two_different(paired_puncts, EN):
     word_str = 'Hello'
     for open_, close_ in paired_puncts:
         string = "`" + open_ + word_str + close_ + "'"
-        tokens = EN.tokenize(string)
+        tokens = EN(string)
         assert len(tokens) == 5
         assert tokens[0].string == "`"
         assert tokens[1].string == open_
@@ -8,7 +8,7 @@ from spacy.en import English

 @pytest.fixture
 def EN():
-    return English(pos_tag=False, parse=False)
+    return English(tag=False, parse=False)

 def test_single_word(EN):
     tokens = EN(u'hello')
@@ -1,9 +1,16 @@
 from __future__ import unicode_literals
+import pytest

-from spacy.en import EN
+from spacy.en import English
+
+
+@pytest.fixture
+def EN():
+    return English()

-def test1():
+
+def test1(EN):
     words = ['JAPAN', 'GET', 'LUCKY']
-    tokens = EN.tokens_from_list(words)
+    tokens = EN.tokenizer.tokens_from_list(words)
     assert len(tokens) == 3
     assert tokens[0].string == 'JAPAN'
@@ -1,23 +1,29 @@
 from __future__ import unicode_literals
+import pytest

-from spacy.en import EN
+from spacy.en import English


-def test_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['bye']['sic'] != addr['sic']
+@pytest.fixture
+def EN():
+    return English()


-def test_eq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello']['sic'] == addr['sic']
+def test_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['bye']['sic'] != addr['sic']


-def test_case_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['hello']['sic'] != addr['sic']
+def test_eq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello']['sic'] == addr['sic']


-def test_punct_neq():
-    addr = EN.lexicon['Hello']
-    assert EN.lexicon['Hello,']['sic'] != addr['sic']
+def test_case_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['hello']['sic'] != addr['sic']
+
+
+def test_punct_neq(EN):
+    addr = EN.vocab['Hello']
+    assert EN.vocab['Hello,']['sic'] != addr['sic']
@@ -1,38 +1,43 @@
 """Test that tokens are created correctly for whitespace."""
 from __future__ import unicode_literals

-from spacy.en import EN
+from spacy.en import English
 import pytest


-def test_single_space():
-    tokens = EN.tokenize('hello possums')
+@pytest.fixture
+def EN():
+    return English(tag=False)
+
+
+def test_single_space(EN):
+    tokens = EN('hello possums')
     assert len(tokens) == 2


-def test_double_space():
-    tokens = EN.tokenize('hello  possums')
+def test_double_space(EN):
+    tokens = EN('hello  possums')
     assert len(tokens) == 3
     assert tokens[1].string == ' '


-def test_newline():
-    tokens = EN.tokenize('hello\npossums')
+def test_newline(EN):
+    tokens = EN('hello\npossums')
     assert len(tokens) == 3


-def test_newline_space():
-    tokens = EN.tokenize('hello \npossums')
+def test_newline_space(EN):
+    tokens = EN('hello \npossums')
     assert len(tokens) == 3


-def test_newline_double_space():
-    tokens = EN.tokenize('hello  \npossums')
+def test_newline_double_space(EN):
+    tokens = EN('hello  \npossums')
     assert len(tokens) == 3


-def test_newline_space_wrap():
-    tokens = EN.tokenize('hello \n possums')
+def test_newline_space_wrap(EN):
+    tokens = EN('hello \n possums')
     assert len(tokens) == 3