Mirror of https://github.com/explosion/spaCy.git

commit 7d3c40de7d (parent 0930892fc1)

    * Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme

@@ -12,13 +12,24 @@ from .attrs import get_flags
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'length': len(string),
-            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
-            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
-            'sentiment': 0}
+    return {
+        'flags': get_flags(string),
+        'length': len(string),
+        'sic': string,
+        'norm1': string,
+        'norm2': string,
+        'shape': orth.word_shape(string),
+        'prefix': string[0],
+        'suffix': string[-3:],
+        'cluster': 0,
+        'prob': 0,
+        'sentiment': 0
+    }
 
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 
 
 class English(object):
     """The English NLP pipeline.

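The one behavioural change in this hunk is that 'shape' is now computed with orth.word_shape instead of echoing the raw string. As a rough illustration only — a hypothetical sketch, not spaCy's actual orth.word_shape, which may differ (e.g. by truncating long character runs) — a word-shape transform typically maps characters to case/digit templates:

    # Hypothetical word-shape transform: letters keep case as 'X'/'x',
    # digits become 'd', everything else passes through unchanged.
    def word_shape(string):
        shape = []
        for c in string:
            if c.isalpha():
                shape.append('X' if c.isupper() else 'x')
            elif c.isdigit():
                shape.append('d')
            else:
                shape.append(c)
        return ''.join(shape)

    # word_shape(u'Apple') -> u'Xxxxx'; word_shape(u'C3-PO') -> u'Xd-XX'
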
@@ -16,7 +16,7 @@ cdef class Lexeme:
     cdef readonly attr_t id
     cdef readonly attr_t length
 
-    cdef readonly unicode sic
+    cdef readonly attr_t sic
     cdef readonly unicode norm1
     cdef readonly unicode norm2
     cdef readonly unicode shape

@@ -1,3 +1,4 @@
+# cython: embedsignature=True
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64

@@ -29,6 +30,7 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 
 
 cdef class Lexeme:
+    """A dummy docstring"""
     def __init__(self):
         pass
 

@@ -42,7 +44,7 @@ cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
     py.id = c.id
     py.length = c.length
 
-    py.sic = strings[c.sic]
+    py.sic = c.sic
     py.norm1 = strings[c.norm1]
     py.norm2 = strings[c.norm2]
     py.shape = strings[c.shape]

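With sic now declared as an attr_t (see the .pxd hunk above), Lexeme_cinit keeps the raw integer ID instead of decoding it through the string store. A minimal sketch of the resulting round trip, assuming the StringStore is reachable from the vocab the way Lexeme_cinit reaches it:

    lex = EN.vocab[u'apple']            # Lexeme; lex.sic is an integer ID now
    text = EN.vocab.strings[lex.sic]    # decode the ID back to its string
    assert text == u'apple'
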
@@ -53,8 +53,8 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
-        self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        self.size = 1
 
     property size:
         def __get__(self):

@@ -64,7 +64,9 @@ cdef class StringStore:
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
+            if string_or_id == 0:
+                return u''
+            elif string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[<int>string_or_id]
             return utf8str.chars[:utf8str.length].decode('utf8')

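ID 0 is now treated as a reserved empty-string slot rather than an out-of-range index, which is why test_zero_id is deleted in the test hunk further down. A before/after sketch of the new contract:

    sstore = StringStore()
    # before this commit: sstore[0] raised IndexError
    assert sstore[0] == u''             # now: the reserved empty string
    # other out-of-range IDs still raise:
    #     sstore[sstore.size]  ->  IndexError
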
@@ -120,9 +120,9 @@ cdef class Tokens:
             attr_ids (list[int]): A list of attribute ID ints.
 
         Returns:
-            feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one
-                row per word, and one column per attribute indicated in the input
-                attr_ids.
+            feat_array (numpy.ndarray[long, ndim=2]):
+              A feature matrix, with one row per word, and one column per attribute
+              indicated in the input attr_ids.
         """
         cdef int i, j
         cdef attr_id_t feature

@@ -278,7 +278,7 @@ cdef class Token:
 
     property sic:
         def __get__(self):
-            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
+            return self._seq.data[self.i].lex.sic
 
     property head:
         """The token predicted by the parser to be the head of the current token."""

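Token.sic likewise now returns the raw integer ID straight from the C data, so callers that want text must decode it themselves. A sketch, assuming the string store is exposed on the vocab (the old property reached it via self._seq.vocab.strings):

    tokens = EN(u'hello world')
    sic_id = tokens[0].sic                      # integer ID, no longer unicode
    assert EN.vocab.strings[sic_id] == u'hello'
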
@@ -77,14 +77,15 @@ cdef class Vocab:
         unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
-            id_or_string (int or unicode): The integer ID of a word, or its unicode
-                string.  If an int >= Lexicon.size, IndexError is raised.
-                If id_or_string is neither an int nor a unicode string, ValueError
-                is raised.
+            id_or_string (int or unicode):
+              The integer ID of a word, or its unicode string.  If an int >= Lexicon.size,
+              IndexError is raised. If id_or_string is neither an int nor a unicode string,
+              ValueError is raised.
 
         Returns:
-            lexeme (Lexeme): An instance of the Lexeme Python class, with data
-                copied on instantiation.
+            lexeme (Lexeme):
+              An instance of the Lexeme Python class, with data copied on
+              instantiation.
         '''
         cdef UniStr c_str
         cdef const LexemeC* lexeme

@@ -92,9 +93,11 @@ cdef class Vocab:
             if id_or_string >= self.lexemes.size():
                 raise IndexError
             lexeme = self.lexemes.at(id_or_string)
-        else:
+        elif type(id_or_string) == unicode:
             slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
             lexeme = self.get(self.mem, &c_str)
+        else:
+            raise ValueError("Vocab unable to map type: %s. Maps unicode --> int or int --> unicode" % str(type(id_or_string)))
         return Lexeme_cinit(lexeme, self.strings)
 
     def __setitem__(self, unicode py_str, dict props):

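Vocab.__getitem__ now dispatches explicitly on the key's type and rejects anything that is neither an int nor a unicode string. A minimal sketch of the three paths:

    lex = EN.vocab[u'Hello']    # unicode -> Lexeme (created if unseen)
    same = EN.vocab[lex.id]     # int ID  -> Lexeme data for that ID
    try:
        EN.vocab[b'Hello']      # bytes is neither int nor unicode
    except ValueError:
        pass                    # "Vocab unable to map type: ..."
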
@@ -27,10 +27,6 @@ def test_save_unicode(sstore):
     assert Hello_i == 1
 
 
-def test_zero_id(sstore):
-    with pytest.raises(IndexError):
-        sstore[0]
-
 def test_retrieve_id(sstore):
     A_i = sstore[b'A']
     assert sstore.size == 1

@@ -1,14 +1,14 @@
 from __future__ import unicode_literals
 
 from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import DATA_DIR
+from spacy.en import LOCAL_DATA_DIR
 from os import path
 
 import pytest
 
 
 def test_read_index():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     index = read_index(path.join(wn, 'index.noun'))
     assert 'man' in index
     assert 'plantes' not in index

@@ -16,14 +16,14 @@ def test_read_index():
 
 
 def test_read_exc():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     exc = read_exc(path.join(wn, 'verb.exc'))
     assert exc['was'] == ('be',)
 
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
 
 
 def test_noun_lemmas(lemmatizer):

@@ -13,17 +13,17 @@ def EN():
 
 def test_is_alpha(EN):
     the = EN.vocab['the']
-    assert the['flags'] & (1 << IS_ALPHA)
+    assert the.flags & (1 << IS_ALPHA)
     year = EN.vocab['1999']
-    assert not year['flags'] & (1 << IS_ALPHA)
+    assert not year.flags & (1 << IS_ALPHA)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_ALPHA)
+    assert not mixed.flags & (1 << IS_ALPHA)
 
 
 def test_is_digit(EN):
     the = EN.vocab['the']
-    assert not the['flags'] & (1 << IS_DIGIT)
+    assert not the.flags & (1 << IS_DIGIT)
     year = EN.vocab['1999']
-    assert year['flags'] & (1 << IS_DIGIT)
+    assert year.flags & (1 << IS_DIGIT)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_DIGIT)
+    assert not mixed.flags & (1 << IS_DIGIT)

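These tests now read flags as a Lexeme attribute, but the underlying scheme is unchanged: each boolean feature occupies one bit of a single integer. A self-contained illustration (the bit positions here are made up; the real constants come from spaCy's attrs module):

    IS_ALPHA = 0                        # hypothetical bit positions
    IS_DIGIT = 1

    flags = (1 << IS_ALPHA)             # a purely alphabetic word
    assert flags & (1 << IS_ALPHA)      # same check the tests perform
    assert not flags & (1 << IS_DIGIT)
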
@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The']['sic']
-    assert tokens[3].sic == EN.vocab['1984']['sic']
+    assert tokens[0].sic == EN.vocab['The'].sic
+    assert tokens[3].sic == EN.vocab['1984'].sic
 
 
 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"]['sic']
+    assert tokens[1].sic == EN.vocab["n't"].sic
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!']['sic']
+    assert tokens[4].sic == EN.vocab['!'].sic
 
 
 def test_contraction_punct(EN):

@@ -11,24 +11,24 @@ def EN():
 
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye']['sic'] != addr['sic']
+    assert EN.vocab['bye'].sic != addr.sic
 
 
 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello']['sic'] == addr['sic']
+    assert EN.vocab['Hello'].sic == addr.sic
 
 
 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello']['sic'] != addr['sic']
+    assert EN.vocab['hello'].sic != addr.sic
 
 
 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,']['sic'] != addr['sic']
+    assert EN.vocab['Hello,'].sic != addr.sic
 
 
 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example['sic'] != example['shape']
+    assert example.sic != example.shape