* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme

This commit is contained in:
Matthew Honnibal 2015-01-15 00:33:16 +11:00
parent 0930892fc1
commit 7d3c40de7d
11 changed files with 56 additions and 42 deletions

View File

@@ -12,13 +12,24 @@ from .attrs import get_flags
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'length': len(string),
-            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
-            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
-            'sentiment': 0}
+    return {
+        'flags': get_flags(string),
+        'length': len(string),
+        'sic': string,
+        'norm1': string,
+        'norm2': string,
+        'shape': orth.word_shape(string),
+        'prefix': string[0],
+        'suffix': string[-3:],
+        'cluster': 0,
+        'prob': 0,
+        'sentiment': 0
+    }
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 
 class English(object):
     """The English NLP pipeline.

View File

@@ -16,7 +16,7 @@ cdef class Lexeme:
     cdef readonly attr_t id
     cdef readonly attr_t length
 
-    cdef readonly unicode sic
+    cdef readonly attr_t sic
     cdef readonly unicode norm1
     cdef readonly unicode norm2
     cdef readonly unicode shape

View File

@@ -1,3 +1,4 @@
+# cython: embedsignature=True
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
@@ -29,6 +30,7 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 
 cdef class Lexeme:
+    """A dummy docstring"""
     def __init__(self):
         pass
@@ -42,7 +44,7 @@ cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
     py.id = c.id
     py.length = c.length
 
-    py.sic = strings[c.sic]
+    py.sic = c.sic
     py.norm1 = strings[c.norm1]
     py.norm2 = strings[c.norm2]
     py.shape = strings[c.shape]
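Together with the .pxd change above, this makes Lexeme.sic an integer string-store ID rather than a decoded unicode string. A hedged usage sketch, where EN stands for an English() pipeline instance as in the tests below, and assuming the vocab exposes its StringStore as .strings:

    lex = EN.vocab[u'apple']
    id_ = lex.sic                    # now an int, not unicode
    text = EN.vocab.strings[id_]     # resolve back through the StringStore
    assert text == u'apple'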

View File

@@ -53,8 +53,8 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
-        self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        self.size = 1
 
     property size:
         def __get__(self):
@@ -64,7 +64,9 @@ cdef class StringStore:
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
+            if string_or_id == 0:
+                return u''
+            elif string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[<int>string_or_id]
             return utf8str.chars[:utf8str.length].decode('utf8')
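The getter now special-cases ID 0 as the empty string instead of raising IndexError (which is why test_zero_id is deleted below). A minimal pure-Python sketch of the intended contract; the real class is a Cython type backed by a PreshMap:

    # Toy model of the StringStore contract after this change, not the
    # real implementation.
    class ToyStringStore:
        def __init__(self):
            self._strings = ['']            # slot 0 is reserved for u''
            self._ids = {}

        def __getitem__(self, string_or_id):
            if isinstance(string_or_id, int):
                if string_or_id == 0:
                    return ''               # ID 0 decodes to the empty string
                if string_or_id < 1 or string_or_id >= len(self._strings):
                    raise IndexError(string_or_id)
                return self._strings[string_or_id]
            if string_or_id not in self._ids:
                self._ids[string_or_id] = len(self._strings)
                self._strings.append(string_or_id)
            return self._ids[string_or_id]

    store = ToyStringStore()
    assert store['Hello'] == 1              # first real string gets ID 1
    assert store[0] == ''                   # no longer an IndexError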

View File

@@ -120,9 +120,9 @@ cdef class Tokens:
         attr_ids (list[int]): A list of attribute ID ints.
 
         Returns:
-        feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one
-            row per word, and one column per attribute indicated in the input
-            attr_ids.
+        feat_array (numpy.ndarray[long, ndim=2]):
+            A feature matrix, with one row per word, and one column per attribute
+            indicated in the input attr_ids.
         """
         cdef int i, j
         cdef attr_id_t feature
@@ -278,7 +278,7 @@ cdef class Token:
     property sic:
         def __get__(self):
-            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
+            return self._seq.data[self.i].lex.sic
 
     property head:
         """The token predicted by the parser to be the head of the current token."""

View File

@@ -77,14 +77,15 @@ cdef class Vocab:
         unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
-            id_or_string (int or unicode): The integer ID of a word, or its unicode
-                string. If an int >= Lexicon.size, IndexError is raised.
-                If id_or_string is neither an int nor a unicode string, ValueError
-                is raised.
+            id_or_string (int or unicode):
+                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
+                IndexError is raised. If id_or_string is neither an int nor a unicode string,
+                ValueError is raised.
 
         Returns:
-            lexeme (Lexeme): An instance of the Lexeme Python class, with data
-                copied on instantiation.
+            lexeme (Lexeme):
+                An instance of the Lexeme Python class, with data copied on
+                instantiation.
         '''
         cdef UniStr c_str
         cdef const LexemeC* lexeme
@@ -92,9 +93,11 @@ cdef class Vocab:
             if id_or_string >= self.lexemes.size():
                 raise IndexError
             lexeme = self.lexemes.at(id_or_string)
-        else:
+        elif type(id_or_string) == unicode:
             slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
             lexeme = self.get(self.mem, &c_str)
+        else:
+            raise ValueError("Vocab unable to map type: %s. Maps unicode --> int or int --> unicode" % str(type(id_or_string)))
         return Lexeme_cinit(lexeme, self.strings)
 
     def __setitem__(self, unicode py_str, dict props):
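The lookup is now strict about key types: ints map to lexemes by ID, unicode strings map by text (creating the lexeme if unseen), and anything else fails fast. A usage sketch of the new behaviour, again with EN as a pipeline instance:

    import pytest

    lex = EN.vocab[u'hello']                  # unicode key: looked up or created
    assert EN.vocab[lex.id].sic == lex.sic    # int key: same lexeme by ID
    with pytest.raises(ValueError):
        EN.vocab[3.14]                        # any other type is rejected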

View File

@@ -27,10 +27,6 @@ def test_save_unicode(sstore):
     assert Hello_i == 1
 
-def test_zero_id(sstore):
-    with pytest.raises(IndexError):
-        sstore[0]
-
 def test_retrieve_id(sstore):
     A_i = sstore[b'A']
     assert sstore.size == 1

View File

@@ -1,14 +1,14 @@
 from __future__ import unicode_literals
 
 from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import DATA_DIR
+from spacy.en import LOCAL_DATA_DIR
 from os import path
 
 import pytest
 
 def test_read_index():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     index = read_index(path.join(wn, 'index.noun'))
     assert 'man' in index
     assert 'plantes' not in index
@@ -16,14 +16,14 @@ def test_read_index():
 
 def test_read_exc():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     exc = read_exc(path.join(wn, 'verb.exc'))
     assert exc['was'] == ('be',)
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
 
 def test_noun_lemmas(lemmatizer):
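For context, verb.exc is a WordNet exception file with one "inflected-form lemma" pair per line (e.g. "was be"), which is why read_exc maps 'was' to the tuple ('be',). A minimal sketch of such a parser, not necessarily spaCy's implementation:

    # Sketch of a WordNet *.exc parser: first token is the inflected form,
    # the remaining tokens are its lemma(s).
    def read_exc_sketch(loc):
        exc = {}
        for line in open(loc):
            pieces = line.split()
            if pieces:
                exc[pieces[0]] = tuple(pieces[1:])
        return exc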

View File

@@ -13,17 +13,17 @@ def EN():
 
 def test_is_alpha(EN):
     the = EN.vocab['the']
-    assert the['flags'] & (1 << IS_ALPHA)
+    assert the.flags & (1 << IS_ALPHA)
     year = EN.vocab['1999']
-    assert not year['flags'] & (1 << IS_ALPHA)
+    assert not year.flags & (1 << IS_ALPHA)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_ALPHA)
+    assert not mixed.flags & (1 << IS_ALPHA)
 
 def test_is_digit(EN):
     the = EN.vocab['the']
-    assert not the['flags'] & (1 << IS_DIGIT)
+    assert not the.flags & (1 << IS_DIGIT)
     year = EN.vocab['1999']
-    assert year['flags'] & (1 << IS_DIGIT)
+    assert year.flags & (1 << IS_DIGIT)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_DIGIT)
+    assert not mixed.flags & (1 << IS_DIGIT)
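These tests switch from dict-style lookups to the new attribute access. The flags field packs all boolean lexical features into one integer, with each IS_* constant naming a bit position, so the test is plain bit arithmetic:

    IS_ALPHA, IS_DIGIT = 0, 1        # illustrative bit positions only
    flags = 1 << IS_ALPHA            # a purely alphabetic word
    assert flags & (1 << IS_ALPHA)
    assert not flags & (1 << IS_DIGIT)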

View File

@@ -33,17 +33,17 @@ def test_punct(EN):
 
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The']['sic']
-    assert tokens[3].sic == EN.vocab['1984']['sic']
+    assert tokens[0].sic == EN.vocab['The'].sic
+    assert tokens[3].sic == EN.vocab['1984'].sic
 
 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"]['sic']
+    assert tokens[1].sic == EN.vocab["n't"].sic
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!']['sic']
+    assert tokens[4].sic == EN.vocab['!'].sic
 
 def test_contraction_punct(EN):
def test_contraction_punct(EN): def test_contraction_punct(EN):

View File

@@ -11,24 +11,24 @@ def EN():
 
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye']['sic'] != addr['sic']
+    assert EN.vocab['bye'].sic != addr.sic
 
 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello']['sic'] == addr['sic']
+    assert EN.vocab['Hello'].sic == addr.sic
 
 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello']['sic'] != addr['sic']
+    assert EN.vocab['hello'].sic != addr.sic
 
 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,']['sic'] != addr['sic']
+    assert EN.vocab['Hello,'].sic != addr.sic
 
 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example['sic'] != example['shape']
+    assert example.sic != example.shape