diff --git a/spacy/en/__init__.py b/spacy/en/__init__.py
index 933de124e..4ea837ed0 100644
--- a/spacy/en/__init__.py
+++ b/spacy/en/__init__.py
@@ -12,13 +12,24 @@ from .attrs import get_flags
 
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'length': len(string),
-            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
-            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
-            'sentiment': 0}
+    return {
+        'flags': get_flags(string),
+        'length': len(string),
+        'sic': string,
+        'norm1': string,
+        'norm2': string,
+        'shape': orth.word_shape(string),
+        'prefix': string[0],
+        'suffix': string[-3:],
+        'cluster': 0,
+        'prob': 0,
+        'sentiment': 0
+    }
+LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
+
 
 class English(object):
     """The English NLP pipeline.
diff --git a/spacy/lexeme.pxd b/spacy/lexeme.pxd
index 8686f8e6a..32626f122 100644
--- a/spacy/lexeme.pxd
+++ b/spacy/lexeme.pxd
@@ -16,7 +16,7 @@ cdef class Lexeme:
     cdef readonly attr_t id
     cdef readonly attr_t length
 
-    cdef readonly unicode sic
+    cdef readonly attr_t sic
     cdef readonly unicode norm1
     cdef readonly unicode norm2
     cdef readonly unicode shape
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index dfc82d46e..fbdcd31da 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -1,3 +1,4 @@
+# cython: embedsignature=True
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
@@ -29,6 +30,7 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 
 
 cdef class Lexeme:
+    """A dummy docstring"""
     def __init__(self):
         pass
 
@@ -42,7 +44,7 @@ cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
 
     py.id = c.id
     py.length = c.length
-    py.sic = strings[c.sic]
+    py.sic = c.sic
     py.norm1 = strings[c.norm1]
     py.norm2 = strings[c.norm2]
     py.shape = strings[c.shape]
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index df9b89dc3..e5792aa9a 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -53,8 +53,8 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
-        self.size = 1
         self.strings = self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        self.size = 1
 
     property size:
         def __get__(self):
@@ -64,7 +64,9 @@
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
+            if string_or_id == 0:
+                return u''
+            elif string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[string_or_id]
             return utf8str.chars[:utf8str.length].decode('utf8')
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 61aab89b1..4c0156df3 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -120,9 +120,9 @@ cdef class Tokens:
            attr_ids (list[int]): A list of attribute ID ints.
 
        Returns:
-          feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one
-            row per word, and one column per attribute indicated in the input
-            attr_ids.
+          feat_array (numpy.ndarray[long, ndim=2]):
+            A feature matrix, with one row per word, and one column per attribute
+            indicated in the input attr_ids.
        """
        cdef int i, j
        cdef attr_id_t feature
@@ -278,7 +278,7 @@ cdef class Token:
 
     property sic:
         def __get__(self):
-            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
+            return self._seq.data[self.i].lex.sic
 
     property head:
         """The token predicted by the parser to be the head of the current token."""
diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
index 4043b14e0..800947964 100644
--- a/spacy/vocab.pyx
+++ b/spacy/vocab.pyx
@@ -77,14 +77,15 @@ cdef class Vocab:
        unseen unicode string is given, a new lexeme is created and stored.
 
        Args:
-            id_or_string (int or unicode): The integer ID of a word, or its unicode
-                string. If an int >= Lexicon.size, IndexError is raised.
-                If id_or_string is neither an int nor a unicode string, ValueError
-                is raised.
+            id_or_string (int or unicode):
+                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
+                IndexError is raised. If id_or_string is neither an int nor a unicode string,
+                ValueError is raised.
 
        Returns:
-            lexeme (Lexeme): An instance of the Lexeme Python class, with data
-                copied on instantiation.
+            lexeme (Lexeme):
+                An instance of the Lexeme Python class, with data copied on
+                instantiation.
        '''
        cdef UniStr c_str
        cdef const LexemeC* lexeme
@@ -92,9 +93,11 @@
            if id_or_string >= self.lexemes.size():
                raise IndexError
            lexeme = self.lexemes.at(id_or_string)
-        else:
+        elif type(id_or_string) == unicode:
            slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
            lexeme = self.get(self.mem, &c_str)
+        else:
+            raise ValueError("Vocab unable to map type: %s. Maps unicode --> int or int --> unicode" % str(type(id_or_string)))
        return Lexeme_cinit(lexeme, self.strings)
 
    def __setitem__(self, unicode py_str, dict props):
diff --git a/tests/test_intern.py b/tests/test_intern.py
index 5375ebb2a..74f0d6bcf 100644
--- a/tests/test_intern.py
+++ b/tests/test_intern.py
@@ -27,10 +27,6 @@ def test_save_unicode(sstore):
     assert Hello_i == 1
 
 
-def test_zero_id(sstore):
-    with pytest.raises(IndexError):
-        sstore[0]
-
 def test_retrieve_id(sstore):
     A_i = sstore[b'A']
     assert sstore.size == 1
diff --git a/tests/test_lemmatizer.py b/tests/test_lemmatizer.py
index bc9231259..5a6a8fc62 100644
--- a/tests/test_lemmatizer.py
+++ b/tests/test_lemmatizer.py
@@ -1,14 +1,14 @@
 from __future__ import unicode_literals
 
 from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import DATA_DIR
+from spacy.en import LOCAL_DATA_DIR
 from os import path
 
 import pytest
 
 
 def test_read_index():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     index = read_index(path.join(wn, 'index.noun'))
     assert 'man' in index
     assert 'plantes' not in index
@@ -16,14 +16,14 @@
 
 
 def test_read_exc():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     exc = read_exc(path.join(wn, 'verb.exc'))
     assert exc['was'] == ('be',)
 
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
 
 
 def test_noun_lemmas(lemmatizer):
diff --git a/tests/test_lexeme_flags.py b/tests/test_lexeme_flags.py
index 3a4776b80..e04360d98 100644
--- a/tests/test_lexeme_flags.py
+++ b/tests/test_lexeme_flags.py
@@ -13,17 +13,17 @@ def EN():
 
 def test_is_alpha(EN):
     the = EN.vocab['the']
-    assert the['flags'] & (1 << IS_ALPHA)
+    assert the.flags & (1 << IS_ALPHA)
     year = EN.vocab['1999']
-    assert not year['flags'] & (1 << IS_ALPHA)
+    assert not year.flags & (1 << IS_ALPHA)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_ALPHA)
+    assert not mixed.flags & (1 << IS_ALPHA)
 
 
 def test_is_digit(EN):
     the = EN.vocab['the']
-    assert not the['flags'] & (1 << IS_DIGIT)
+    assert not the.flags & (1 << IS_DIGIT)
     year = EN.vocab['1999']
-    assert year['flags'] & (1 << IS_DIGIT)
+    assert year.flags & (1 << IS_DIGIT)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_DIGIT)
+    assert not mixed.flags & (1 << IS_DIGIT)
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index cb02bbcff..c5b50041f 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The']['sic']
-    assert tokens[3].sic == EN.vocab['1984']['sic']
+    assert tokens[0].sic == EN.vocab['The'].sic
+    assert tokens[3].sic == EN.vocab['1984'].sic
 
 
 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"]['sic']
+    assert tokens[1].sic == EN.vocab["n't"].sic
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!']['sic']
+    assert tokens[4].sic == EN.vocab['!'].sic
 
 
 def test_contraction_punct(EN):
diff --git a/tests/test_vocab.py b/tests/test_vocab.py
index 0a739ad0e..a83fa82d3 100644
--- a/tests/test_vocab.py
+++ b/tests/test_vocab.py
@@ -11,24 +11,24 @@ def EN():
 
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye']['sic'] != addr['sic']
+    assert EN.vocab['bye'].sic != addr.sic
 
 
 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello']['sic'] == addr['sic']
+    assert EN.vocab['Hello'].sic == addr.sic
 
 
 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello']['sic'] != addr['sic']
+    assert EN.vocab['hello'].sic != addr.sic
 
 
 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,']['sic'] != addr['sic']
+    assert EN.vocab['Hello,'].sic != addr.sic
 
 
 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example['sic'] != example['shape']
+    assert example.sic != example.shape
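A short usage sketch of the behaviour the patch introduces, pieced together from the updated tests. It is illustrative only, not part of the diff, and assumes the bundled English data is installed and that English() can be constructed with its defaults (the code base of this era targets Python 2):

    from __future__ import unicode_literals

    from spacy.en import English

    EN = English()

    # Lexeme attributes are now plain properties instead of dict-style lookups,
    # and 'sic' is an integer string-ID rather than the string itself.
    hello = EN.vocab['Hello']
    assert EN.vocab['hello'].sic != hello.sic   # IDs are case-sensitive
    assert hello.sic != hello.shape             # 'shape' now comes from orth.word_shape

    # Token.sic returns the same integer ID stored on the underlying lexeme.
    tokens = EN('The year: 1984.')
    assert tokens[3].sic == EN.vocab['1984'].sic

    # Vocab.__getitem__ maps int <-> unicode only; other key types raise ValueError.
    try:
        EN.vocab[3.14]
    except ValueError:
        pass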