Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 18:26:30 +03:00)
* Tests passing after refactor. API has obvious warts, particularly in Token and Lexeme
commit 7d3c40de7d
parent 0930892fc1
@@ -12,13 +12,24 @@ from .attrs import get_flags
 
 
 def get_lex_props(string):
-    return {'flags': get_flags(string), 'length': len(string),
-            'sic': string, 'norm1': string, 'norm2': string, 'shape': string,
-            'prefix': string[0], 'suffix': string[-3:], 'cluster': 0, 'prob': 0,
-            'sentiment': 0}
+    return {
+        'flags': get_flags(string),
+        'length': len(string),
+        'sic': string,
+        'norm1': string,
+        'norm2': string,
+        'shape': orth.word_shape(string),
+        'prefix': string[0],
+        'suffix': string[-3:],
+        'cluster': 0,
+        'prob': 0,
+        'sentiment': 0
+    }
 
 
 LOCAL_DATA_DIR = path.join(path.dirname(__file__), 'data')
 
 
 class English(object):
     """The English NLP pipeline.
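For orientation, a minimal sketch of what the refactored helper now returns. This is not part of the commit; the shape value assumes orth.word_shape maps characters to case classes as elsewhere in spaCy:

    props = get_lex_props(u'Hello')
    assert props['length'] == 5        # len(u'Hello')
    assert props['prefix'] == u'H'     # string[0]
    assert props['suffix'] == u'llo'   # string[-3:]
    assert props['shape'] == u'Xxxxx'  # orth.word_shape, assumed case-class mapping
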
@@ -16,7 +16,7 @@ cdef class Lexeme:
     cdef readonly attr_t id
     cdef readonly attr_t length
 
-    cdef readonly unicode sic
+    cdef readonly attr_t sic
     cdef readonly unicode norm1
     cdef readonly unicode norm2
     cdef readonly unicode shape
 
@@ -1,3 +1,4 @@
+# cython: embedsignature=True
 from cpython.ref cimport Py_INCREF
 from cymem.cymem cimport Pool
 from murmurhash.mrmr cimport hash64
 
@@ -29,6 +30,7 @@ cdef int set_lex_struct_props(LexemeC* lex, dict props, StringStore string_store
 
 
 cdef class Lexeme:
+    """A dummy docstring"""
     def __init__(self):
         pass
 
@@ -42,7 +44,7 @@ cdef Lexeme Lexeme_cinit(const LexemeC* c, StringStore strings):
     py.id = c.id
     py.length = c.length
 
-    py.sic = strings[c.sic]
+    py.sic = c.sic
     py.norm1 = strings[c.norm1]
     py.norm2 = strings[c.norm2]
     py.shape = strings[c.shape]
 
@@ -53,8 +53,8 @@ cdef class StringStore:
         self.mem = Pool()
         self._map = PreshMap()
         self._resize_at = 10000
-        self.size = 1
         self.strings = <Utf8Str*>self.mem.alloc(self._resize_at, sizeof(Utf8Str))
+        self.size = 1
 
     property size:
         def __get__(self):
@@ -64,7 +64,9 @@ cdef class StringStore:
         cdef bytes byte_string
         cdef const Utf8Str* utf8str
         if isinstance(string_or_id, int) or isinstance(string_or_id, long):
-            if string_or_id < 1 or string_or_id >= self.size:
+            if string_or_id == 0:
+                return u''
+            elif string_or_id < 1 or string_or_id >= self.size:
                 raise IndexError(string_or_id)
             utf8str = &self.strings[<int>string_or_id]
             return utf8str.chars[:utf8str.length].decode('utf8')
 
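In other words, id 0 is now reserved for the empty string instead of being out of range. A sketch of the new contract, assuming the string branch of __getitem__ interns and returns ids as the tests below do:

    sstore = StringStore()
    assert sstore[0] == u''         # id 0 decodes to the empty string
    hello_i = sstore[u'Hello']      # interning the first string assigns id 1
    assert sstore[hello_i] == u'Hello'
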
@@ -120,9 +120,9 @@ cdef class Tokens:
         attr_ids (list[int]): A list of attribute ID ints.
 
         Returns:
-        feat_array (numpy.ndarray[long, ndim=2]): A feature matrix, with one
-            row per word, and one column per attribute indicated in the input
-            attr_ids.
+        feat_array (numpy.ndarray[long, ndim=2]):
+            A feature matrix, with one row per word, and one column per attribute
+            indicated in the input attr_ids.
         """
         cdef int i, j
         cdef attr_id_t feature
 
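A sketch of how this method is called, assuming the docstring belongs to Tokens.to_array and that attribute IDs such as SIC and LENGTH are importable constants (both assumptions; the diff does not show them):

    from spacy.en import English
    nlp = English()
    tokens = nlp(u'An example sentence')
    feats = tokens.to_array([SIC, LENGTH])    # attribute IDs assumed importable
    assert feats.shape == (len(tokens), 2)    # one row per word, one column per attr
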
@@ -278,7 +278,7 @@ cdef class Token:
 
     property sic:
         def __get__(self):
-            return self._seq.vocab.strings[self._seq.data[self.i].lex.sic]
+            return self._seq.data[self.i].lex.sic
 
     property head:
         """The token predicted by the parser to be the head of the current token."""
 
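Token.sic thus now exposes the raw integer id rather than the decoded string, matching the Lexeme change above; the text is still recoverable through the vocabulary's string store. A sketch (the pipeline object name is assumed):

    tokens = nlp(u'hello')
    sic_id = tokens[0].sic              # now an integer id
    text = nlp.vocab.strings[sic_id]    # decode via the StringStore --> u'hello'
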
@@ -77,14 +77,15 @@ cdef class Vocab:
         unseen unicode string is given, a new lexeme is created and stored.
 
         Args:
-            id_or_string (int or unicode): The integer ID of a word, or its unicode
-                string. If an int >= Lexicon.size, IndexError is raised.
-                If id_or_string is neither an int nor a unicode string, ValueError
-                is raised.
+            id_or_string (int or unicode):
+                The integer ID of a word, or its unicode string. If an int >= Lexicon.size,
+                IndexError is raised. If id_or_string is neither an int nor a unicode string,
+                ValueError is raised.
 
         Returns:
-            lexeme (Lexeme): An instance of the Lexeme Python class, with data
-                copied on instantiation.
+            lexeme (Lexeme):
+                An instance of the Lexeme Python class, with data copied on
+                instantiation.
         '''
         cdef UniStr c_str
         cdef const LexemeC* lexeme
 
@@ -92,9 +93,11 @@ cdef class Vocab:
             if id_or_string >= self.lexemes.size():
                 raise IndexError
             lexeme = self.lexemes.at(id_or_string)
-        else:
+        elif type(id_or_string) == unicode:
             slice_unicode(&c_str, id_or_string, 0, len(id_or_string))
             lexeme = self.get(self.mem, &c_str)
+        else:
+            raise ValueError("Vocab unable to map type: %s. Maps unicode --> int or int --> unicode" % str(type(id_or_string)))
         return Lexeme_cinit(lexeme, self.strings)
 
     def __setitem__(self, unicode py_str, dict props):
 
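The lookup is now strict about key types, matching the updated docstring. A minimal sketch of the three paths (pipeline object name assumed):

    lex = nlp.vocab[u'hello']    # unicode --> Lexeme, created if unseen
    same = nlp.vocab[lex.id]     # int --> Lexeme, IndexError if out of range
    nlp.vocab[3.14]              # anything else now raises ValueError
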
@@ -27,10 +27,6 @@ def test_save_unicode(sstore):
     assert Hello_i == 1
 
 
-def test_zero_id(sstore):
-    with pytest.raises(IndexError):
-        sstore[0]
-
 def test_retrieve_id(sstore):
     A_i = sstore[b'A']
     assert sstore.size == 1
 
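The deleted test pinned the old contract, under which id 0 was out of range; with the StringStore change above it would now fail, since sstore[0] returns the empty string. A hypothetical replacement test for the new contract:

    def test_zero_id(sstore):
        assert sstore[0] == u''
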
@@ -1,14 +1,14 @@
 from __future__ import unicode_literals
 
 from spacy.en.lemmatizer import Lemmatizer, read_index, read_exc
-from spacy.en import DATA_DIR
+from spacy.en import LOCAL_DATA_DIR
 from os import path
 
 import pytest
 
 
 def test_read_index():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     index = read_index(path.join(wn, 'index.noun'))
     assert 'man' in index
     assert 'plantes' not in index
 
@@ -16,14 +16,14 @@ def test_read_index():
 
 
 def test_read_exc():
-    wn = path.join(DATA_DIR, 'wordnet')
+    wn = path.join(LOCAL_DATA_DIR, 'wordnet')
     exc = read_exc(path.join(wn, 'verb.exc'))
     assert exc['was'] == ('be',)
 
 
 @pytest.fixture
 def lemmatizer():
-    return Lemmatizer(path.join(DATA_DIR, 'wordnet'), 0, 0, 0)
+    return Lemmatizer(path.join(LOCAL_DATA_DIR, 'wordnet'), 0, 0, 0)
 
 
 def test_noun_lemmas(lemmatizer):
 
@@ -13,17 +13,17 @@ def EN():
 
 
 def test_is_alpha(EN):
     the = EN.vocab['the']
-    assert the['flags'] & (1 << IS_ALPHA)
+    assert the.flags & (1 << IS_ALPHA)
     year = EN.vocab['1999']
-    assert not year['flags'] & (1 << IS_ALPHA)
+    assert not year.flags & (1 << IS_ALPHA)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_ALPHA)
+    assert not mixed.flags & (1 << IS_ALPHA)
 
 
 def test_is_digit(EN):
     the = EN.vocab['the']
-    assert not the['flags'] & (1 << IS_DIGIT)
+    assert not the.flags & (1 << IS_DIGIT)
     year = EN.vocab['1999']
-    assert year['flags'] & (1 << IS_DIGIT)
+    assert year.flags & (1 << IS_DIGIT)
     mixed = EN.vocab['hello1']
-    assert not mixed['flags'] & (1 << IS_DIGIT)
+    assert not mixed.flags & (1 << IS_DIGIT)
 
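These test edits track the API change from dict-style lookups on lexemes to plain attributes. A sketch of the new access pattern, with Lexeme.sic now an integer id per the lexeme.pxd hunk above:

    lex = EN.vocab[u'the']
    assert lex.flags & (1 << IS_ALPHA)   # attribute access replaces lex['flags']
    assert isinstance(lex.sic, int)      # sic is an attr_t id, no longer a unicode string
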
@@ -33,17 +33,17 @@ def test_punct(EN):
 def test_digits(EN):
     tokens = EN('The year: 1984.')
     assert len(tokens) == 5
-    assert tokens[0].sic == EN.vocab['The']['sic']
-    assert tokens[3].sic == EN.vocab['1984']['sic']
+    assert tokens[0].sic == EN.vocab['The'].sic
+    assert tokens[3].sic == EN.vocab['1984'].sic
 
 
 def test_contraction(EN):
     tokens = EN("don't giggle")
     assert len(tokens) == 3
-    assert tokens[1].sic == EN.vocab["n't"]['sic']
+    assert tokens[1].sic == EN.vocab["n't"].sic
     tokens = EN("i said don't!")
     assert len(tokens) == 5
-    assert tokens[4].sic == EN.vocab['!']['sic']
+    assert tokens[4].sic == EN.vocab['!'].sic
 
 
 def test_contraction_punct(EN):
 
@@ -11,24 +11,24 @@ def EN():
 
 
 def test_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['bye']['sic'] != addr['sic']
+    assert EN.vocab['bye'].sic != addr.sic
 
 
 def test_eq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello']['sic'] == addr['sic']
+    assert EN.vocab['Hello'].sic == addr.sic
 
 
 def test_case_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['hello']['sic'] != addr['sic']
+    assert EN.vocab['hello'].sic != addr.sic
 
 
 def test_punct_neq(EN):
     addr = EN.vocab['Hello']
-    assert EN.vocab['Hello,']['sic'] != addr['sic']
+    assert EN.vocab['Hello,'].sic != addr.sic
 
 
 def test_shape_attr(EN):
     example = EN.vocab['example']
-    assert example['sic'] != example['shape']
+    assert example.sic != example.shape