* Fix issue #37: missing check_flag attribute from Token class

This commit is contained in:
Matthew Honnibal 2015-03-26 15:06:26 +01:00
parent 5032f2a5c7
commit 0962ffc095
3 changed files with 38 additions and 0 deletions

View File

@ -89,3 +89,5 @@ cdef class Token:
return self return self
cdef int take_ownership_of_c_data(self) except -1 cdef int take_ownership_of_c_data(self) except -1
# Signature-only declaration of Token.check_flag (no body here); it mirrors the
# definition given in the implementation hunk of this same commit.
cpdef bint check_flag(self, attr_id_t flag_id) except -1

View File

@ -9,6 +9,7 @@ from .typedefs cimport LEMMA
from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from .typedefs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from .typedefs cimport POS, LEMMA from .typedefs cimport POS, LEMMA
from .parts_of_speech import UNIV_POS_NAMES from .parts_of_speech import UNIV_POS_NAMES
from .lexeme cimport check_flag
from unidecode import unidecode from unidecode import unidecode
@ -252,6 +253,10 @@ cdef class Token:
def __unicode__(self): def __unicode__(self):
return self.string return self.string
# Token.check_flag: forward flag_id, together with this token's lexeme
# (self.c.lex), to the check_flag function cimported from .lexeme and return
# its result.
# The unqualified check_flag(...) call below is that cimported function, not a
# recursive call to this method (a method call would go through self).
# `except -1` is Cython's error-return declaration: a -1 return value is
# reserved to signal that an exception was raised.
cpdef bint check_flag(self, attr_id_t flag_id) except -1:
return check_flag(self.c.lex, flag_id)
cdef int take_ownership_of_c_data(self) except -1: cdef int take_ownership_of_c_data(self) except -1:
owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len) owned_data = <TokenC*>PyMem_Malloc(sizeof(TokenC) * self.array_len)
memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len) memcpy(owned_data, self.c, sizeof(TokenC) * self.array_len)

31
tests/test_token_api.py Normal file
View File

@ -0,0 +1,31 @@
from __future__ import unicode_literals
from spacy.en import English
from spacy.en.attrs import IS_ALPHA, IS_ASCII, IS_DIGIT, IS_LOWER, IS_PUNCT
from spacy.en.attrs import IS_SPACE, IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM
from spacy.en.attrs import IS_STOP
import pytest
@pytest.fixture
def token():
    """Provide the first token of a short English sample text.

    A fresh ``English`` pipeline is built each time the fixture is requested,
    the sample sentence pair is processed, and the token at index 0 is
    returned.
    """
    pipeline = English()
    doc = pipeline(u'Give it back! He pleaded.')
    return doc[0]
def test_strings(token):
    """Check every string-valued attribute of the fixture token."""
    # (attribute name, expected value) pairs, listed in the order in which
    # they are checked; the first mismatch fails the test.
    expected_values = (
        ('orth_', 'Give'),
        ('lower_', 'give'),
        ('shape_', 'Xxxx'),
        ('prefix_', 'G'),
        ('suffix_', 'ive'),
        ('lemma_', 'give'),
        ('pos_', 'VERB'),
        ('tag_', 'VB'),
        ('dep_', 'ROOT'),
    )
    for attr_name, expected in expected_values:
        assert getattr(token, attr_name) == expected
def test_flags(token):
    """Check that Token.check_flag is callable and reports basic flags.

    Exercises one flag expected to be set (IS_ALPHA) and one expected to be
    clear (IS_DIGIT) on the fixture token.
    """
    has_flag = token.check_flag
    assert has_flag(IS_ALPHA)
    assert not has_flag(IS_DIGIT)
    # TODO: Test more of these, esp. if a bug is found