* Fiddle with token features

This commit is contained in:
Matthew Honnibal 2014-09-12 15:49:36 +02:00
parent 1533041885
commit 5aa591106b
4 changed files with 23 additions and 3 deletions

View File

@ -4,10 +4,14 @@ from spacy.tokens cimport Tokens
cdef class EnglishTokens(Tokens): cdef class EnglishTokens(Tokens):
cpdef size_t canon(self, size_t i)
cpdef size_t shape(self, size_t i)
cpdef size_t non_sparse(self, size_t i)
cpdef size_t asciied(self, size_t i)
cpdef unicode canon_string(self, size_t i) cpdef unicode canon_string(self, size_t i)
cpdef unicode shape_string(self, size_t i) cpdef unicode shape_string(self, size_t i)
cpdef unicode non_sparse_string(self, size_t i) cpdef unicode non_sparse_string(self, size_t i)
cpdef unicode asciied(self, size_t i) cpdef unicode asciied_string(self, size_t i)
cpdef bint is_alpha(self, size_t i) cpdef bint is_alpha(self, size_t i)
cpdef bint is_ascii(self, size_t i) cpdef bint is_ascii(self, size_t i)
cpdef bint is_digit(self, size_t i) cpdef bint is_digit(self, size_t i)

View File

@ -137,8 +137,20 @@ cdef class EnglishTokens(Tokens):
cpdef unicode non_sparse_string(self, size_t i): cpdef unicode non_sparse_string(self, size_t i):
return lexeme_string_view(self.lexemes[i], View_NonSparse) return lexeme_string_view(self.lexemes[i], View_NonSparse)
cpdef unicode asciied(self, size_t i): cpdef unicode asciied_string(self, size_t i):
return lexeme_check_flag(self.lexemes[i], View_Asciied) return lexeme_string_view(self.lexemes[i], View_Asciied)
cpdef size_t canon(self, size_t i):
    """Return an integer feature value for the canonical-form view of token i.

    The stored view value is cast straight to size_t.  Passing it through the
    Python builtin id() would first box the C value into a temporary Python
    object and return that temporary's address, which is not stable between
    calls and can collide across different lexemes.
    """
    # NOTE(review): assumes views[...] holds a pointer- or integer-sized C
    # value (LexemeC is declared elsewhere) -- confirm against its definition.
    return <size_t>self.lexemes[i].views[<size_t>View_CanonForm]
cpdef size_t shape(self, size_t i):
    """Return an integer feature value for the word-shape view of token i.

    The stored view value is cast straight to size_t.  Passing it through the
    Python builtin id() would first box the C value into a temporary Python
    object and return that temporary's address, which is not stable between
    calls and can collide across different lexemes.
    """
    # NOTE(review): assumes views[...] holds a pointer- or integer-sized C
    # value (LexemeC is declared elsewhere) -- confirm against its definition.
    return <size_t>self.lexemes[i].views[<size_t>View_WordShape]
cpdef size_t non_sparse(self, size_t i):
    """Return an integer feature value for the non-sparse view of token i.

    The stored view value is cast straight to size_t.  Passing it through the
    Python builtin id() would first box the C value into a temporary Python
    object and return that temporary's address, which is not stable between
    calls and can collide across different lexemes.
    """
    # NOTE(review): assumes views[...] holds a pointer- or integer-sized C
    # value (LexemeC is declared elsewhere) -- confirm against its definition.
    return <size_t>self.lexemes[i].views[<size_t>View_NonSparse]
cpdef size_t asciied(self, size_t i):
    """Return an integer feature value for the asciied view of token i.

    The stored view value is cast straight to size_t.  Passing it through the
    Python builtin id() would first box the C value into a temporary Python
    object and return that temporary's address, which is not stable between
    calls and can collide across different lexemes.
    """
    # NOTE(review): assumes views[...] holds a pointer- or integer-sized C
    # value (LexemeC is declared elsewhere) -- confirm against its definition.
    return <size_t>self.lexemes[i].views[<size_t>View_Asciied]
cpdef bint is_alpha(self, size_t i): cpdef bint is_alpha(self, size_t i):
return lexeme_check_flag(self.lexemes[i], Flag_IsAlpha) return lexeme_check_flag(self.lexemes[i], Flag_IsAlpha)

View File

@ -7,6 +7,7 @@ cdef class Tokens:
cdef LexemeC** lexemes cdef LexemeC** lexemes
cdef int push_back(self, LexemeC* lexeme) except -1 cdef int push_back(self, LexemeC* lexeme) except -1
cpdef size_t id(self, size_t i)
cpdef unicode string(self, size_t i) cpdef unicode string(self, size_t i)
cpdef double prob(self, size_t i) cpdef double prob(self, size_t i)
cpdef size_t cluster(self, size_t i) cpdef size_t cluster(self, size_t i)

View File

@ -55,6 +55,9 @@ cdef class Tokens:
cdef bytes byte_string = self.lexemes[i].string cdef bytes byte_string = self.lexemes[i].string
return byte_string.decode('utf8') return byte_string.decode('utf8')
cpdef size_t id(self, size_t i):
    """Return an integer identifier for token i, taken from its lexeme string.

    The identifier is the address of the lexeme's C string, cast to size_t.
    Inside this method body the name id() resolves to the Python builtin, not
    to this method; calling it on the C string would coerce the string to a
    temporary bytes object and return the address of that temporary, which
    changes (or is reused) from call to call.
    """
    # NOTE(review): the value is only stable for as long as the lexeme's
    # string buffer is neither moved nor freed -- confirm the owner's lifetime.
    return <size_t>self.lexemes[i].string
cpdef double prob(self, size_t i): cpdef double prob(self, size_t i):
return self.lexemes[i].prob return self.lexemes[i].prob