From 1a3222af4b2166375538f78bed01bc836bb851d1 Mon Sep 17 00:00:00 2001
From: Matthew Honnibal
Date: Thu, 11 Sep 2014 16:57:08 +0200
Subject: [PATCH] * Moving tokens to use an array internally, instead of a list of Lexeme objects.

---
Review notes (text in this section is ignored by `git am`):

* Tokens now stores a growable C array of LexemeC* (fields `lexemes`,
  `size` = allocated slots, `length` = slots in use) instead of a Python
  list.  Accessors go through the module-level helpers lexeme_check_flag()
  and lexeme_string_view() rather than methods on a Lexeme object.
* EnglishTokens.asciied() returns a string view.  It previously called
  lexeme_check_flag() with a view ID, which yields a bint from a method
  declared `cpdef unicode`; it now calls lexeme_string_view() like the
  other string accessors.
* Tokens.push_back() must grow the array when it is full, i.e. when
  length == size.  The earlier test `(size + 1) == length` could not
  become true before the write at lexemes[size] had already run past the
  end of the allocation.
* Tokens.__dealloc__() frees the pointer array.  Only the array itself is
  released; the LexemeC structs it points to are not freed here.
* The results of calloc()/realloc() are cast to LexemeC** explicitly, since
  Cython does not implicitly convert void* to a typed pointer.
* Language.tokenize() wraps each LexemeC* in a Python Lexeme.  The pointer
  is passed as <size_t>; this assumes Lexeme's constructor takes the
  address as an integer -- TODO confirm against spacy/word.pyx.
* realloc() failure is still unchecked -- TODO(review).

 spacy/en.pyx     | 58 +++++++++++++++++++++++++-----------------------
 spacy/lang.pyx   |  5 ++++-
 spacy/tokens.pxd |  9 ++++++--
 spacy/tokens.pyx | 35 ++++++++++++++++++++++++------
 4 files changed, 69 insertions(+), 38 deletions(-)

diff --git a/spacy/en.pyx b/spacy/en.pyx
index 62e195ca8..785d21e24 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -41,6 +41,8 @@ from libc.stdlib cimport malloc, calloc, free
 from libc.stdint cimport uint64_t
 
 cimport lang
+from spacy.lexeme cimport lexeme_check_flag
+from spacy.lexeme cimport lexeme_string_view
 
 from spacy import util
 
@@ -127,88 +129,88 @@ cdef class EnglishTokens(Tokens):
     # Without these, clients have to use the underlying string_view and check_flag
     # methods, which requires them to know the IDs.
     cpdef unicode canon_string(self, size_t i):
-        return self.lexemes[i].string_view(View_CanonForm)
+        return lexeme_string_view(self.lexemes[i], View_CanonForm)
 
     cpdef unicode shape_string(self, size_t i):
-        return self.lexemes[i].string_view(View_WordShape)
+        return lexeme_string_view(self.lexemes[i], View_WordShape)
 
     cpdef unicode non_sparse_string(self, size_t i):
-        return self.lexemes[i].string_view(View_NonSparse)
+        return lexeme_string_view(self.lexemes[i], View_NonSparse)
 
     cpdef unicode asciied(self, size_t i):
-        return self.lexemes[i].string_views(View_Asciied)
+        return lexeme_string_view(self.lexemes[i], View_Asciied)
 
     cpdef bint is_alpha(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsAlpha)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsAlpha)
 
     cpdef bint is_ascii(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsAscii)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsAscii)
 
     cpdef bint is_digit(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsDigit)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsDigit)
 
     cpdef bint is_lower(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsLower)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsLower)
 
     cpdef bint is_punct(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsPunct)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsPunct)
 
     cpdef bint is_space(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsSpace)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsSpace)
 
     cpdef bint is_title(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsTitle)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsTitle)
 
     cpdef bint is_upper(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_IsUpper)
+        return lexeme_check_flag(self.lexemes[i], Flag_IsUpper)
 
     cpdef bint can_adj(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanAdj)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanAdj)
 
     cpdef bint can_adp(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanAdp)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanAdp)
 
     cpdef bint can_adv(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanAdv)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanAdv)
 
     cpdef bint can_conj(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanConj)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanConj)
 
     cpdef bint can_det(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanDet)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanDet)
 
     cpdef bint can_noun(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanNoun)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanNoun)
 
     cpdef bint can_num(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanNum)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanNum)
 
     cpdef bint can_pdt(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanPdt)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanPdt)
 
     cpdef bint can_pos(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanPos)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanPos)
 
     cpdef bint can_pron(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanPron)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanPron)
 
     cpdef bint can_prt(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanPrt)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanPrt)
 
     cpdef bint can_punct(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanPunct)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanPunct)
 
     cpdef bint can_verb(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_CanVerb)
+        return lexeme_check_flag(self.lexemes[i], Flag_CanVerb)
 
     cpdef bint oft_lower(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_OftLower)
+        return lexeme_check_flag(self.lexemes[i], Flag_OftLower)
 
     cpdef bint oft_title(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_OftTitle)
+        return lexeme_check_flag(self.lexemes[i], Flag_OftTitle)
 
     cpdef bint oft_upper(self, size_t i):
-        return self.lexemes[i].check_flag(i, Flag_OftUpper)
+        return lexeme_check_flag(self.lexemes[i], Flag_OftUpper)
 
 
 cdef class English(Language):
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 2470deb7f..96ec6797c 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -93,7 +93,10 @@ cdef class Language:
         if start < i:
             self._tokenize(tokens, string[start:i])
         assert tokens
-        return tokens.lexemes
+        output = []
+        for i in range(tokens.length):
+            output.append(Lexeme(<size_t>tokens.lexemes[i]))
+        return output
 
     cdef _tokenize(self, Tokens tokens, unicode string):
         cdef list lexemes
diff --git a/spacy/tokens.pxd b/spacy/tokens.pxd
index 006880794..ccfe45d24 100644
--- a/spacy/tokens.pxd
+++ b/spacy/tokens.pxd
@@ -1,6 +1,11 @@
+from spacy.lexeme cimport LexemeC
+
 cdef class Tokens:
-    cdef list lexemes
-    cpdef append(self, object lexeme)
+    cdef size_t length
+    cdef size_t size
+
+    cdef LexemeC** lexemes
+    cdef push_back(self, LexemeC* lexeme)
 
     cpdef unicode string(self, size_t i)
     cpdef double prob(self, size_t i)
diff --git a/spacy/tokens.pyx b/spacy/tokens.pyx
index 25d8e1939..8495dbae0 100644
--- a/spacy/tokens.pyx
+++ b/spacy/tokens.pyx
@@ -1,3 +1,10 @@
+from libc.stdlib cimport calloc, free, realloc
+
+from spacy.word cimport Lexeme
+from spacy.lexeme cimport lexeme_check_flag
+from spacy.lexeme cimport lexeme_string_view
+
+
 cdef class Tokens:
     """A sequence of references to Lexeme objects.
 
@@ -17,14 +24,28 @@ cdef class Tokens:
     >>> tokens.can_noun(1)
     True
     """
-    def __cinit__(self):
-        self.lexemes = []
+    def __cinit__(self, size=100):
+        assert size >= 1
+        self.lexemes = <LexemeC**>calloc(size, sizeof(LexemeC*))
+        self.size = size
+        self.length = 0
+
+    def __dealloc__(self):
+        free(self.lexemes)
 
-    cpdef append(self, object lexeme):
-        self.lexemes.append(lexeme)
+    def append(self, Lexeme lexeme):
+        self.push_back(lexeme._c)
+
+    cdef push_back(self, LexemeC* lexeme):
+        if self.length == self.size:
+            self.size *= 2
+            self.lexemes = <LexemeC**>realloc(self.lexemes, self.size * sizeof(LexemeC*))
+        self.lexemes[self.length] = lexeme
+        self.length += 1
 
     cpdef unicode string(self, size_t i):
-        return self.lexemes[i].string
+        cdef bytes byte_string = self.lexemes[i].string
+        return byte_string.decode('utf8')
 
     cpdef double prob(self, size_t i):
         return self.lexemes[i].prob
@@ -33,7 +54,7 @@ cdef class Tokens:
         return self.lexemes[i].cluster
 
     cpdef bint check_flag(self, size_t i, size_t flag_id):
-        return self.lexemes[i].check_flag(flag_id)
+        return lexeme_check_flag(self.lexemes[i], flag_id)
 
     cpdef unicode string_view(self, size_t i, size_t view_id):
-        return self.lexemes[i].string_view(view_id)
+        return lexeme_string_view(self.lexemes[i], view_id)