From b66112cb90d7223a7de7c2f61c49c6d71a071334 Mon Sep 17 00:00:00 2001
From: richardpaulhudson
Date: Fri, 9 Sep 2022 21:15:38 +0200
Subject: [PATCH] Intermediate state

---
 spacy/tokens/doc.pxd |  2 ++
 spacy/tokens/doc.pyx | 42 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/spacy/tokens/doc.pxd b/spacy/tokens/doc.pxd
index 57d087958..ae5e945f5 100644
--- a/spacy/tokens/doc.pxd
+++ b/spacy/tokens/doc.pxd
@@ -9,6 +9,7 @@ from ..attrs cimport attr_id_t
 
 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil
 cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name) nogil
+cdef np.ndarray init_array(int num_tokens, int length)
 
 
 ctypedef const LexemeC* const_Lexeme_ptr
@@ -69,3 +70,4 @@ cdef class Doc:
     cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1
 
     cpdef np.ndarray to_array(self, object features)
+
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index 7ba9a3341..987690dc3 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -94,6 +94,12 @@ cdef attr_t get_token_attr_for_matcher(const TokenC* token, attr_id_t feat_name)
     return get_token_attr(token, feat_name)
 
 
+cdef np.ndarray init_array(int num_tokens, int length):
+    cdef np.ndarray output = numpy.zeros((num_tokens, length), dtype='uint8')
+    output.fill(255)
+    return output
+
+
 class SetEntsDefault(str, Enum):
     blocked = "blocked"
     missing = "missing"
@@ -1734,6 +1740,42 @@ cdef class Doc:
             j += 1
         return output
 
+    def get_suffixes(self, int min_length, int max_length, special_chars:str, int sc_min_length, int sc_max_length):
+        """
+        TODO
+        """
+        byte_strings = [token.orth_.encode('utf8') for token in self]
+        special_chars_enc = special_chars.encode('utf8')
+        cdef num_tokens = len(byte_strings)
+        outputs = []
+        for length in range(min_length, max_length+1):
+            outputs.append(init_array(num_tokens, length))
+        for length in range(sc_min_length, sc_max_length+1):
+            outputs.append(init_array(num_tokens, length))
+
+        cdef int token_i, sc_char_i, idx
+        cdef bytes byte_string
+        cdef unsigned char utf8_char
+        cdef num_normal_arr = 1 + max_length - min_length
+        cdef num_sc_arr = 1 + sc_max_length - sc_min_length
+        for token_i, byte_string in enumerate(byte_strings):
+            sc_char_i = 0
+            idx = 0
+            while (idx < max_length or sc_char_i < sc_max_length) and idx < len(byte_string):
+                this_char = byte_string[len(byte_string) - (1 + idx)]
+                for normal_arr_i in range(num_normal_arr - 1, -1, -1):
+                    if idx >= normal_arr_i + min_length:
+                        break
+                    outputs[normal_arr_i][token_i, idx] = this_char
+                if this_char in special_chars_enc:
+                    for sc_arr_i in range(num_sc_arr - 1, -1, -1):
+                        if sc_char_i >= sc_arr_i + sc_min_length:
+                            break
+                        outputs[sc_arr_i + num_normal_arr][token_i, sc_char_i] = this_char
+                    sc_char_i += 1
+                idx += 1
+        return outputs
+
     @staticmethod
     def _get_array_attrs():
         attrs = [LENGTH, SPACY]