diff --git a/spacy/en.pxd b/spacy/en.pxd
index 2ca081e47..66ccd64ce 100644
--- a/spacy/en.pxd
+++ b/spacy/en.pxd
@@ -131,4 +131,5 @@ cpdef enum:
 
 
 cdef class English(Language):
-    pass
+    cdef int is_base_np_end(self, const TokenC* token) except -1
+    cdef int is_outside_base_np(self, const TokenC* token) except -1
diff --git a/spacy/en.pyx b/spacy/en.pyx
index 80bea551f..ba4f747dd 100644
--- a/spacy/en.pyx
+++ b/spacy/en.pyx
@@ -205,6 +205,13 @@ cdef class English(Language):
             c += t[i].pos == golds[i]
         return c
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1:
+        pass
+
+    cdef int is_outside_base_np(self, const TokenC* token) except -1:
+        pass
+
+
 
 cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
     _fill_from_token(&context[P2_sic], &tokens[i-2])
diff --git a/spacy/lang.pxd b/spacy/lang.pxd
index 12714465c..de7ecd54a 100644
--- a/spacy/lang.pxd
+++ b/spacy/lang.pxd
@@ -54,6 +54,7 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+    cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
@@ -64,4 +65,7 @@ cdef class Language:
     cdef int _find_suffix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
+
+    cdef int is_base_np_end(self, const TokenC* token) except -1
+    cdef int is_outside_base_np(self, const TokenC* token) except -1
diff --git a/spacy/lang.pyx b/spacy/lang.pyx
index 4fc15e89b..f8fe7a0a3 100644
--- a/spacy/lang.pyx
+++ b/spacy/lang.pyx
@@ -81,6 +81,11 @@ cdef class Language:
         * The data/<lang>/prefix file, used to build a regex to split off prefixes;
         * The data/<lang>/suffix file, used to build a regex to split off suffixes.
 
+        The string is first split on whitespace. To tokenize a whitespace-delimited
+        chunk, we first try to look it up in the special-cases. If it's not found,
+        we split off a prefix, and then try again. If it's still not found, we
+        split off a suffix, and repeat.
+
         Args:
             string (unicode): The string to be tokenized.
@@ -274,6 +279,14 @@ cdef class Language:
            self._specials.set(string.key, cached)
            self._cache.set(string.key, cached)
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1:
+        raise NotImplementedError
+
+    cdef int is_outside_base_np(self, const TokenC* token) except -1:
+        raise NotImplementedError
+
+
+
 
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
     morph.number = props.get('number', 0)
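
The new docstring in spacy/lang.pyx describes the tokenization loop in prose: split the input on whitespace, look each chunk up in the special-case table, and otherwise peel off prefixes and suffixes until the chunk matches or is consumed. Below is a minimal pure-Python sketch of that loop. The prefix/suffix patterns, the SPECIAL_CASES table, and the tokenize() function here are illustrative stand-ins for the data/<lang>/ files and the Cython implementation in Language._tokenize, not the library's actual code.

import re

# Stand-ins for the data/<lang>/prefix and data/<lang>/suffix regexes and the
# special-case table; these patterns and names are assumptions for this sketch.
PREFIX_RE = re.compile(r"""^["'(\[]""")
SUFFIX_RE = re.compile(r"""["')\].,;:!?]$""")
SPECIAL_CASES = {"don't": ["do", "n't"], "U.K.": ["U.K."]}


def tokenize(text):
    tokens = []
    # The string is first split on whitespace.
    for chunk in text.split():
        prefixes, suffixes = [], []
        while chunk:
            # First, try to look the chunk up in the special cases.
            if chunk in SPECIAL_CASES:
                subtokens = SPECIAL_CASES[chunk]
                break
            # Not found: split off a prefix, then try again.
            m = PREFIX_RE.search(chunk)
            if m is not None:
                prefixes.append(m.group())
                chunk = chunk[m.end():]
                continue
            # Still not found: split off a suffix, and repeat.
            m = SUFFIX_RE.search(chunk)
            if m is not None:
                suffixes.append(m.group())
                chunk = chunk[:m.start()]
                continue
            # Nothing left to strip: emit the remainder as a single token.
            subtokens = [chunk]
            break
        else:
            # The chunk was consumed entirely by affix splitting.
            subtokens = []
        tokens.extend(prefixes)
        tokens.extend(subtokens)
        tokens.extend(reversed(suffixes))
    return tokens


print(tokenize('"I don\'t know," she said.'))
# ['"', 'I', 'do', "n't", 'know', ',', '"', 'she', 'said', '.']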