* Start laying out parse tree iteration methods

This commit is contained in:
Matthew Honnibal 2014-12-20 01:42:09 +11:00
parent 53b8bc1f3c
commit 8fd9762d91
4 changed files with 26 additions and 1 deletions

View File

@ -131,4 +131,5 @@ cpdef enum:
cdef class English(Language):
pass
# C-level method declarations added to English for parse-tree iteration.
# Each takes a pointer to a single const TokenC and returns a C int; the
# `except -1` clause reserves a return value of -1 to signal a raised exception.
# NOTE(review): judging by the names, these presumably test whether a token
# ends a base noun phrase / lies outside one -- confirm once implemented.
cdef int is_base_np_end(self, const TokenC* token) except -1
cdef int is_outside_base_np(self, const TokenC* token) except -1

View File

@ -205,6 +205,13 @@ cdef class English(Language):
c += t[i].pos == golds[i]
return c
# Placeholder override: the body is only `pass`, so no logic is implemented
# yet and `token` is never inspected.
# NOTE(review): the Language base-class version in this commit raises
# NotImplementedError, whereas this stub completes silently -- presumably it
# yields the default int value (0) to callers; confirm that is intended.
cdef int is_base_np_end(self, const TokenC* token) except -1:
pass
# Placeholder override: the body is only `pass`, so no logic is implemented
# yet and `token` is never inspected.
# NOTE(review): the Language base-class version in this commit raises
# NotImplementedError, whereas this stub completes silently -- presumably it
# yields the default int value (0) to callers; confirm that is intended.
cdef int is_outside_base_np(self, const TokenC* token) except -1:
pass
cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
_fill_from_token(&context[P2_sic], &tokens[i-2])

View File

@ -54,6 +54,7 @@ cdef class Language:
cpdef Tokens tokens_from_list(self, list strings)
cpdef Tokens tokenize(self, unicode text)
cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
@ -65,3 +66,6 @@ cdef class Language:
cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
# C-level method declarations added to Language for parse-tree iteration.
# Each takes a pointer to a single const TokenC and returns a C int; the
# `except -1` clause reserves a return value of -1 to signal a raised exception.
# NOTE(review): judging by the names, these presumably test whether a token
# ends a base noun phrase / lies outside one -- confirm against the subclasses.
cdef int is_base_np_end(self, const TokenC* token) except -1
cdef int is_outside_base_np(self, const TokenC* token) except -1

View File

@ -81,6 +81,11 @@ cdef class Language:
* The data/<lang>/prefix file, used to build a regex to split off prefixes;
* The data/<lang>/suffix file, used to build a regex to split off suffixes.
The string is first split on whitespace. To tokenize a whitespace-delimited
chunk, we first try to look it up in the special-cases. If it's not found,
we split off a prefix, and then try again. If it's still not found, we
split off a suffix, and repeat.
Args:
string (unicode): The string to be tokenized.
@ -274,6 +279,14 @@ cdef class Language:
self._specials.set(string.key, cached)
self._cache.set(string.key, cached)
# Base-class hook with no default behaviour: calling it always raises
# NotImplementedError. `token` is not read here.
# The `except -1` clause in the signature is what lets the exception
# propagate out of a function with a C int return type.
cdef int is_base_np_end(self, const TokenC* token) except -1:
raise NotImplementedError
# Base-class hook with no default behaviour: calling it always raises
# NotImplementedError. `token` is not read here.
# The `except -1` clause in the signature is what lets the exception
# propagate out of a function with a C int return type.
cdef int is_outside_base_np(self, const TokenC* token) except -1:
raise NotImplementedError
cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
morph.number = props.get('number', 0)