Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 18:06:29 +03:00
* Start laying out parse tree iteration methods
This commit is contained in:
parent 53b8bc1f3c
commit 8fd9762d91
@@ -131,4 +131,5 @@ cpdef enum:
 
 
 cdef class English(Language):
-    pass
+    cdef int is_base_np_end(self, const TokenC* token) except -1
+    cdef int is_outside_base_np(self, const TokenC* token) except -1
@@ -205,6 +205,13 @@ cdef class English(Language):
             c += t[i].pos == golds[i]
         return c
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1:
+        pass
+
+    cdef int is_outside_base_np(self, const TokenC* token) except -1:
+        pass
+
+
 
 cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
     _fill_from_token(&context[P2_sic], &tokens[i-2])
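The two English methods added above are only stubs so far: given a token, they are meant to report whether it ends a base noun phrase or lies outside any base noun phrase. Purely as an illustration of how such boundary predicates can drive chunk iteration (this is not spaCy code; the Token tuple and its precomputed flags are hypothetical stand-ins for whatever these methods will eventually compute), a plain-Python sketch might look like this:

# Illustrative only: hypothetical token objects carrying precomputed flags,
# standing in for the is_base_np_end / is_outside_base_np predicates above.
from collections import namedtuple

Token = namedtuple("Token", ["text", "is_base_np_end", "is_outside_base_np"])


def iter_base_nps(tokens):
    """Yield lists of tokens, one list per base noun phrase."""
    chunk = []
    for token in tokens:
        if token.is_outside_base_np:
            continue                     # token is not part of any base NP
        chunk.append(token)
        if token.is_base_np_end:         # boundary reached: emit the chunk
            yield chunk
            chunk = []


sent = [
    Token("The", False, False),
    Token("cat", True, False),
    Token("sat", False, True),
    Token("on", False, True),
    Token("the", False, False),
    Token("mat", True, False),
]
for np in iter_base_nps(sent):
    print(" ".join(t.text for t in np))
# The cat
# the mat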
@@ -54,6 +54,7 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
@@ -65,3 +66,6 @@ cdef class Language:
     cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1
+    cdef int is_outside_base_np(self, const TokenC* token) except -1
+
@@ -81,6 +81,11 @@ cdef class Language:
         * The data/<lang>/prefix file, used to build a regex to split off prefixes;
         * The data/<lang>/suffix file, used to build a regex to split off suffixes.
 
+        The string is first split on whitespace. To tokenize a whitespace-delimited
+        chunk, we first try to look it up in the special-cases. If it's not found,
+        we split off a prefix, and then try again. If it's still not found, we
+        split off a suffix, and repeat.
+
         Args:
             string (unicode): The string to be tokenized.
 
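The docstring added above describes the tokenization loop in prose: split on whitespace, try the special-cases table, otherwise peel off a prefix and retry, then a suffix, and repeat. A rough plain-Python sketch of that control flow follows; the PREFIX_RE, SUFFIX_RE and SPECIAL_CASES names are hypothetical stand-ins for the data/<lang> rules the docstring mentions, not the library's actual data structures or API.

import re

# Hypothetical stand-ins for the data/<lang>/prefix and data/<lang>/suffix rules
# and for the special-cases table referred to in the docstring.
PREFIX_RE = re.compile(r'^["\'\(\[]')
SUFFIX_RE = re.compile(r'["\'\)\]\.,;!?]$')
SPECIAL_CASES = {"don't": ["do", "n't"], "U.S.": ["U.S."]}


def tokenize(text):
    tokens = []
    for chunk in text.split():           # 1) split on whitespace
        tokens.extend(tokenize_chunk(chunk))
    return tokens


def tokenize_chunk(chunk):
    prefixes, suffixes = [], []
    while chunk:
        if chunk in SPECIAL_CASES:       # 2) try the special-cases table
            return prefixes + SPECIAL_CASES[chunk] + suffixes
        pre = PREFIX_RE.search(chunk)
        if pre:                          # 3) split off a prefix and try again
            prefixes.append(pre.group())
            chunk = chunk[pre.end():]
            continue
        suf = SUFFIX_RE.search(chunk)
        if suf:                          # 4) split off a suffix and repeat
            suffixes.insert(0, suf.group())
            chunk = chunk[:suf.start()]
            continue
        return prefixes + [chunk] + suffixes   # nothing left to peel off
    return prefixes + suffixes


print(tokenize('He said, "don\'t go."'))
# ['He', 'said', ',', '"', 'do', "n't", 'go', '.', '"']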
@@ -274,6 +279,14 @@ cdef class Language:
             self._specials.set(string.key, cached)
             self._cache.set(string.key, cached)
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1:
+        raise NotImplementedError
+
+    cdef int is_outside_base_np(self, const TokenC* token) except -1:
+        raise NotImplementedError
+
+
+
 
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
     morph.number = props.get('number', 0)