Mirror of https://github.com/explosion/spaCy.git, synced 2024-12-26 18:06:29 +03:00
* Start laying out parse tree iteration methods
This commit is contained in:
parent 53b8bc1f3c
commit 8fd9762d91
@@ -131,4 +131,5 @@ cpdef enum:
 
 
 cdef class English(Language):
-    pass
+    cdef int is_base_np_end(self, const TokenC* token) except -1
+    cdef int is_outside_base_np(self, const TokenC* token) except -1
@@ -205,6 +205,13 @@ cdef class English(Language):
             c += t[i].pos == golds[i]
         return c
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1:
+        pass
+
+    cdef int is_outside_base_np(self, const TokenC* token) except -1:
+        pass
+
+
 
 cdef int fill_pos_context(atom_t* context, const int i, const TokenC* tokens) except -1:
     _fill_from_token(&context[P2_sic], &tokens[i-2])
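The two English methods added above are only stubs so far: given a token, they are meant to report whether it ends a base noun phrase or lies outside any base noun phrase. Purely as an illustration of how such boundary predicates can drive chunk iteration (this is not spaCy code; the Token tuple and its precomputed flags are hypothetical stand-ins for whatever these methods will eventually compute), a plain-Python sketch might look like this:

# Illustrative only: hypothetical token objects carrying precomputed flags,
# standing in for the is_base_np_end / is_outside_base_np predicates above.
from collections import namedtuple

Token = namedtuple("Token", ["text", "is_base_np_end", "is_outside_base_np"])


def iter_base_nps(tokens):
    """Yield lists of tokens, one list per base noun phrase."""
    chunk = []
    for token in tokens:
        if token.is_outside_base_np:
            continue                     # token is not part of any base NP
        chunk.append(token)
        if token.is_base_np_end:         # boundary reached: emit the chunk
            yield chunk
            chunk = []


sent = [
    Token("The", False, False),
    Token("cat", True, False),
    Token("sat", False, True),
    Token("on", False, True),
    Token("the", False, False),
    Token("mat", True, False),
]
for np in iter_base_nps(sent):
    print(" ".join(t.text for t in np))
# The cat
# the mat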
@@ -54,6 +54,7 @@ cdef class Language:
     cpdef Tokens tokens_from_list(self, list strings)
     cpdef Tokens tokenize(self, unicode text)
 
+
     cdef int _try_cache(self, int idx, hash_t key, Tokens tokens) except -1
     cdef int _tokenize(self, Tokens tokens, UniStr* span, int start, int end) except -1
     cdef UniStr* _split_affixes(self, UniStr* string, vector[Lexeme*] *prefixes,
@@ -65,3 +66,6 @@ cdef class Language:
     cdef int _find_infix(self, Py_UNICODE* characters, size_t length) except -1
     cdef int _save_cached(self, const TokenC* tokens, hash_t key, int n) except -1
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1
+    cdef int is_outside_base_np(self, const TokenC* token) except -1
+
@@ -81,6 +81,11 @@ cdef class Language:
         * The data/<lang>/prefix file, used to build a regex to split off prefixes;
         * The data/<lang>/suffix file, used to build a regex to split off suffixes.
 
+        The string is first split on whitespace. To tokenize a whitespace-delimited
+        chunk, we first try to look it up in the special-cases. If it's not found,
+        we split off a prefix, and then try again. If it's still not found, we
+        split off a suffix, and repeat.
+
         Args:
             string (unicode): The string to be tokenized.
 
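The docstring added above describes the tokenization loop in prose: split on whitespace, try the special-cases table, otherwise peel off a prefix and retry, then a suffix, and repeat. A rough plain-Python sketch of that control flow follows; the PREFIX_RE, SUFFIX_RE and SPECIAL_CASES names are hypothetical stand-ins for the data/<lang> rules the docstring mentions, not the library's actual data structures or API.

import re

# Hypothetical stand-ins for the data/<lang>/prefix and data/<lang>/suffix rules
# and for the special-cases table referred to in the docstring.
PREFIX_RE = re.compile(r'^["\'\(\[]')
SUFFIX_RE = re.compile(r'["\'\)\]\.,;!?]$')
SPECIAL_CASES = {"don't": ["do", "n't"], "U.S.": ["U.S."]}


def tokenize(text):
    tokens = []
    for chunk in text.split():           # 1) split on whitespace
        tokens.extend(tokenize_chunk(chunk))
    return tokens


def tokenize_chunk(chunk):
    prefixes, suffixes = [], []
    while chunk:
        if chunk in SPECIAL_CASES:       # 2) try the special-cases table
            return prefixes + SPECIAL_CASES[chunk] + suffixes
        pre = PREFIX_RE.search(chunk)
        if pre:                          # 3) split off a prefix and try again
            prefixes.append(pre.group())
            chunk = chunk[pre.end():]
            continue
        suf = SUFFIX_RE.search(chunk)
        if suf:                          # 4) split off a suffix and repeat
            suffixes.insert(0, suf.group())
            chunk = chunk[:suf.start()]
            continue
        return prefixes + [chunk] + suffixes   # nothing left to peel off
    return prefixes + suffixes


print(tokenize('He said, "don\'t go."'))
# ['He', 'said', ',', '"', 'do', "n't", 'go', '.', '"']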
@@ -274,6 +279,14 @@ cdef class Language:
             self._specials.set(string.key, cached)
             self._cache.set(string.key, cached)
 
+    cdef int is_base_np_end(self, const TokenC* token) except -1:
+        raise NotImplementedError
+
+    cdef int is_outside_base_np(self, const TokenC* token) except -1:
+        raise NotImplementedError
+
+
+
 
 cdef int set_morph_from_dict(Morphology* morph, dict props) except -1:
     morph.number = props.get('number', 0)