def base_nps(self):
    """Yield base noun phrases as ``(start, end)`` token-index pairs.

    Method of ``Tokens`` (spacy/tokens.pyx). Both indices are inclusive:
    ``end`` is the index of the token accepted by
    ``self.lang.is_base_np_end`` and ``start`` is the index of the first
    token after the nearest preceding token accepted by
    ``self.lang.is_outside_base_np`` (or 0 if there is no such token).

    The tokens are scanned from last to first, so phrases are yielded in
    reverse document order.
    NOTE(review): the exact linguistic criteria live in the two ``lang``
    predicates, which are not visible here -- confirm against their
    definitions.
    """
    # Iterate backwards, looking for the token that ends a noun phrase, and
    # once one is found, for the first word outside the phrase. We want
    # greedy matching, so it is easier to anchor on the final noun.
    cdef TokenC* token
    cdef int end = -1  # -1 means "not currently inside a phrase"
    for i in range(self.length - 1, -1, -1):
        token = &self.data[i]
        if end == -1:
            if self.lang.is_base_np_end(token):
                end = i
        elif self.lang.is_outside_base_np(token):
            # Token i is outside the phrase, and we are walking backwards,
            # so the phrase begins on the token just after it.
            yield i + 1, end
            end = -1
    # A phrase that extends to the very first token never meets an outside
    # word inside the loop; emit it here rather than dropping it.
    if end != -1:
        yield 0, end