From 74d8cb39804ae7b74f09fc8e40316c3e5c806038 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Thu, 30 Jul 2015 02:29:49 +0200 Subject: [PATCH] * Add noun_chunks iterator, and fix left/right child setting in Doc.merge --- spacy/tokens/doc.pyx | 89 ++++++++++++++++++++++++-------------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index feb11bd87..ef901291d 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..parts_of_speech import UNIV_POS_NAMES -from ..parts_of_speech cimport CONJ, PUNCT +from ..parts_of_speech cimport CONJ, PUNCT, NOUN from ..lexeme cimport check_flag from ..lexeme cimport get_attr as get_lex_attr -from .spans import Span +from .spans cimport Span from .token cimport Token from ..serialize.bits cimport BitArray @@ -154,6 +154,18 @@ cdef class Doc: if start != -1: yield Span(self, start, self.length, label=label) + @property + def noun_chunks(self): + """Yield spans for base noun phrases.""" + cdef const TokenC* word + labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj'] + np_deps = [self.vocab.strings[label] for label in labels] + np_label = self.vocab.strings['NP'] + for i in range(self.length): + word = &self.data[i] + if word.pos == NOUN and word.dep in np_deps: + yield Span(self, word.l_edge, i+1, label=np_label) + @property def sents(self): """ @@ -297,20 +309,7 @@ cdef class Doc: elif attr_id == ENT_TYPE: for i in range(length): tokens[i].ent_type = values[i] - cdef TokenC* head - cdef TokenC* child - # Set left edges - for i in range(length): - child = &tokens[i] - head = &tokens[i + child.head] - if child < head and child.l_edge < head.l_edge: - head.l_edge = child.l_edge - # Set right edges --- same as above, but iterate in reverse - for i in range(length-1, -1, -1): - child = &tokens[i] - head = &tokens[i + child.head] - if child > head and child.r_edge > head.r_edge: - head.r_edge = child.r_edge + set_children_from_heads(self.data, self.length) return self def to_bytes(self): @@ -354,9 +353,12 @@ cdef class Doc: break else: return None - cdef unicode string = self.string + + cdef Span span = self[start:end] # Get LexemeC for newly merged token - new_orth = string[start_idx:end_idx] + new_orth = ''.join([t.string for t in span]) + if span[-1].whitespace_: + new_orth = new_orth[:-1] cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts cdef TokenC* token = &self.data[start] @@ -372,30 +374,16 @@ cdef class Doc: else: token.ent_iob = 3 token.ent_type = self.vocab.strings[ent_type] - # Fix dependencies # Begin by setting all the head indices to absolute token positions # This is easier to work with for now than the offsets + # Before thinking of something simpler, beware the case where a dependency + # bridges over the entity. Here the alignment of the tokens changes. + span_root = span.root.i for i in range(self.length): self.data[i].head += i - # Find the head of the merged token, and its dep relation - outer_heads = {} - for i in range(start, end): - head_idx = self.data[i].head - if head_idx == i or head_idx < start or head_idx >= end: - # Don't consider "heads" which are actually dominated by a word - # in the region we're merging - gp = head_idx - while self.data[gp].head != gp: - if start <= gp < end: - break - gp = self.data[gp].head - else: - # If we have multiple words attaching to the same head, - # but with different dep labels, we're preferring the last - # occurring dep label. Shrug. What else could we do, I guess? - outer_heads[head_idx] = self.data[i].dep - - token.head, token.dep = max(outer_heads.items()) + # Set the head of the merged token, and its dep relation, from the Span + token.head = self.data[span_root].head + token.dep = span.root.dep # Adjust deps before shrinking tokens # Tokens which point into the merged token should now point to it # Subtract the offset from all tokens which point to >= end @@ -406,7 +394,6 @@ cdef class Doc: self.data[i].head = start elif head_idx >= end: self.data[i].head -= offset - # TODO: Fix left and right deps # Now compress the token array for i in range(end, self.length): self.data[i - offset] = self.data[i] @@ -417,6 +404,28 @@ cdef class Doc: for i in range(self.length): # ...And, set heads back to a relative position self.data[i].head -= i - + # Set the left/right children, left/right edges + set_children_from_heads(self.data, self.length) + # Clear the cached Python objects + self._py_tokens = [None] * self.length # Return the merged Python object return self[start] + + +cdef int set_children_from_heads(TokenC* tokens, int length) except -1: + cdef TokenC* head + cdef TokenC* child + cdef int i + # Set left edges + for i in range(length): + child = &tokens[i] + head = &tokens[i + child.head] + if child < head and child.l_edge < head.l_edge: + head.l_edge = child.l_edge + # Set right edges --- same as above, but iterate in reverse + for i in range(length-1, -1, -1): + child = &tokens[i] + head = &tokens[i + child.head] + if child > head and child.r_edge > head.r_edge: + head.r_edge = child.r_edge +