Add noun_chunks iterator, and fix left/right child setting in Doc.merge

Matthew Honnibal 2015-07-30 02:29:49 +02:00
parent d153f18969
commit 74d8cb3980


@@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t
 from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
 from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
 from ..parts_of_speech import UNIV_POS_NAMES
-from ..parts_of_speech cimport CONJ, PUNCT
+from ..parts_of_speech cimport CONJ, PUNCT, NOUN
 from ..lexeme cimport check_flag
 from ..lexeme cimport get_attr as get_lex_attr
-from .spans import Span
+from .spans cimport Span
 from .token cimport Token
 from ..serialize.bits cimport BitArray
@@ -154,6 +154,18 @@ cdef class Doc:
         if start != -1:
             yield Span(self, start, self.length, label=label)
 
+    @property
+    def noun_chunks(self):
+        """Yield spans for base noun phrases."""
+        cdef const TokenC* word
+        labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj']
+        np_deps = [self.vocab.strings[label] for label in labels]
+        np_label = self.vocab.strings['NP']
+        for i in range(self.length):
+            word = &self.data[i]
+            if word.pos == NOUN and word.dep in np_deps:
+                yield Span(self, word.l_edge, i+1, label=np_label)
+
     @property
     def sents(self):
         """
@@ -297,20 +309,7 @@ cdef class Doc:
         elif attr_id == ENT_TYPE:
             for i in range(length):
                 tokens[i].ent_type = values[i]
-        cdef TokenC* head
-        cdef TokenC* child
-        # Set left edges
-        for i in range(length):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child < head and child.l_edge < head.l_edge:
-                head.l_edge = child.l_edge
-        # Set right edges --- same as above, but iterate in reverse
-        for i in range(length-1, -1, -1):
-            child = &tokens[i]
-            head = &tokens[i + child.head]
-            if child > head and child.r_edge > head.r_edge:
-                head.r_edge = child.r_edge
+        set_children_from_heads(self.data, self.length)
         return self
 
     def to_bytes(self):
@@ -354,9 +353,12 @@ cdef class Doc:
                 break
         else:
             return None
-        cdef unicode string = self.string
+        cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
-        new_orth = string[start_idx:end_idx]
+        new_orth = ''.join([t.string for t in span])
+        if span[-1].whitespace_:
+            new_orth = new_orth[:-1]
         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
         cdef TokenC* token = &self.data[start]
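
The merged token's text is now rebuilt from the span's tokens instead of being sliced out of the raw string. A plain-Python sketch of the same logic, with literal strings standing in for `t.string` (token text plus trailing whitespace) and a flag standing in for `span[-1].whitespace_`:

    # Stand-ins: each entry is one token's text plus its trailing whitespace.
    token_strings = [u'New ', u'York ']
    last_has_whitespace = True           # plays the role of span[-1].whitespace_

    new_orth = u''.join(token_strings)   # u'New York '
    if last_has_whitespace:
        new_orth = new_orth[:-1]         # drop the trailing space
    assert new_orth == u'New York'
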
@@ -372,30 +374,16 @@ cdef class Doc:
         else:
             token.ent_iob = 3
             token.ent_type = self.vocab.strings[ent_type]
-        # Fix dependencies
         # Begin by setting all the head indices to absolute token positions
         # This is easier to work with for now than the offsets
         # Before thinking of something simpler, beware the case where a dependency
         # bridges over the entity. Here the alignment of the tokens changes.
+        span_root = span.root.i
         for i in range(self.length):
             self.data[i].head += i
-        # Find the head of the merged token, and its dep relation
-        outer_heads = {}
-        for i in range(start, end):
-            head_idx = self.data[i].head
-            if head_idx == i or head_idx < start or head_idx >= end:
-                # Don't consider "heads" which are actually dominated by a word
-                # in the region we're merging
-                gp = head_idx
-                while self.data[gp].head != gp:
-                    if start <= gp < end:
-                        break
-                    gp = self.data[gp].head
-                else:
-                    # If we have multiple words attaching to the same head,
-                    # but with different dep labels, we're preferring the last
-                    # occurring dep label. Shrug. What else could we do, I guess?
-                    outer_heads[head_idx] = self.data[i].dep
-        token.head, token.dep = max(outer_heads.items())
+        # Set the head of the merged token, and its dep relation, from the Span
+        token.head = self.data[span_root].head
+        token.dep = span.root.dep
         # Adjust deps before shrinking tokens
         # Tokens which point into the merged token should now point to it
         # Subtract the offset from all tokens which point to >= end
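
The head bookkeeping is easier to follow on a toy array. A plain-Python sketch of the relative-to-absolute conversion and the remapping rule the comments describe (values hypothetical; `offset` is assumed to be the number of token slots the merge removes):

    # TokenC.head stores a relative offset: the head of token i sits at i + heads[i].
    heads = [3, 1, 1, 0, -1]                          # token 3 is the root
    absolute = [i + h for i, h in enumerate(heads)]   # -> [3, 2, 3, 3, 3]

    # Merge tokens [start, end) == tokens 1..2 into a single token at `start`.
    start, end = 1, 3
    offset = (end - start) - 1              # one slot disappears

    remapped = []
    for head in absolute:
        if start <= head < end:
            remapped.append(start)          # pointed into the merged region
        elif head >= end:
            remapped.append(head - offset)  # shifted left by the removed slots
        else:
            remapped.append(head)
    assert remapped == [2, 1, 2, 2, 2]
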
@@ -406,7 +394,6 @@ cdef class Doc:
                 self.data[i].head = start
             elif head_idx >= end:
                 self.data[i].head -= offset
-        # TODO: Fix left and right deps
         # Now compress the token array
         for i in range(end, self.length):
             self.data[i - offset] = self.data[i]
@@ -417,6 +404,28 @@ cdef class Doc:
         for i in range(self.length):
             # ...And, set heads back to a relative position
             self.data[i].head -= i
+        # Set the left/right children, left/right edges
+        set_children_from_heads(self.data, self.length)
         # Clear the cached Python objects
         self._py_tokens = [None] * self.length
         # Return the merged Python object
         return self[start]
+
+
+cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
+    cdef TokenC* head
+    cdef TokenC* child
+    cdef int i
+    # Set left edges
+    for i in range(length):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
+        if child < head and child.l_edge < head.l_edge:
+            head.l_edge = child.l_edge
+    # Set right edges --- same as above, but iterate in reverse
+    for i in range(length-1, -1, -1):
+        child = &tokens[i]
+        head = &tokens[i + child.head]
+        if child > head and child.r_edge > head.r_edge:
+            head.r_edge = child.r_edge
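
A plain-Python model of set_children_from_heads, using parallel lists instead of TokenC structs, may help check the invariant: assuming a projective parse, one left-to-right pass suffices for left edges because a token's own l_edge is final before any head to its right reads it (and symmetrically for right edges):

    def set_edges(heads):
        # heads[i] is a relative offset, exactly as in TokenC.head above.
        n = len(heads)
        l_edge = list(range(n))           # each token initially spans itself
        r_edge = list(range(n))
        # Left edges: one left-to-right pass.
        for i in range(n):
            h = i + heads[i]
            if i < h and l_edge[i] < l_edge[h]:
                l_edge[h] = l_edge[i]
        # Right edges: same idea, iterated in reverse.
        for i in range(n - 1, -1, -1):
            h = i + heads[i]
            if i > h and r_edge[i] > r_edge[h]:
                r_edge[h] = r_edge[i]
        return l_edge, r_edge

    # "the quick brown fox", every word attached to "fox" (token 3):
    l, r = set_edges([3, 2, 1, 0])
    assert l == [0, 1, 2, 0]              # fox's left edge is "the"
    assert r == [0, 1, 2, 3]

This is the l_edge that noun_chunks reads: Span(self, word.l_edge, i+1) then yields the whole phrase "the quick brown fox" for the noun at position 3.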