* Add noun_chunks iterator, and fix left/right child setting in Doc.merge

This commit is contained in:
Matthew Honnibal 2015-07-30 02:29:49 +02:00
parent d153f18969
commit 74d8cb3980

View File

@ -11,10 +11,10 @@ from ..attrs cimport attr_id_t
from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER from ..attrs cimport ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX, LENGTH, CLUSTER
from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE from ..attrs cimport POS, LEMMA, TAG, DEP, HEAD, SPACY, ENT_IOB, ENT_TYPE
from ..parts_of_speech import UNIV_POS_NAMES from ..parts_of_speech import UNIV_POS_NAMES
from ..parts_of_speech cimport CONJ, PUNCT from ..parts_of_speech cimport CONJ, PUNCT, NOUN
from ..lexeme cimport check_flag from ..lexeme cimport check_flag
from ..lexeme cimport get_attr as get_lex_attr from ..lexeme cimport get_attr as get_lex_attr
from .spans import Span from .spans cimport Span
from .token cimport Token from .token cimport Token
from ..serialize.bits cimport BitArray from ..serialize.bits cimport BitArray
@ -154,6 +154,18 @@ cdef class Doc:
if start != -1: if start != -1:
yield Span(self, start, self.length, label=label) yield Span(self, start, self.length, label=label)
@property
def noun_chunks(self):
    """Yield spans for base noun phrases.

    A token is treated as the head of a chunk when its coarse POS tag
    is NOUN and its dependency label is one of the noun-phrase relations
    listed below.  The yielded Span runs from the token's leftmost
    syntactic descendant (``l_edge``) through the token itself.
    """
    cdef const TokenC* word
    # Dependency labels that mark a noun as heading a base NP.
    # NOTE(review): 'conj' will also yield conjoined nouns as separate
    # chunks — presumably intended; confirm against expected output.
    labels = ['nsubj', 'nsubjpass', 'pcomp', 'pobj', 'conj']
    # Convert label strings to integer IDs once, for fast comparison
    # against the integer ``dep`` field inside the loop.
    np_deps = [self.vocab.strings[label] for label in labels]
    np_label = self.vocab.strings['NP']
    for i in range(self.length):
        word = &self.data[i]
        if word.pos == NOUN and word.dep in np_deps:
            # Span is [l_edge, i+1): from the noun's left edge up to
            # and including the noun itself.
            yield Span(self, word.l_edge, i+1, label=np_label)
@property @property
def sents(self): def sents(self):
""" """
@ -297,20 +309,7 @@ cdef class Doc:
elif attr_id == ENT_TYPE: elif attr_id == ENT_TYPE:
for i in range(length): for i in range(length):
tokens[i].ent_type = values[i] tokens[i].ent_type = values[i]
cdef TokenC* head set_children_from_heads(self.data, self.length)
cdef TokenC* child
# Set left edges
for i in range(length):
child = &tokens[i]
head = &tokens[i + child.head]
if child < head and child.l_edge < head.l_edge:
head.l_edge = child.l_edge
# Set right edges --- same as above, but iterate in reverse
for i in range(length-1, -1, -1):
child = &tokens[i]
head = &tokens[i + child.head]
if child > head and child.r_edge > head.r_edge:
head.r_edge = child.r_edge
return self return self
def to_bytes(self): def to_bytes(self):
@ -354,9 +353,12 @@ cdef class Doc:
break break
else: else:
return None return None
cdef unicode string = self.string
cdef Span span = self[start:end]
# Get LexemeC for newly merged token # Get LexemeC for newly merged token
new_orth = string[start_idx:end_idx] new_orth = ''.join([t.string for t in span])
if span[-1].whitespace_:
new_orth = new_orth[:-1]
cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
# House the new merged token where it starts # House the new merged token where it starts
cdef TokenC* token = &self.data[start] cdef TokenC* token = &self.data[start]
@ -372,30 +374,16 @@ cdef class Doc:
else: else:
token.ent_iob = 3 token.ent_iob = 3
token.ent_type = self.vocab.strings[ent_type] token.ent_type = self.vocab.strings[ent_type]
# Fix dependencies
# Begin by setting all the head indices to absolute token positions # Begin by setting all the head indices to absolute token positions
# This is easier to work with for now than the offsets # This is easier to work with for now than the offsets
# Before thinking of something simpler, beware the case where a dependency
# bridges over the entity. Here the alignment of the tokens changes.
span_root = span.root.i
for i in range(self.length): for i in range(self.length):
self.data[i].head += i self.data[i].head += i
# Find the head of the merged token, and its dep relation # Set the head of the merged token, and its dep relation, from the Span
outer_heads = {} token.head = self.data[span_root].head
for i in range(start, end): token.dep = span.root.dep
head_idx = self.data[i].head
if head_idx == i or head_idx < start or head_idx >= end:
# Don't consider "heads" which are actually dominated by a word
# in the region we're merging
gp = head_idx
while self.data[gp].head != gp:
if start <= gp < end:
break
gp = self.data[gp].head
else:
# If we have multiple words attaching to the same head,
# but with different dep labels, we're preferring the last
# occurring dep label. Shrug. What else could we do, I guess?
outer_heads[head_idx] = self.data[i].dep
token.head, token.dep = max(outer_heads.items())
# Adjust deps before shrinking tokens # Adjust deps before shrinking tokens
# Tokens which point into the merged token should now point to it # Tokens which point into the merged token should now point to it
# Subtract the offset from all tokens which point to >= end # Subtract the offset from all tokens which point to >= end
@ -406,7 +394,6 @@ cdef class Doc:
self.data[i].head = start self.data[i].head = start
elif head_idx >= end: elif head_idx >= end:
self.data[i].head -= offset self.data[i].head -= offset
# TODO: Fix left and right deps
# Now compress the token array # Now compress the token array
for i in range(end, self.length): for i in range(end, self.length):
self.data[i - offset] = self.data[i] self.data[i - offset] = self.data[i]
@ -417,6 +404,28 @@ cdef class Doc:
for i in range(self.length): for i in range(self.length):
# ...And, set heads back to a relative position # ...And, set heads back to a relative position
self.data[i].head -= i self.data[i].head -= i
# Set the left/right children, left/right edges
set_children_from_heads(self.data, self.length)
# Clear the cached Python objects
self._py_tokens = [None] * self.length
# Return the merged Python object # Return the merged Python object
return self[start] return self[start]
# Recompute each token's l_edge/r_edge (the bounds of its syntactic
# subtree) from the relative head offsets stored in ``tokens[i].head``.
# Shared by Doc.from_array and Doc.merge so edge bookkeeping stays in
# one place.
cdef int set_children_from_heads(TokenC* tokens, int length) except -1:
    cdef TokenC* head
    cdef TokenC* child
    cdef int i
    # Set left edges: forward pass.  ``child < head`` compares pointers
    # into the same array, i.e. it is true when the child precedes its
    # head in token order; such a child may extend the head's l_edge.
    for i in range(length):
        child = &tokens[i]
        # ``child.head`` is an offset relative to position i.
        head = &tokens[i + child.head]
        if child < head and child.l_edge < head.l_edge:
            head.l_edge = child.l_edge
    # Set right edges --- same as above, but iterate in reverse
    # so a right-side child is finalized before its head is visited.
    for i in range(length-1, -1, -1):
        child = &tokens[i]
        head = &tokens[i + child.head]
        if child > head and child.r_edge > head.r_edge:
            head.r_edge = child.r_edge