mirror of https://github.com/explosion/spaCy.git
synced 2024-12-24 00:46:28 +03:00
* Add merge() method to Tokens, with a fairly brittle/hacky implementation that is nonetheless quite easy to test. Minimal tests pass. Still need to fix left/right deps in the C data.
This commit is contained in:
parent 557856e84c
commit e70b87efeb
@@ -1,8 +1,10 @@
# cython: embedsignature=True
from libc.string cimport memset

from preshed.maps cimport PreshMap
from preshed.counter cimport PreshCounter

from .strings cimport slice_unicode
from .vocab cimport EMPTY_LEXEME
from .typedefs cimport attr_id_t, attr_t
from .typedefs cimport LEMMA
|
@@ -11,6 +13,7 @@ from .typedefs cimport POS, LEMMA
from .parts_of_speech import UNIV_POS_NAMES
from .lexeme cimport check_flag
from .spans import Span
from .structs cimport UniStr

from unidecode import unidecode

@@ -253,6 +256,88 @@ cdef class Tokens:
        for i in range(self.length):
            self.data[i] = parsed[i]

    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
              unicode ent_type):
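        # start_idx and end_idx are character offsets into the original
        # string (token .idx values), not token indices, and must line up
        # exactly with existing token boundaries.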
        cdef int i
        cdef int start = -1
        cdef int end = -1
        for i in range(self.length):
            if self.data[i].idx == start_idx:
                start = i
            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
                end = i + 1
                break
        else:
            return None
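        # NB: the for/else above means merge() returns None when no token
        # ends exactly at end_idx, leaving the array untouched.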
        # Get LexemeC for newly merged token
        cdef UniStr new_orth_c
        slice_unicode(&new_orth_c, self._string, start_idx, end_idx)
        cdef const LexemeC* lex = self.vocab.get(self.mem, &new_orth_c)
        # House the new merged token where it starts
        cdef TokenC* token = &self.data[start]
        # Update fields
        token.lex = lex
        # What to do about morphology??
        # TODO: token.morph = ???
        token.tag = self.vocab.strings[tag]
        token.lemma = self.vocab.strings[lemma]
        if ent_type == 'O':
            token.ent_iob = 2
            token.ent_type = 0
        else:
            token.ent_iob = 3
            token.ent_type = self.vocab.strings[ent_type]
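        # ent_iob 2 marks the token as outside any entity; 3 marks it as
        # beginning one. vocab.strings interns each unicode string and
        # returns its integer ID.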
        # Fix dependencies
        # Begin by setting all the head indices to absolute token positions
        # This is easier to work with for now than the offsets
        for i in range(self.length):
            self.data[i].head += i
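        # e.g. a head offset of -2 stored on token 5 now becomes absolute
        # index 3; a root token, which points at itself with offset 0,
        # becomes its own index.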
        # Find the head of the merged token, and its dep relation
        outer_heads = {}
        for i in range(start, end):
            head_idx = self.data[i].head
            if head_idx == i or head_idx < start or head_idx >= end:
                # Don't consider "heads" which are actually dominated by a word
                # in the region we're merging
                gp = head_idx
                while self.data[gp].head != gp:
                    if start <= gp < end:
                        break
                    gp = self.data[gp].head
                else:
                    # If we have multiple words attaching to the same head,
                    # but with different dep labels, we're preferring the last
                    # occurring dep label. Shrug. What else could we do, I guess?
                    outer_heads[head_idx] = self.data[i].dep

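        # The while/else above records a head only when the walk up the
        # tree reaches the root without entering the merged region; max()
        # below then picks the outer head with the highest token index.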
        token.head, token.dep = max(outer_heads.items())
        # Adjust deps before shrinking tokens
        # Tokens which point into the merged token should now point to it
        # Subtract the offset from all tokens which point to >= end
        offset = (end - start) - 1
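        # Merging (end - start) tokens into one shrinks the array by
        # offset = (end - start) - 1 positions.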
        for i in range(self.length):
            head_idx = self.data[i].head
            if start <= head_idx < end:
                self.data[i].head = start
            elif head_idx >= end:
                self.data[i].head -= offset
        # TODO: Fix left and right deps
        # Now compress the token array
        for i in range(end, self.length):
            self.data[i - offset] = self.data[i]
        for i in range(self.length - offset, self.length):
            memset(&self.data[i], 0, sizeof(TokenC))
            self.data[i].lex = &EMPTY_LEXEME
        self.length -= offset
        for i in range(self.length):
            # ...And, set heads back to a relative position
            self.data[i].head -= i

        # Clear cached Python objects
        self._py_tokens = [None] * self.length
        # Return the merged Python object
        return self[start]


cdef class Token:
    """An individual token --- i.e. a word, a punctuation symbol, etc. Created