From 562db6d2d082671a53493f2d21774b292a1463c3 Mon Sep 17 00:00:00 2001
From: Andreas Grivas
Date: Thu, 5 Nov 2015 17:28:08 +0200
Subject: [PATCH] * merge add lex last - add index finder funcs

---
 spacy/tokens/doc.pyx | 44 ++++++++++++++++++++++++++++++++++----------
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index b0a193173..40e0e9d7a 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -438,11 +438,25 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        """ Get index of token in doc that has character index start_idx """
+        cdef int i
+        for i in range(self.length):
+            if self.c[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        """ Get index+1 of token in doc ending with character index end_idx """
+        cdef int i
+        for i in range(self.length):
+            if (self.c[i].idx + self.c[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        """ Get tuple - span of token indices which correspond to
+            character indices (start_idx, end_idx) if such a span exists"""
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -453,10 +467,18 @@ cdef class Doc:
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
 
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token. Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
@@ -465,8 +487,6 @@ cdef class Doc:
         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
         cdef TokenC* token = &self.c[start]
-        # Update fields
-        token.lex = lex
         token.spacy = self.c[end-1].spacy
         if tag in self.vocab.morphology.tag_map:
             self.vocab.morphology.assign_tag(token, tag)
@@ -485,6 +505,10 @@ cdef class Doc:
         # bridges over the entity. Here the alignment of the tokens changes.
         span_root = span.root.i
         token.dep = span.root.dep
+        # We update token.lex after keeping span root and dep, since
+        # setting token.lex will change span.start and span.end properties
+        # as it modifies the character offsets in the doc
+        token.lex = lex
         for i in range(self.length):
             self.c[i].head += i
         # Set the head of the merged token, and its dep relation, from the Span
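
The patch factors the character-offset lookup out of merge() into three helpers (token_index_start, token_index_end, range_from_indices) and defers the token.lex assignment until after the span root and dependency have been read. Below is a minimal usage sketch, not part of the patch, assuming it is applied to a 2015-era spaCy with the English model data installed; the sentence, the tag 'NNP', the lemma, and the entity label 'GPE' are illustrative only:

    from spacy.en import English  # 0.x-era entry point assumed by this sketch

    nlp = English()
    doc = nlp(u'Los Angeles is a city in California')

    # Character offsets covering the multi-word expression "Los Angeles"
    start_char, end_char = 0, len(u'Los Angeles')

    # The new helpers map character offsets to token indices
    assert doc.token_index_start(start_char) == 0                  # token "Los"
    assert doc.token_index_end(end_char) == 2                      # one past "Angeles"
    assert doc.range_from_indices(start_char, end_char) == (0, 2)

    # Offsets that do not line up with token boundaries yield None
    assert doc.range_from_indices(1, end_char) is None

    # merge() now resolves its offsets through range_from_indices() and
    # returns None instead of merging when the offsets are misaligned
    doc.merge(start_char, end_char, u'NNP', u'Los Angeles', u'GPE')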