From 8edd58492e43088212dc9c32c7ed9883f0abad18 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Mon, 2 Nov 2015 13:41:39 +0200 Subject: [PATCH 1/8] fixed unicode error in printing - added tests --- spacy/tests/print/test_print.py | 98 +++++++++++++++++++++++++++++++++ spacy/tokens/doc.pyx | 4 +- spacy/tokens/spans.pyx | 2 +- spacy/tokens/token.pyx | 4 +- 4 files changed, 103 insertions(+), 5 deletions(-) create mode 100644 spacy/tests/print/test_print.py diff --git a/spacy/tests/print/test_print.py b/spacy/tests/print/test_print.py new file mode 100644 index 000000000..744a813d6 --- /dev/null +++ b/spacy/tests/print/test_print.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- +import pytest + + +def test_print_doc(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store') + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_doc(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store') + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_doc_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café') + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_doc_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café') + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_span(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-3:] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_span(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-3:] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_span_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-3:] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_span_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-3:] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_token(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-1] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_token(EN): + try: + doc = EN(u'I sat down for coffee at the coffee store')[-1] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") + + +def test_print_token_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-1] + print(doc) + except Exception: + pytest.fail("Printing failed") + + +def test_repr_token_unicode(EN): + try: + doc = EN(u'I sat down for coffee at the café')[-1] + print(repr(doc)) + except Exception: + pytest.fail("Printing failed") diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 1626ebfc6..957bc59e6 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -118,10 +118,10 @@ cdef class Doc: return u''.join([t.string for t in self]) def __str__(self): - return u''.join([t.string for t in self]) + return u''.join([t.string for t in self]).encode('utf-8') def __repr__(self): - return u''.join([t.string for t in self]) + return u''.join([t.string for t in self]).encode('utf-8') def similarity(self, other): if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index e1b881f79..1f6b07636 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -50,7 +50,7 @@ cdef class Span: text = self.text_with_ws if self[-1].whitespace_: text = text[:-1] - return text + return text.encode('utf-8') def 
__getitem__(self, object i): if isinstance(i, slice): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index cce8eeeb4..02ef52d0c 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -41,10 +41,10 @@ cdef class Token: return self.string def __str__(self): - return self.string + return self.string.encode('utf-8') def __repr__(self): - return self.string + return self.string.encode('utf-8') cpdef bint check_flag(self, attr_id_t flag_id) except -1: return Lexeme.c_check_flag(self.c.lex, flag_id) From b5ce7a6e96f46ab9fad485bd780ec93c00bd116c Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Mon, 2 Nov 2015 19:40:37 +0200 Subject: [PATCH 2/8] fix py3 incompatibility --- spacy/tokens/doc.pyx | 10 ++++++++-- spacy/tokens/spans.pyx | 8 ++++---- spacy/tokens/token.pyx | 10 ++++++++-- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 957bc59e6..01ccb4fd9 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -7,6 +7,7 @@ import numpy.linalg import struct cimport numpy as np import math +import six from ..lexeme cimport Lexeme from ..lexeme cimport EMPTY_LEXEME @@ -117,11 +118,16 @@ cdef class Doc: def __unicode__(self): return u''.join([t.string for t in self]) - def __str__(self): + def __bytes__(self): return u''.join([t.string for t in self]).encode('utf-8') + def __str__(self): + if six.PY3: + return self.__unicode__() + return self.__bytes__() + def __repr__(self): - return u''.join([t.string for t in self]).encode('utf-8') + return self.__str__() def similarity(self, other): if self.vector_norm == 0 or other.vector_norm == 0: diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 1f6b07636..95b8e0de1 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -4,6 +4,7 @@ import numpy import numpy.linalg cimport numpy as np import math +import six from ..structs cimport TokenC, LexemeC from ..typedefs cimport flags_t, attr_t @@ -47,10 +48,9 @@ cdef class Span: return self.end - self.start def __repr__(self): - text = self.text_with_ws - if self[-1].whitespace_: - text = text[:-1] - return text.encode('utf-8') + if six.PY3: + return self.text + return self.text.encode('utf-8') def __getitem__(self, object i): if isinstance(i, slice): diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx index 02ef52d0c..81b850285 100644 --- a/spacy/tokens/token.pyx +++ b/spacy/tokens/token.pyx @@ -6,6 +6,7 @@ cimport numpy as np np.import_array() import numpy +import six from ..lexeme cimport Lexeme @@ -40,11 +41,16 @@ cdef class Token: def __unicode__(self): return self.string - def __str__(self): + def __bytes__(self): return self.string.encode('utf-8') + def __str__(self): + if six.PY3: + return self.__unicode__() + return self.__bytes__() + def __repr__(self): - return self.string.encode('utf-8') + return self.__str__() cpdef bint check_flag(self, attr_id_t flag_id) except -1: return Lexeme.c_check_flag(self.c.lex, flag_id) From 52ede05f9de21147dd3e0525b2d2615e250e2411 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Mon, 2 Nov 2015 19:41:30 +0200 Subject: [PATCH 3/8] fix naming --- spacy/tests/print/test_print.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/spacy/tests/print/test_print.py b/spacy/tests/print/test_print.py index 744a813d6..4740f44e6 100644 --- a/spacy/tests/print/test_print.py +++ b/spacy/tests/print/test_print.py @@ -36,63 +36,63 @@ def test_repr_doc_unicode(EN): def test_print_span(EN): try: - doc = 
EN(u'I sat down for coffee at the coffee store')[-3:] - print(doc) + span = EN(u'I sat down for coffee at the coffee store')[-3:] + print(span) except Exception: pytest.fail("Printing failed") def test_repr_span(EN): try: - doc = EN(u'I sat down for coffee at the coffee store')[-3:] - print(repr(doc)) + span = EN(u'I sat down for coffee at the coffee store')[-3:] + print(repr(span)) except Exception: pytest.fail("Printing failed") def test_print_span_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-3:] - print(doc) + span = EN(u'I sat down for coffee at the café')[-3:] + print(span) except Exception: pytest.fail("Printing failed") def test_repr_span_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-3:] - print(repr(doc)) + span = EN(u'I sat down for coffee at the café')[-3:] + print(repr(span)) except Exception: pytest.fail("Printing failed") def test_print_token(EN): try: - doc = EN(u'I sat down for coffee at the coffee store')[-1] - print(doc) + token = EN(u'I sat down for coffee at the coffee store')[-1] + print(token) except Exception: pytest.fail("Printing failed") def test_repr_token(EN): try: - doc = EN(u'I sat down for coffee at the coffee store')[-1] - print(repr(doc)) + token = EN(u'I sat down for coffee at the coffee store')[-1] + print(repr(token)) except Exception: pytest.fail("Printing failed") def test_print_token_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-1] - print(doc) + token = EN(u'I sat down for coffee at the café')[-1] + print(token) except Exception: pytest.fail("Printing failed") def test_repr_token_unicode(EN): try: - doc = EN(u'I sat down for coffee at the café')[-1] - print(repr(doc)) + token = EN(u'I sat down for coffee at the café')[-1] + print(repr(token)) except Exception: pytest.fail("Printing failed") From c780bbda3e6db605aaa465400eda1de85d6ae0ea Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Tue, 3 Nov 2015 17:13:42 +0200 Subject: [PATCH 4/8] changed start end to properties - allow merge --- setup.py | 230 ----------------------------------------- spacy/tokens/doc.pyx | 36 +++++-- spacy/tokens/spans.pxd | 6 +- spacy/tokens/spans.pyx | 31 +++++- 4 files changed, 61 insertions(+), 242 deletions(-) delete mode 100644 setup.py diff --git a/setup.py b/setup.py deleted file mode 100644 index 1d719c626..000000000 --- a/setup.py +++ /dev/null @@ -1,230 +0,0 @@ -#!/usr/bin/env python -from setuptools import setup -import shutil - -import sys -import os -from os import path - -from setuptools import Extension -from distutils import sysconfig -from distutils.core import setup, Extension -from distutils.command.build_ext import build_ext - -import platform - -# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options -# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used -compile_options = {'msvc' : ['/Ox', '/EHsc'] , - 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] } -link_options = {'msvc' : [] , - 'other' : [] } -class build_ext_options: - def build_options(self): - c_type = None - if self.compiler.compiler_type in compile_options: - c_type = self.compiler.compiler_type - elif 'other' in compile_options: - c_type = 'other' - if c_type is not None: - for e in self.extensions: - e.extra_compile_args = compile_options[c_type] - - l_type = None - if self.compiler.compiler_type in link_options: - l_type = self.compiler.compiler_type - elif 'other' in 
link_options: - l_type = 'other' - if l_type is not None: - for e in self.extensions: - e.extra_link_args = link_options[l_type] - -class build_ext_subclass( build_ext, build_ext_options ): - def build_extensions(self): - build_ext_options.build_options(self) - build_ext.build_extensions(self) - - - -# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But, -# this is necessary to get it compile. -# We have to resort to monkey-patching to set the compiler, because pypy broke -# all the everything. - -pre_patch_customize_compiler = sysconfig.customize_compiler -def my_customize_compiler(compiler): - pre_patch_customize_compiler(compiler) - compiler.compiler_cxx = ['c++'] - - -if platform.python_implementation() == 'PyPy': - sysconfig.customize_compiler = my_customize_compiler - -#def install_headers(): -# dest_dir = path.join(sys.prefix, 'include', 'murmurhash') -# if not path.exists(dest_dir): -# shutil.copytree('murmurhash/headers/murmurhash', dest_dir) -# -# dest_dir = path.join(sys.prefix, 'include', 'numpy') - - -includes = ['.', path.join(sys.prefix, 'include')] - - -try: - import numpy - numpy_headers = path.join(numpy.get_include(), 'numpy') - shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy')) -except ImportError: - pass -except OSError: - pass - - -def clean(mod_names): - for name in mod_names: - name = name.replace('.', '/') - so = name + '.so' - html = name + '.html' - cpp = name + '.cpp' - c = name + '.c' - for file_path in [so, html, cpp, c]: - if os.path.exists(file_path): - os.unlink(file_path) - - -def name_to_path(mod_name, ext): - return '%s.%s' % (mod_name.replace('.', '/'), ext) - - -def c_ext(mod_name, language, includes): - mod_path = name_to_path(mod_name, language) - return Extension(mod_name, [mod_path], include_dirs=includes) - - -def cython_setup(mod_names, language, includes): - import Cython.Distutils - import Cython.Build - import distutils.core - - class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ): - def build_extensions(self): - build_ext_options.build_options(self) - Cython.Distutils.build_ext.build_extensions(self) - - if language == 'cpp': - language = 'c++' - exts = [] - for mod_name in mod_names: - mod_path = mod_name.replace('.', '/') + '.pyx' - e = Extension(mod_name, [mod_path], language=language, include_dirs=includes) - exts.append(e) - distutils.core.setup( - name='spacy', - packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', - 'spacy.syntax', 'spacy.munge'], - description="Industrial-strength NLP", - author='Matthew Honnibal', - author_email='honnibal@gmail.com', - version=VERSION, - url="http://honnibal.github.io/spaCy/", - package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"], - "spacy.tokens": ["*.pxd"], - "spacy.serialize": ["*.pxd"], - "spacy.en": ["*.pxd", "data/pos/*", - "data/wordnet/*", "data/tokenizer/*", - "data/vocab/tag_map.json", - "data/vocab/lexemes.bin", - "data/vocab/strings.json"], - "spacy.syntax": ["*.pxd"]}, - ext_modules=exts, - cmdclass={'build_ext': build_ext_cython_subclass}, - license="MIT", - ) - - -def run_setup(exts): - setup( - name='spacy', - packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', - 'spacy.syntax', 'spacy.munge', - 'spacy.tests', - 'spacy.tests.matcher', - 'spacy.tests.morphology', - 'spacy.tests.munge', - 'spacy.tests.parser', - 'spacy.tests.serialize', - 'spacy.tests.spans', - 'spacy.tests.tagger', - 'spacy.tests.tokenizer', - 'spacy.tests.tokens', - 'spacy.tests.vectors', - 
'spacy.tests.vocab'], - description="Industrial-strength NLP", - author='Matthew Honnibal', - author_email='honnibal@gmail.com', - version=VERSION, - url="http://honnibal.github.io/spaCy/", - package_data={"spacy": ["*.pxd"], - "spacy.en": ["*.pxd", "data/pos/*", - "data/wordnet/*", "data/tokenizer/*", - "data/vocab/lexemes.bin", - "data/vocab/serializer.json", - "data/vocab/oov_prob", - "data/vocab/strings.txt"], - "spacy.syntax": ["*.pxd"]}, - ext_modules=exts, - license="MIT", - install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43', - 'thinc >= 3.4.1', "text_unidecode", 'plac', 'six', - 'ujson', 'cloudpickle'], - setup_requires=["headers_workaround"], - cmdclass = {'build_ext': build_ext_subclass }, - ) - - import headers_workaround - - headers_workaround.fix_venv_pypy_include() - headers_workaround.install_headers('murmurhash') - headers_workaround.install_headers('numpy') - - -VERSION = '0.97' -def main(modules, is_pypy): - language = "cpp" - includes = ['.', path.join(sys.prefix, 'include')] - if sys.platform.startswith('darwin'): - compile_options['other'].append('-mmacosx-version-min=10.8') - compile_options['other'].append('-stdlib=libc++') - link_options['other'].append('-lc++') - if use_cython: - cython_setup(modules, language, includes) - else: - exts = [c_ext(mn, language, includes) - for mn in modules] - run_setup(exts) - -MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', - 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', - 'spacy.morphology', 'spacy.tagger', - 'spacy.syntax.stateclass', - 'spacy._ml', 'spacy._theano', - 'spacy.tokenizer', - 'spacy.syntax.parser', - 'spacy.syntax.transition_system', - 'spacy.syntax.arc_eager', - 'spacy.syntax._parse_features', - 'spacy.gold', 'spacy.orth', - 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', - 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', - 'spacy.cfile', 'spacy.matcher', - 'spacy.syntax.ner', - 'spacy.symbols'] - - -if __name__ == '__main__': - if sys.argv[1] == 'clean': - clean(MOD_NAMES) - else: - use_cython = sys.argv[1] == 'build_ext' - main(MOD_NAMES, use_cython) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 01ccb4fd9..f1c8d2c71 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -439,11 +439,23 @@ cdef class Doc: keep_reading = False yield n_bytes_str + data - # This function is terrible --- need to fix this. - def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, - unicode ent_type): - """Merge a multi-word expression into a single token. Currently - experimental; API is likely to change.""" + + def token_index_start(self, int start_idx): + cdef int i + for i in range(self.length): + if self.data[i].idx == start_idx: + return i + return None + + def token_index_end(self, int end_idx): + cdef int i + for i in range(self.length): + if (self.data[i].idx + self.data[i].lex.length) == end_idx: + return i + 1 + return None + + def range_from_indices(self, int start_idx, int end_idx): + assert start_idx < end_idx cdef int i cdef int start = -1 cdef int end = -1 @@ -454,10 +466,18 @@ cdef class Doc: if start == -1: return None end = i + 1 - break - else: - return None + return (start, end) + return None + # This function is terrible --- need to fix this. + def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma, + unicode ent_type): + """Merge a multi-word expression into a single token. 
Currently + experimental; API is likely to change.""" + start_end = self.range_from_indices(start_idx, end_idx) + if start_end is None: + return None + start, end = start_end cdef Span span = self[start:end] # Get LexemeC for newly merged token new_orth = ''.join([t.text_with_ws for t in span]) diff --git a/spacy/tokens/spans.pxd b/spacy/tokens/spans.pxd index 54c0a3afb..bae9e4691 100644 --- a/spacy/tokens/spans.pxd +++ b/spacy/tokens/spans.pxd @@ -4,8 +4,10 @@ from .doc cimport Doc cdef class Span: cdef readonly Doc doc cdef public int i - cdef public int start - cdef public int end + cdef public int start_token + cdef public int end_token + cdef public int start_idx + cdef public int end_idx cdef readonly int label cdef public _vector diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index 95b8e0de1..f4dcb15f0 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -21,8 +21,11 @@ cdef class Span: raise IndexError self.doc = tokens - self.start = start - self.end = end + # keep char offsets - as these don't change when merging spans + self.start_token = start + self.start_idx = self.doc[start].idx + self.end_token = end + self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1]) self.label = label self._vector = vector self._vector_norm = vector_norm @@ -76,6 +79,30 @@ cdef class Span: return 0.0 return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) + property start: + def __get__(self): + # if we haven't merged anything below check is false - so we get start token + if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx: + new_start = self.doc.token_index_start(self.start_idx) + if new_start is not None: + self.start_token = new_start + else: + raise IndexError('Something went terribly wrong during a merge.' + 'No token found with idx %s' % self.start_idx) + return self.start_token + + property end: + def __get__(self): + # if we haven't merged anything we have fast access + if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx: + new_end = self.doc.token_index_end(self.end_idx) + if new_end is not None: + self.end_token = new_end + else: + raise IndexError('Something went terribly wrong during a merge.' 
+ 'No token found with idx %s' % self.end_idx) + return self.end_token + property vector: def __get__(self): if self._vector is None: From 015a84a5ecf72ae6703046e2fc85f3b7bc58ad2a Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 12:56:07 +0200 Subject: [PATCH 5/8] added comments --- spacy/tokens/doc.pyx | 4 ++++ spacy/tokens/spans.pyx | 19 +++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index f1c8d2c71..d11054e35 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -441,6 +441,7 @@ cdef class Doc: def token_index_start(self, int start_idx): + """ Get index of token in doc that has character index start_idx """ cdef int i for i in range(self.length): if self.data[i].idx == start_idx: @@ -448,6 +449,7 @@ cdef class Doc: return None def token_index_end(self, int end_idx): + """ Get index+1 of token in doc ending with character index end_idx """ cdef int i for i in range(self.length): if (self.data[i].idx + self.data[i].lex.length) == end_idx: @@ -455,6 +457,8 @@ cdef class Doc: return None def range_from_indices(self, int start_idx, int end_idx): + """ Get tuple - span of token indices which correspond to + character indices (start_idx, end_idx) if such a span exists""" assert start_idx < end_idx cdef int i cdef int start = -1 diff --git a/spacy/tokens/spans.pyx b/spacy/tokens/spans.pyx index f4dcb15f0..afd809ecf 100644 --- a/spacy/tokens/spans.pyx +++ b/spacy/tokens/spans.pyx @@ -14,14 +14,15 @@ from ..util import normalize_slice cdef class Span: - """A slice from a Doc object.""" + """A slice from a Doc object. Internally keeps character offsets in order + to keep track of changes (merges) in the original Doc. Updates are + made in start and end property.""" def __cinit__(self, Doc tokens, int start, int end, int label=0, vector=None, vector_norm=None): if not (0 <= start <= end <= len(tokens)): raise IndexError self.doc = tokens - # keep char offsets - as these don't change when merging spans self.start_token = start self.start_idx = self.doc[start].idx self.end_token = end @@ -80,9 +81,14 @@ cdef class Span: return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm) property start: + """ Get start token index of this span from the Doc.""" def __get__(self): - # if we haven't merged anything below check is false - so we get start token + # if we have merged spans in Doc start might have changed. + # check if token start index is in doc index range and the token + # index is start_idx (it hasn't changed). + # Potential IndexError if only second condition was used if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx: + # go through tokens in Doc - find index of token equal to start_idx new_start = self.doc.token_index_start(self.start_idx) if new_start is not None: self.start_token = new_start @@ -92,9 +98,14 @@ cdef class Span: return self.start_token property end: + """ Get end token index of this span from the Doc.""" def __get__(self): - # if we haven't merged anything we have fast access + # if we have merged spans in Doc end will have changed. + # check if token end index is in doc index range and the token + # index is end_idx (it hasn't changed). 
+ # Potential IndexError if only second condition was used if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx: + # go through tokens in Doc - find index of token equal to end_idx new_end = self.doc.token_index_end(self.end_idx) if new_end is not None: self.end_token = new_end From 93918b5c234f2111fae0466c4e1498f373c87ab7 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 19:49:22 +0200 Subject: [PATCH 6/8] assign lex after spans, add tests --- setup.py | 230 ++++++++++++++++++++++++++++++++ spacy/tests/spans/test_merge.py | 27 +++- spacy/tokens/doc.pyx | 7 +- 3 files changed, 258 insertions(+), 6 deletions(-) create mode 100644 setup.py diff --git a/setup.py b/setup.py new file mode 100644 index 000000000..1d719c626 --- /dev/null +++ b/setup.py @@ -0,0 +1,230 @@ +#!/usr/bin/env python +from setuptools import setup +import shutil + +import sys +import os +from os import path + +from setuptools import Extension +from distutils import sysconfig +from distutils.core import setup, Extension +from distutils.command.build_ext import build_ext + +import platform + +# By subclassing build_extensions we have the actual compiler that will be used which is really known only after finalize_options +# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used +compile_options = {'msvc' : ['/Ox', '/EHsc'] , + 'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function'] } +link_options = {'msvc' : [] , + 'other' : [] } +class build_ext_options: + def build_options(self): + c_type = None + if self.compiler.compiler_type in compile_options: + c_type = self.compiler.compiler_type + elif 'other' in compile_options: + c_type = 'other' + if c_type is not None: + for e in self.extensions: + e.extra_compile_args = compile_options[c_type] + + l_type = None + if self.compiler.compiler_type in link_options: + l_type = self.compiler.compiler_type + elif 'other' in link_options: + l_type = 'other' + if l_type is not None: + for e in self.extensions: + e.extra_link_args = link_options[l_type] + +class build_ext_subclass( build_ext, build_ext_options ): + def build_extensions(self): + build_ext_options.build_options(self) + build_ext.build_extensions(self) + + + +# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But, +# this is necessary to get it compile. +# We have to resort to monkey-patching to set the compiler, because pypy broke +# all the everything. 
+ +pre_patch_customize_compiler = sysconfig.customize_compiler +def my_customize_compiler(compiler): + pre_patch_customize_compiler(compiler) + compiler.compiler_cxx = ['c++'] + + +if platform.python_implementation() == 'PyPy': + sysconfig.customize_compiler = my_customize_compiler + +#def install_headers(): +# dest_dir = path.join(sys.prefix, 'include', 'murmurhash') +# if not path.exists(dest_dir): +# shutil.copytree('murmurhash/headers/murmurhash', dest_dir) +# +# dest_dir = path.join(sys.prefix, 'include', 'numpy') + + +includes = ['.', path.join(sys.prefix, 'include')] + + +try: + import numpy + numpy_headers = path.join(numpy.get_include(), 'numpy') + shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy')) +except ImportError: + pass +except OSError: + pass + + +def clean(mod_names): + for name in mod_names: + name = name.replace('.', '/') + so = name + '.so' + html = name + '.html' + cpp = name + '.cpp' + c = name + '.c' + for file_path in [so, html, cpp, c]: + if os.path.exists(file_path): + os.unlink(file_path) + + +def name_to_path(mod_name, ext): + return '%s.%s' % (mod_name.replace('.', '/'), ext) + + +def c_ext(mod_name, language, includes): + mod_path = name_to_path(mod_name, language) + return Extension(mod_name, [mod_path], include_dirs=includes) + + +def cython_setup(mod_names, language, includes): + import Cython.Distutils + import Cython.Build + import distutils.core + + class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ): + def build_extensions(self): + build_ext_options.build_options(self) + Cython.Distutils.build_ext.build_extensions(self) + + if language == 'cpp': + language = 'c++' + exts = [] + for mod_name in mod_names: + mod_path = mod_name.replace('.', '/') + '.pyx' + e = Extension(mod_name, [mod_path], language=language, include_dirs=includes) + exts.append(e) + distutils.core.setup( + name='spacy', + packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', + 'spacy.syntax', 'spacy.munge'], + description="Industrial-strength NLP", + author='Matthew Honnibal', + author_email='honnibal@gmail.com', + version=VERSION, + url="http://honnibal.github.io/spaCy/", + package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"], + "spacy.tokens": ["*.pxd"], + "spacy.serialize": ["*.pxd"], + "spacy.en": ["*.pxd", "data/pos/*", + "data/wordnet/*", "data/tokenizer/*", + "data/vocab/tag_map.json", + "data/vocab/lexemes.bin", + "data/vocab/strings.json"], + "spacy.syntax": ["*.pxd"]}, + ext_modules=exts, + cmdclass={'build_ext': build_ext_cython_subclass}, + license="MIT", + ) + + +def run_setup(exts): + setup( + name='spacy', + packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize', + 'spacy.syntax', 'spacy.munge', + 'spacy.tests', + 'spacy.tests.matcher', + 'spacy.tests.morphology', + 'spacy.tests.munge', + 'spacy.tests.parser', + 'spacy.tests.serialize', + 'spacy.tests.spans', + 'spacy.tests.tagger', + 'spacy.tests.tokenizer', + 'spacy.tests.tokens', + 'spacy.tests.vectors', + 'spacy.tests.vocab'], + description="Industrial-strength NLP", + author='Matthew Honnibal', + author_email='honnibal@gmail.com', + version=VERSION, + url="http://honnibal.github.io/spaCy/", + package_data={"spacy": ["*.pxd"], + "spacy.en": ["*.pxd", "data/pos/*", + "data/wordnet/*", "data/tokenizer/*", + "data/vocab/lexemes.bin", + "data/vocab/serializer.json", + "data/vocab/oov_prob", + "data/vocab/strings.txt"], + "spacy.syntax": ["*.pxd"]}, + ext_modules=exts, + license="MIT", + install_requires=['numpy', 'murmurhash', 'cymem >= 
1.30', 'preshed >= 0.43', + 'thinc >= 3.4.1', "text_unidecode", 'plac', 'six', + 'ujson', 'cloudpickle'], + setup_requires=["headers_workaround"], + cmdclass = {'build_ext': build_ext_subclass }, + ) + + import headers_workaround + + headers_workaround.fix_venv_pypy_include() + headers_workaround.install_headers('murmurhash') + headers_workaround.install_headers('numpy') + + +VERSION = '0.97' +def main(modules, is_pypy): + language = "cpp" + includes = ['.', path.join(sys.prefix, 'include')] + if sys.platform.startswith('darwin'): + compile_options['other'].append('-mmacosx-version-min=10.8') + compile_options['other'].append('-stdlib=libc++') + link_options['other'].append('-lc++') + if use_cython: + cython_setup(modules, language, includes) + else: + exts = [c_ext(mn, language, includes) + for mn in modules] + run_setup(exts) + +MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings', + 'spacy.lexeme', 'spacy.vocab', 'spacy.attrs', + 'spacy.morphology', 'spacy.tagger', + 'spacy.syntax.stateclass', + 'spacy._ml', 'spacy._theano', + 'spacy.tokenizer', + 'spacy.syntax.parser', + 'spacy.syntax.transition_system', + 'spacy.syntax.arc_eager', + 'spacy.syntax._parse_features', + 'spacy.gold', 'spacy.orth', + 'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token', + 'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits', + 'spacy.cfile', 'spacy.matcher', + 'spacy.syntax.ner', + 'spacy.symbols'] + + +if __name__ == '__main__': + if sys.argv[1] == 'clean': + clean(MOD_NAMES) + else: + use_cython = sys.argv[1] == 'build_ext' + main(MOD_NAMES, use_cython) diff --git a/spacy/tests/spans/test_merge.py b/spacy/tests/spans/test_merge.py index 2360a0839..315757a0b 100644 --- a/spacy/tests/spans/test_merge.py +++ b/spacy/tests/spans/test_merge.py @@ -1,7 +1,6 @@ from __future__ import unicode_literals import pytest -@pytest.mark.models def test_merge_tokens(EN): tokens = EN(u'Los Angeles start.') assert len(tokens) == 4 @@ -13,7 +12,6 @@ def test_merge_tokens(EN): assert tokens[0].head.orth_ == 'start' -@pytest.mark.models def test_merge_heads(EN): tokens = EN(u'I found a pilates class near work.') assert len(tokens) == 8 @@ -32,7 +30,6 @@ def test_issue_54(EN): text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).' tokens = EN(text) -@pytest.mark.models def test_np_merges(EN): text = u'displaCy is a parse tool built with Javascript' tokens = EN(text) @@ -47,3 +44,27 @@ def test_np_merges(EN): merged = tokens.merge(start, end, label, lemma, label) assert merged != None, (start, end, label, lemma) +def test_entity_merge(EN): + tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale') + assert(len(tokens) == 15) + for ent in tokens.ents: + label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent)) + ent.merge(label, lemma, type_) + # check looping is ok + assert(len(tokens) == 13) + +def test_sentence_update_after_merge(EN): + tokens = EN(u'Stewart Lee is a stand up comedian. 
He lives in England and loves Joe Pasquale') + sent1, sent2 = list(tokens.sents) + init_len = len(sent1) + merge_me = tokens[0:2] + merge_me.merge(u'none', u'none', u'none') + assert(len(sent1) == init_len - 1) + +def test_subtree_size_check(EN): + tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale') + sent1 = list(tokens.sents)[0] + init_len = len(list(sent1.root.subtree)) + merge_me = tokens[0:2] + merge_me.merge(u'none', u'none', u'none') + assert(len(list(sent1.root.subtree)) == init_len - 1) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index d11054e35..555528a33 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -459,7 +459,6 @@ cdef class Doc: def range_from_indices(self, int start_idx, int end_idx): """ Get tuple - span of token indices which correspond to character indices (start_idx, end_idx) if such a span exists""" - assert start_idx < end_idx cdef int i cdef int start = -1 cdef int end = -1 @@ -490,8 +489,6 @@ cdef class Doc: cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth) # House the new merged token where it starts cdef TokenC* token = &self.data[start] - # Update fields - token.lex = lex token.spacy = self.data[end-1].spacy # What to do about morphology?? # TODO: token.morph = ??? @@ -509,6 +506,10 @@ cdef class Doc: # bridges over the entity. Here the alignment of the tokens changes. span_root = span.root.i token.dep = span.root.dep + # We update token.lex after keeping span root and dep, since + # setting token.lex will change span.start and span.end properties + # as it modifies the character offsets in the doc + token.lex = lex for i in range(self.length): self.data[i].head += i # Set the head of the merged token, and its dep relation, from the Span From 770e3637ff5308fef403279d06959cf30b7fb435 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 20:20:42 +0200 Subject: [PATCH 7/8] update data -> c :) --- spacy/tokens/doc.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index c7974bf31..5549d78d4 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -443,7 +443,7 @@ cdef class Doc: """ Get index of token in doc that has character index start_idx """ cdef int i for i in range(self.length): - if self.data[i].idx == start_idx: + if self.c[i].idx == start_idx: return i return None @@ -451,7 +451,7 @@ cdef class Doc: """ Get index+1 of token in doc ending with character index end_idx """ cdef int i for i in range(self.length): - if (self.data[i].idx + self.data[i].lex.length) == end_idx: + if (self.c[i].idx + self.c[i].lex.length) == end_idx: return i + 1 return None From 9fa35951ab7b4e6bbc8f1be40ae30517d4c1bc64 Mon Sep 17 00:00:00 2001 From: Andreas Grivas Date: Wed, 4 Nov 2015 20:21:25 +0200 Subject: [PATCH 8/8] install error test/span -> test/spans --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index d153ae3da..e8715eaef 100644 --- a/setup.py +++ b/setup.py @@ -163,7 +163,7 @@ def run_setup(exts): 'spacy.tests.munge', 'spacy.tests.parser', 'spacy.tests.serialize', - 'spacy.tests.span', + 'spacy.tests.spans', 'spacy.tests.tagger', 'spacy.tests.tokenizer', 'spacy.tests.tokens',
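
Taken together, the patches above do two things: they make `Doc`, `Span` and `Token` printable on both Python 2 and Python 3 by routing `__str__`/`__repr__` through `__unicode__`/`__bytes__` with a `six.PY3` check, and they keep a `Span` valid after `Doc.merge()` by anchoring it to character offsets (`start_idx`/`end_idx`) and re-resolving token indices lazily. Below is a rough, standalone sketch of those two patterns. It is not spaCy code: `MiniToken`, `MiniDoc`, `MiniSpan`, the single-space tokenization and the simplified `merge()` are illustrative assumptions only.

```python
# -*- coding: utf-8 -*-
# Standalone sketch (NOT spaCy's API) of the two patterns in this patch series:
#   1) Py2/Py3-safe __str__/__repr__ via __unicode__/__bytes__ and six.PY3,
#   2) spans that survive merges by storing character offsets and
#      re-resolving token indices on access.
# MiniToken/MiniDoc/MiniSpan and the simplified merge are hypothetical.
from __future__ import print_function, unicode_literals
import six


class MiniToken(object):
    def __init__(self, text, idx):
        self.text = text   # token string
        self.idx = idx     # character offset of the token in the doc

    def __unicode__(self):
        return self.text

    def __bytes__(self):
        return self.text.encode('utf-8')

    def __str__(self):
        # Python 3 wants unicode from __str__, Python 2 wants bytes.
        return self.__unicode__() if six.PY3 else self.__bytes__()

    __repr__ = __str__


class MiniDoc(object):
    def __init__(self, words):
        # Assume tokens separated by single spaces when assigning offsets.
        self.tokens, offset = [], 0
        for w in words:
            self.tokens.append(MiniToken(w, offset))
            offset += len(w) + 1

    def token_index_start(self, start_idx):
        # Index of the token whose character offset equals start_idx, else None.
        for i, t in enumerate(self.tokens):
            if t.idx == start_idx:
                return i
        return None

    def token_index_end(self, end_idx):
        # Index + 1 of the token that ends at character offset end_idx, else None.
        for i, t in enumerate(self.tokens):
            if t.idx + len(t.text) == end_idx:
                return i + 1
        return None

    def merge(self, start, end):
        # Collapse tokens[start:end] into one token; a crude stand-in for
        # Doc.merge().  Character offsets of later tokens are unchanged.
        merged = MiniToken(' '.join(t.text for t in self.tokens[start:end]),
                           self.tokens[start].idx)
        self.tokens[start:end] = [merged]


class MiniSpan(object):
    def __init__(self, doc, start, end):
        self.doc = doc
        # Store character offsets: merges change token indices, not offsets.
        self.start_idx = doc.tokens[start].idx
        self.end_idx = doc.tokens[end - 1].idx + len(doc.tokens[end - 1].text)

    @property
    def start(self):
        return self.doc.token_index_start(self.start_idx)

    @property
    def end(self):
        return self.doc.token_index_end(self.end_idx)


doc = MiniDoc(['I', 'sat', 'at', 'the', 'caf\xe9'])
span = MiniSpan(doc, 3, 5)        # "the café"
doc.merge(0, 2)                   # merge "I sat" into a single token
print(span.start, span.end)       # token indices re-resolved: 2 4
print(doc.tokens[span.end - 1])   # "café"; no UnicodeEncodeError on a Py2 console
```

The design choice this illustrates is that character offsets are the one coordinate system a merge leaves untouched, which is why the real `Span` in these patches keeps `start_idx`/`end_idx` and recomputes `start`/`end` in properties instead of caching token indices.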