changed start end to properties - allow merge

Andreas Grivas 2015-11-03 17:13:42 +02:00
parent 7691737fad
commit c780bbda3e
4 changed files with 61 additions and 242 deletions

setup.py

@@ -1,230 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
import shutil
import sys
import os
from os import path

from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
from distutils.command.build_ext import build_ext
import platform

# By subclassing build_extensions we have the actual compiler that will be
# used, which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc':  ['/Ox', '/EHsc'],
                   'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options = {'msvc':  [],
                'other': []}


class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]
        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]


class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)


# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
# this is necessary to get it to compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.
pre_patch_customize_compiler = sysconfig.customize_compiler

def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']

if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler


#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')

includes = ['.', path.join(sys.prefix, 'include')]

try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass


def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)


def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)
    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.json"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )


def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/serializer.json",
                                   "data/vocab/oov_prob",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle'],
        setup_requires=["headers_workaround"],
        cmdclass={'build_ext': build_ext_subclass},
    )

    import headers_workaround

    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')


VERSION = '0.97'


def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)


MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']


if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
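
The piece of this deleted file worth keeping in mind is the compiler dispatch: build_ext_options.build_options runs at build_extensions time, because self.compiler.compiler_type is only known after finalize_options, and every non-MSVC toolchain falls back to the 'other' bucket. A minimal standalone sketch of that lookup (the 'unix' compiler-type string is illustrative, not from the file):

    # Sketch of the flag-dispatch pattern used in setup.py above.
    compile_options = {'msvc': ['/Ox', '/EHsc'],
                       'other': ['-O3', '-Wno-strict-prototypes']}

    def pick_flags(compiler_type, options):
        # Only 'msvc' is special-cased; anything else gets the 'other' bucket.
        key = compiler_type if compiler_type in options else 'other'
        return options.get(key, [])

    assert pick_flags('msvc', compile_options) == ['/Ox', '/EHsc']
    assert pick_flags('unix', compile_options) == ['-O3', '-Wno-strict-prototypes']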

spacy/tokens/doc.pyx

@@ -439,11 +439,23 @@ cdef class Doc:
                 keep_reading = False
             yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token. Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        cdef int i
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        cdef int i
+        for i in range(self.length):
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -454,10 +466,18 @@
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
+
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token. Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
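
In plain terms, the helpers added above scan the token table to translate character offsets into token indices, which is what lets merge (and, below, Span) address tokens by offsets that survive re-tokenization. A rough equivalent over toy records - Tok and the example sentence are hypothetical stand-ins for the TokenC array:

    from collections import namedtuple

    Tok = namedtuple('Tok', ['idx', 'length'])  # idx = char offset of token start

    # "New York is great" -> New | York | is | great
    tokens = [Tok(0, 3), Tok(4, 4), Tok(9, 2), Tok(12, 5)]

    def token_index_start(tokens, start_idx):
        # Index of the token beginning exactly at start_idx, else None.
        for i, t in enumerate(tokens):
            if t.idx == start_idx:
                return i
        return None

    def token_index_end(tokens, end_idx):
        # Exclusive slice end for the token ending exactly at end_idx, else None.
        for i, t in enumerate(tokens):
            if t.idx + t.length == end_idx:
                return i + 1
        return None

    assert token_index_start(tokens, 4) == 1   # "York" starts at char 4
    assert token_index_end(tokens, 8) == 2     # "York" ends at char 8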

spacy/tokens/spans.pxd

@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector
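
In effect the declarations above store each span twice: as token positions (start_token, end_token), which go stale when Doc.merge rewrites the token array, and as character offsets (start_idx, end_idx), which never move. Reusing the toy Tok records and lookup helpers from the doc.pyx sketch above, a merge shifts the token indices while the offsets keep identifying the same span:

    # "New York is great": the span over "is" is chars [9, 11) throughout.
    before = [Tok(0, 3), Tok(4, 4), Tok(9, 2), Tok(12, 5)]  # New | York | is | great
    after = [Tok(0, 8), Tok(9, 2), Tok(12, 5)]              # New York | is | great

    assert (token_index_start(before, 9), token_index_end(before, 11)) == (2, 3)
    assert (token_index_start(after, 9), token_index_end(after, 11)) == (1, 2)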

spacy/tokens/spans.pyx

@@ -21,8 +21,11 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        # keep char offsets - as these don't change when merging spans
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -76,6 +79,30 @@
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        def __get__(self):
+            # if we haven't merged anything, the check below is false - so we get the start token
+            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is not None:
+                    self.start_token = new_start
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+            return self.start_token
+
+    property end:
+        def __get__(self):
+            # if we haven't merged anything, we have fast access
+            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is not None:
+                    self.end_token = new_end
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None:
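
Taken together, a Span created before a merge keeps resolving to the right tokens afterwards, because its start/end properties re-derive token positions from the stored character offsets. A hedged end-to-end sketch against the 0.97-era API shown in this commit (assumes the spacy.en data is installed; the tag/lemma/entity arguments are illustrative):

    from spacy.en import English

    nlp = English()
    doc = nlp(u'New York is great')
    span = doc[2:4]   # "is great": start_token=2, end_token=4 at creation

    # Merge chars [0, 8) ("New York") into one token via range_from_indices.
    doc.merge(0, 8, u'NNP', u'New York', u'GPE')

    # The cached token indices are now stale; Span.start / Span.end notice the
    # idx mismatch and re-resolve via Doc.token_index_start / token_index_end.
    assert span.start == 1   # "is" moved from token 2 to token 1
    assert span.end == 3     # "great" now ends the span at token index 3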