changed start end to properties - allow merge

Andreas Grivas 2015-11-03 17:13:42 +02:00
parent 7691737fad
commit c780bbda3e
4 changed files with 61 additions and 242 deletions

setup.py (deleted; all 230 lines removed)

@@ -1,230 +0,0 @@
#!/usr/bin/env python
from setuptools import setup
import shutil
import sys
import os
from os import path

from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
from distutils.command.build_ext import build_ext

import platform

# By subclassing build_extensions we have the actual compiler that will be used,
# which is really known only after finalize_options
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc'  : ['/Ox', '/EHsc'],
                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options    = {'msvc'  : [],
                   'other' : []}


class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]

        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]


class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)


# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
# this is necessary to get it compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.
pre_patch_customize_compiler = sysconfig.customize_compiler

def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']

if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler


#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')

includes = ['.', path.join(sys.prefix, 'include')]

try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass


def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)


def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)
    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.json"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )


def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/serializer.json",
                                   "data/vocab/oov_prob",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle'],
        setup_requires=["headers_workaround"],
        cmdclass={'build_ext': build_ext_subclass},
    )

    import headers_workaround

    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')


VERSION = '0.97'

def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)


MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']


if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
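
The interesting part of the deleted file is the comment near the top: the concrete compiler is only known once build_extensions() runs, so per-compiler flags have to be injected from a build_ext subclass rather than passed to setup(). A minimal standalone sketch of that pattern (package, module, and flag names here are illustrative, not spaCy's):

    # sketch: choose per-compiler flags inside build_extensions(), where
    # self.compiler.compiler_type is finally known (it isn't during setup())
    from setuptools import setup, Extension
    from setuptools.command.build_ext import build_ext

    COMPILE_ARGS = {'msvc': ['/Ox'], 'unix': ['-O3']}

    class build_ext_with_options(build_ext):
        def build_extensions(self):
            # compiler_type is e.g. 'msvc' or 'unix'; fall back to no extra flags
            args = COMPILE_ARGS.get(self.compiler.compiler_type, [])
            for ext in self.extensions:
                ext.extra_compile_args = ext.extra_compile_args or args
            build_ext.build_extensions(self)

    setup(
        name='example',  # illustrative package name
        ext_modules=[Extension('example._fast', ['example/_fast.c'])],
        cmdclass={'build_ext': build_ext_with_options},
    )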

spacy/tokens/doc.pyx

@@ -439,11 +439,23 @@ cdef class Doc:
         keep_reading = False
         yield n_bytes_str + data
 
-    # This function is terrible --- need to fix this.
-    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
-              unicode ent_type):
-        """Merge a multi-word expression into a single token.  Currently
-        experimental; API is likely to change."""
+    def token_index_start(self, int start_idx):
+        cdef int i
+        for i in range(self.length):
+            if self.data[i].idx == start_idx:
+                return i
+        return None
+
+    def token_index_end(self, int end_idx):
+        cdef int i
+        for i in range(self.length):
+            if (self.data[i].idx + self.data[i].lex.length) == end_idx:
+                return i + 1
+        return None
+
+    def range_from_indices(self, int start_idx, int end_idx):
+        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -454,10 +466,18 @@ cdef class Doc:
                 if start == -1:
                     return None
                 end = i + 1
-                break
-        else:
-            return None
+                return (start, end)
+        return None
+
+    # This function is terrible --- need to fix this.
+    def merge(self, int start_idx, int end_idx, unicode tag, unicode lemma,
+              unicode ent_type):
+        """Merge a multi-word expression into a single token.  Currently
+        experimental; API is likely to change."""
+        start_end = self.range_from_indices(start_idx, end_idx)
+        if start_end is None:
+            return None
+        start, end = start_end
         cdef Span span = self[start:end]
         # Get LexemeC for newly merged token
         new_orth = ''.join([t.text_with_ws for t in span])
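
The three helpers added above all rest on the same linear scan: a character range maps to a token range only if some token starts exactly at start_idx and some token ends exactly at end_idx. A plain-Python sketch of the range_from_indices logic, assuming each token exposes its start character offset idx and its length (as the token structs do here):

    # sketch of the lookup in range_from_indices, in plain Python;
    # `tokens` stands in for the doc's token array
    def range_from_indices(tokens, start_idx, end_idx):
        assert start_idx < end_idx
        start = -1
        for i, tok in enumerate(tokens):
            if tok.idx == start_idx:
                start = i                  # a token begins exactly here
            if tok.idx + tok.length == end_idx:
                if start == -1:
                    return None            # end matched before any start
                return (start, i + 1)      # half-open token range
        return None                        # offsets don't align with token bounds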

spacy/tokens/spans.pxd

@@ -4,8 +4,10 @@ from .doc cimport Doc
 cdef class Span:
     cdef readonly Doc doc
     cdef public int i
-    cdef public int start
-    cdef public int end
+    cdef public int start_token
+    cdef public int end_token
+    cdef public int start_idx
+    cdef public int end_idx
     cdef readonly int label
 
     cdef public _vector

spacy/tokens/spans.pyx

@@ -21,8 +21,11 @@ cdef class Span:
             raise IndexError
 
         self.doc = tokens
-        self.start = start
-        self.end = end
+        # keep char offsets - as these don't change when merging spans
+        self.start_token = start
+        self.start_idx = self.doc[start].idx
+        self.end_token = end
+        self.end_idx = self.doc[end - 1].idx + len(self.doc[end - 1])
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -76,6 +79,30 @@ cdef class Span:
             return 0.0
         return numpy.dot(self.vector, other.vector) / (self.vector_norm * other.vector_norm)
 
+    property start:
+        def __get__(self):
+            # if we haven't merged anything, the check below is false - so we get the start token
+            if self.start_token >= len(self.doc) or self.doc[self.start_token].idx != self.start_idx:
+                new_start = self.doc.token_index_start(self.start_idx)
+                if new_start is not None:
+                    self.start_token = new_start
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.start_idx)
+            return self.start_token
+
+    property end:
+        def __get__(self):
+            # if we haven't merged anything we have fast access
+            if self.end_token >= len(self.doc) or self.doc[self.end_token - 1].idx != self.end_idx:
+                new_end = self.doc.token_index_end(self.end_idx)
+                if new_end is not None:
+                    self.end_token = new_end
+                else:
+                    raise IndexError('Something went terribly wrong during a merge. '
+                                     'No token found with idx %s' % self.end_idx)
+            return self.end_token
+
     property vector:
         def __get__(self):
             if self._vector is None:
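
The net effect of the commit: merging tokens to the left of a Span shifts token indices, but character offsets stay fixed, so the new start/end properties can detect a stale cached index and rescan the doc. A usage sketch against the 0.x-era API (the sentence and offsets are illustrative, not from the source):

    from spacy.en import English  # 0.x-era entry point, as packaged in setup.py above

    nlp = English()
    doc = nlp(u'New York is a busy city')
    span = doc[4:6]               # u'busy city'; start_idx/end_idx cached on creation

    # merge u'New York' (characters 0-8) into one token: every token to the
    # right shifts left by one position, but character offsets are unchanged
    doc.merge(0, 8, u'NNP', u'New York', u'GPE')

    # span.start sees that doc[4] no longer begins at the cached start_idx,
    # rescans via doc.token_index_start(), and the span still covers the
    # same words - now as doc[3:5]
    print([t.text_with_ws for t in doc[span.start:span.end]])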