assign lex after spans, add tests

Andreas Grivas 2015-11-04 19:49:22 +02:00
parent 015a84a5ec
commit 93918b5c23
3 changed files with 258 additions and 6 deletions

setup.py (new file, 230 lines)

@@ -0,0 +1,230 @@
#!/usr/bin/env python
from setuptools import setup
import shutil
import sys
import os
from os import path
from setuptools import Extension
from distutils import sysconfig
from distutils.core import setup, Extension
from distutils.command.build_ext import build_ext
import platform
# By overriding build_extensions we get the actual compiler that will be
# used, which is really only known after finalize_options has run:
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
compile_options = {'msvc':  ['/Ox', '/EHsc'],
                   'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
link_options = {'msvc':  [],
                'other': []}
class build_ext_options:
    def build_options(self):
        c_type = None
        if self.compiler.compiler_type in compile_options:
            c_type = self.compiler.compiler_type
        elif 'other' in compile_options:
            c_type = 'other'
        if c_type is not None:
            for e in self.extensions:
                e.extra_compile_args = compile_options[c_type]
        l_type = None
        if self.compiler.compiler_type in link_options:
            l_type = self.compiler.compiler_type
        elif 'other' in link_options:
            l_type = 'other'
        if l_type is not None:
            for e in self.extensions:
                e.extra_link_args = link_options[l_type]
class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)
# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place.
# But, this is necessary to get it to compile.
# We have to resort to monkey-patching to set the compiler, because pypy broke
# all the everything.
pre_patch_customize_compiler = sysconfig.customize_compiler

def my_customize_compiler(compiler):
    pre_patch_customize_compiler(compiler)
    compiler.compiler_cxx = ['c++']

if platform.python_implementation() == 'PyPy':
    sysconfig.customize_compiler = my_customize_compiler
#def install_headers():
#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
#    if not path.exists(dest_dir):
#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
#
#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
includes = ['.', path.join(sys.prefix, 'include')]
try:
    import numpy
    numpy_headers = path.join(numpy.get_include(), 'numpy')
    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
except ImportError:
    pass
except OSError:
    pass
def clean(mod_names):
    for name in mod_names:
        name = name.replace('.', '/')
        so = name + '.so'
        html = name + '.html'
        cpp = name + '.cpp'
        c = name + '.c'
        for file_path in [so, html, cpp, c]:
            if os.path.exists(file_path):
                os.unlink(file_path)


def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)


def c_ext(mod_name, language, includes):
    mod_path = name_to_path(mod_name, language)
    return Extension(mod_name, [mod_path], include_dirs=includes)
def cython_setup(mod_names, language, includes):
    import Cython.Distutils
    import Cython.Build
    import distutils.core

    class build_ext_cython_subclass(Cython.Distutils.build_ext, build_ext_options):
        def build_extensions(self):
            build_ext_options.build_options(self)
            Cython.Distutils.build_ext.build_extensions(self)

    if language == 'cpp':
        language = 'c++'
    exts = []
    for mod_name in mod_names:
        mod_path = mod_name.replace('.', '/') + '.pyx'
        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
        exts.append(e)

    distutils.core.setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
                      "spacy.tokens": ["*.pxd"],
                      "spacy.serialize": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/tag_map.json",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/strings.json"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        cmdclass={'build_ext': build_ext_cython_subclass},
        license="MIT",
    )
def run_setup(exts):
    setup(
        name='spacy',
        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
                  'spacy.syntax', 'spacy.munge',
                  'spacy.tests',
                  'spacy.tests.matcher',
                  'spacy.tests.morphology',
                  'spacy.tests.munge',
                  'spacy.tests.parser',
                  'spacy.tests.serialize',
                  'spacy.tests.spans',
                  'spacy.tests.tagger',
                  'spacy.tests.tokenizer',
                  'spacy.tests.tokens',
                  'spacy.tests.vectors',
                  'spacy.tests.vocab'],
        description="Industrial-strength NLP",
        author='Matthew Honnibal',
        author_email='honnibal@gmail.com',
        version=VERSION,
        url="http://honnibal.github.io/spaCy/",
        package_data={"spacy": ["*.pxd"],
                      "spacy.en": ["*.pxd", "data/pos/*",
                                   "data/wordnet/*", "data/tokenizer/*",
                                   "data/vocab/lexemes.bin",
                                   "data/vocab/serializer.json",
                                   "data/vocab/oov_prob",
                                   "data/vocab/strings.txt"],
                      "spacy.syntax": ["*.pxd"]},
        ext_modules=exts,
        license="MIT",
        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
                          'ujson', 'cloudpickle'],
        setup_requires=["headers_workaround"],
        cmdclass={'build_ext': build_ext_subclass},
    )

    import headers_workaround
    headers_workaround.fix_venv_pypy_include()
    headers_workaround.install_headers('murmurhash')
    headers_workaround.install_headers('numpy')
VERSION = '0.97'
def main(modules, is_pypy):
    language = "cpp"
    includes = ['.', path.join(sys.prefix, 'include')]
    if sys.platform.startswith('darwin'):
        compile_options['other'].append('-mmacosx-version-min=10.8')
        compile_options['other'].append('-stdlib=libc++')
        link_options['other'].append('-lc++')
    if use_cython:
        cython_setup(modules, language, includes)
    else:
        exts = [c_ext(mn, language, includes)
                for mn in modules]
        run_setup(exts)
MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
             'spacy.morphology', 'spacy.tagger',
             'spacy.syntax.stateclass',
             'spacy._ml', 'spacy._theano',
             'spacy.tokenizer',
             'spacy.syntax.parser',
             'spacy.syntax.transition_system',
             'spacy.syntax.arc_eager',
             'spacy.syntax._parse_features',
             'spacy.gold', 'spacy.orth',
             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
             'spacy.cfile', 'spacy.matcher',
             'spacy.syntax.ner',
             'spacy.symbols']
if __name__ == '__main__':
    if sys.argv[1] == 'clean':
        clean(MOD_NAMES)
    else:
        use_cython = sys.argv[1] == 'build_ext'
        main(MOD_NAMES, use_cython)
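
For a concrete sense of the convention the script relies on, both clean() and c_ext() map a dotted module name onto a path in the source tree. A minimal sketch (plain Python, not part of the diff; name_to_path is copied from above):

# Sketch of the mod-name -> source-path convention used by clean() and c_ext().
def name_to_path(mod_name, ext):
    return '%s.%s' % (mod_name.replace('.', '/'), ext)

# 'spacy.syntax.parser' builds from spacy/syntax/parser.pyx under Cython;
# without Cython, c_ext() wraps the pre-generated C++ source instead.
assert name_to_path('spacy.syntax.parser', 'pyx') == 'spacy/syntax/parser.pyx'
assert name_to_path('spacy.syntax.parser', 'cpp') == 'spacy/syntax/parser.cpp'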


@@ -1,7 +1,6 @@
from __future__ import unicode_literals
import pytest


@pytest.mark.models
def test_merge_tokens(EN):
    tokens = EN(u'Los Angeles start.')
    assert len(tokens) == 4
@@ -13,7 +12,6 @@ def test_merge_tokens(EN):
    assert tokens[0].head.orth_ == 'start'

@pytest.mark.models
def test_merge_heads(EN):
    tokens = EN(u'I found a pilates class near work.')
    assert len(tokens) == 8
@@ -32,7 +30,6 @@ def test_issue_54(EN):
    text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
    tokens = EN(text)

@pytest.mark.models
def test_np_merges(EN):
    text = u'displaCy is a parse tool built with Javascript'
    tokens = EN(text)
@@ -47,3 +44,27 @@ def test_np_merges(EN):
        merged = tokens.merge(start, end, label, lemma, label)
        assert merged != None, (start, end, label, lemma)

def test_entity_merge(EN):
    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
    assert(len(tokens) == 15)
    for ent in tokens.ents:
        label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
        ent.merge(label, lemma, type_)
    # Check that merging while looping over the entities is OK: the two
    # two-token entities each collapse to one token, so 15 becomes 13.
    assert(len(tokens) == 13)

def test_sentence_update_after_merge(EN):
    tokens = EN(u'Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale')
    sent1, sent2 = list(tokens.sents)
    init_len = len(sent1)
    merge_me = tokens[0:2]
    merge_me.merge(u'none', u'none', u'none')
    assert(len(sent1) == init_len - 1)

def test_subtree_size_check(EN):
    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
    sent1 = list(tokens.sents)[0]
    init_len = len(list(sent1.root.subtree))
    merge_me = tokens[0:2]
    merge_me.merge(u'none', u'none', u'none')
    assert(len(list(sent1.root.subtree)) == init_len - 1)
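
Taken together, the new tests pin merge behaviour down from the document, entity, sentence and subtree sides. For reference, a sketch of the two entry points they exercise, with argument order inferred from the calls above (EN is the English-pipeline test fixture, so this is illustrative rather than standalone code):

# Doc.merge takes character offsets plus the new token's annotations:
#     doc.merge(start_char, end_char, tag, lemma, ent_type) -> Token or None
# Span.merge takes the annotations only, reusing the span's own offsets:
#     span.merge(tag, lemma, ent_type)
doc = EN(u'Stewart Lee is a comedian')
span = doc[0:2]                                # u'Stewart Lee'
span.merge(u'NNP', u'Stewart Lee', u'PERSON')  # 5 tokens -> 4
assert len(doc) == 4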

spacy/tokens/doc.pyx

@@ -459,7 +459,6 @@ cdef class Doc:
    def range_from_indices(self, int start_idx, int end_idx):
        """Get a tuple -- the span of token indices that corresponds to the
        character indices (start_idx, end_idx), if such a span exists."""
        assert start_idx < end_idx
        cdef int i
        cdef int start = -1
        cdef int end = -1
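
The docstring's contract can be illustrated with a pure-Python sketch (toy data and hypothetical standalone code, not the Cython implementation):

tokens = [(0, 'Los'), (4, 'Angeles')]            # (char_idx, text) pairs

def range_from_indices(start_idx, end_idx):
    # Map character offsets to the token span they cover; None when the
    # offsets don't line up with token boundaries.
    start = end = -1
    for i, (idx, text) in enumerate(tokens):
        if idx == start_idx:
            start = i
        if idx + len(text) == end_idx:
            end = i + 1
    return (start, end) if start != -1 and end != -1 else None

assert range_from_indices(0, 11) == (0, 2)       # u'Los Angeles' -> tokens [0, 2)
assert range_from_indices(1, 11) is None         # 1 falls mid-token: no such span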
@@ -490,8 +489,6 @@
        cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
        # House the new merged token where it starts
        cdef TokenC* token = &self.data[start]
        # Update fields
        token.lex = lex
        token.spacy = self.data[end-1].spacy
        # What to do about morphology??
        # TODO: token.morph = ???
@@ -509,6 +506,10 @@
        # bridges over the entity. Here the alignment of the tokens changes.
        span_root = span.root.i
        token.dep = span.root.dep
        # We update token.lex only after reading the span's root and dep:
        # assigning token.lex modifies the character offsets in the doc, which
        # would change the span.start and span.end properties.
        token.lex = lex
        for i in range(self.length):
            self.data[i].head += i
        # Set the head of the merged token, and its dep relation, from the Span
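
That comment is the crux of the commit. As a pure-Python analogy (hypothetical classes, not the real TokenC/LexemeC structs): a Span stores only character offsets, and anything derived from it is recomputed from the doc's current state, so the merged token's new, longer lexeme must be installed only after span.root and span.dep have been read. A minimal sketch:

class Lexeme(object):
    def __init__(self, orth):
        self.orth = orth

class Doc(object):
    def __init__(self, lexemes):
        self.lexemes = lexemes

    def idx(self, i):
        # Character offset of token i, derived from the current lexeme
        # lengths (plus one trailing space per token).
        return sum(len(lex.orth) + 1 for lex in self.lexemes[:i])

class Span(object):
    def __init__(self, doc, start_char, end_char):
        self.doc, self.start_char, self.end_char = doc, start_char, end_char

    @property
    def token_indices(self):
        # Recomputed from the doc's current offsets on every access.
        return [i for i in range(len(self.doc.lexemes))
                if self.start_char <= self.doc.idx(i) < self.end_char]

doc = Doc([Lexeme('Los'), Lexeme('Angeles'), Lexeme('start'), Lexeme('.')])
span = Span(doc, start_char=0, end_char=11)   # covers u'Los Angeles'
assert span.token_indices == [0, 1]           # read span-derived values now...

doc.lexemes[0] = Lexeme('Los Angeles')        # ...because after token.lex = lex
assert span.token_indices == [0]              # the same span resolves differently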