diff --git a/setup.py b/setup.py
new file mode 100644
index 000000000..1d719c626
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,230 @@
+#!/usr/bin/env python
+from setuptools import setup
+import shutil
+
+import sys
+import os
+from os import path
+
+from setuptools import Extension
+from distutils import sysconfig
+from distutils.core import setup, Extension
+from distutils.command.build_ext import build_ext
+
+import platform
+
+# By subclassing build_ext and overriding build_extensions we have the actual compiler
+# that will be used, which is really known only after finalize_options
+# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
+compile_options = {'msvc' : ['/Ox', '/EHsc'],
+                   'other' : ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}
+link_options = {'msvc' : [],
+                'other' : []}
+
+
+class build_ext_options:
+    def build_options(self):
+        c_type = None
+        if self.compiler.compiler_type in compile_options:
+            c_type = self.compiler.compiler_type
+        elif 'other' in compile_options:
+            c_type = 'other'
+        if c_type is not None:
+            for e in self.extensions:
+                e.extra_compile_args = compile_options[c_type]
+
+        l_type = None
+        if self.compiler.compiler_type in link_options:
+            l_type = self.compiler.compiler_type
+        elif 'other' in link_options:
+            l_type = 'other'
+        if l_type is not None:
+            for e in self.extensions:
+                e.extra_link_args = link_options[l_type]
+
+
+class build_ext_subclass( build_ext, build_ext_options ):
+    def build_extensions(self):
+        build_ext_options.build_options(self)
+        build_ext.build_extensions(self)
+
+
+# PyPy --- NB! PyPy doesn't really work, it segfaults all over the place. But,
+# this is necessary to get it to compile.
+# We have to resort to monkey-patching to set the compiler, because pypy broke
+# all the everything.
+
+pre_patch_customize_compiler = sysconfig.customize_compiler
+def my_customize_compiler(compiler):
+    pre_patch_customize_compiler(compiler)
+    compiler.compiler_cxx = ['c++']
+
+
+if platform.python_implementation() == 'PyPy':
+    sysconfig.customize_compiler = my_customize_compiler
+
+#def install_headers():
+#    dest_dir = path.join(sys.prefix, 'include', 'murmurhash')
+#    if not path.exists(dest_dir):
+#        shutil.copytree('murmurhash/headers/murmurhash', dest_dir)
+#
+#    dest_dir = path.join(sys.prefix, 'include', 'numpy')
+
+
+includes = ['.', path.join(sys.prefix, 'include')]
+
+
+try:
+    import numpy
+    numpy_headers = path.join(numpy.get_include(), 'numpy')
+    shutil.copytree(numpy_headers, path.join(sys.prefix, 'include', 'numpy'))
+except ImportError:
+    pass
+except OSError:
+    pass
+
+
+def clean(mod_names):
+    for name in mod_names:
+        name = name.replace('.', '/')
+        so = name + '.so'
+        html = name + '.html'
+        cpp = name + '.cpp'
+        c = name + '.c'
+        for file_path in [so, html, cpp, c]:
+            if os.path.exists(file_path):
+                os.unlink(file_path)
+
+
+def name_to_path(mod_name, ext):
+    return '%s.%s' % (mod_name.replace('.', '/'), ext)
+
+
+def c_ext(mod_name, language, includes):
+    mod_path = name_to_path(mod_name, language)
+    return Extension(mod_name, [mod_path], include_dirs=includes)
+
+
+def cython_setup(mod_names, language, includes):
+    import Cython.Distutils
+    import Cython.Build
+    import distutils.core
+
+    class build_ext_cython_subclass( Cython.Distutils.build_ext, build_ext_options ):
+        def build_extensions(self):
+            build_ext_options.build_options(self)
+            Cython.Distutils.build_ext.build_extensions(self)
+
+    if language == 'cpp':
+        language = 'c++'
+    exts = []
+    for mod_name in mod_names:
+        mod_path = mod_name.replace('.', '/') + '.pyx'
+        e = Extension(mod_name, [mod_path], language=language, include_dirs=includes)
+        exts.append(e)
+    distutils.core.setup(
+        name='spacy',
+        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
+                  'spacy.syntax', 'spacy.munge'],
+        description="Industrial-strength NLP",
+        author='Matthew Honnibal',
+        author_email='honnibal@gmail.com',
+        version=VERSION,
+        url="http://honnibal.github.io/spaCy/",
+        package_data={"spacy": ["*.pxd", "tests/*.py", "tests/*/*.py"],
+                      "spacy.tokens": ["*.pxd"],
+                      "spacy.serialize": ["*.pxd"],
+                      "spacy.en": ["*.pxd", "data/pos/*",
+                                   "data/wordnet/*", "data/tokenizer/*",
+                                   "data/vocab/tag_map.json",
+                                   "data/vocab/lexemes.bin",
+                                   "data/vocab/strings.json"],
+                      "spacy.syntax": ["*.pxd"]},
+        ext_modules=exts,
+        cmdclass={'build_ext': build_ext_cython_subclass},
+        license="MIT",
+    )
+
+
+def run_setup(exts):
+    setup(
+        name='spacy',
+        packages=['spacy', 'spacy.tokens', 'spacy.en', 'spacy.serialize',
+                  'spacy.syntax', 'spacy.munge',
+                  'spacy.tests',
+                  'spacy.tests.matcher',
+                  'spacy.tests.morphology',
+                  'spacy.tests.munge',
+                  'spacy.tests.parser',
+                  'spacy.tests.serialize',
+                  'spacy.tests.spans',
+                  'spacy.tests.tagger',
+                  'spacy.tests.tokenizer',
+                  'spacy.tests.tokens',
+                  'spacy.tests.vectors',
+                  'spacy.tests.vocab'],
+        description="Industrial-strength NLP",
+        author='Matthew Honnibal',
+        author_email='honnibal@gmail.com',
+        version=VERSION,
+        url="http://honnibal.github.io/spaCy/",
+        package_data={"spacy": ["*.pxd"],
+                      "spacy.en": ["*.pxd", "data/pos/*",
+                                   "data/wordnet/*", "data/tokenizer/*",
+                                   "data/vocab/lexemes.bin",
+                                   "data/vocab/serializer.json",
+                                   "data/vocab/oov_prob",
+                                   "data/vocab/strings.txt"],
+                      "spacy.syntax": ["*.pxd"]},
+        ext_modules=exts,
+        license="MIT",
+        install_requires=['numpy', 'murmurhash', 'cymem >= 1.30', 'preshed >= 0.43',
+                          'thinc >= 3.4.1', "text_unidecode", 'plac', 'six',
+                          'ujson', 'cloudpickle'],
+        setup_requires=["headers_workaround"],
+        cmdclass = {'build_ext': build_ext_subclass },
+    )
+
+    import headers_workaround
+
+    headers_workaround.fix_venv_pypy_include()
+    headers_workaround.install_headers('murmurhash')
+    headers_workaround.install_headers('numpy')
+
+
+VERSION = '0.97'
+def main(modules, is_pypy):
+    language = "cpp"
+    includes = ['.', path.join(sys.prefix, 'include')]
+    if sys.platform.startswith('darwin'):
+        compile_options['other'].append('-mmacosx-version-min=10.8')
+        compile_options['other'].append('-stdlib=libc++')
+        link_options['other'].append('-lc++')
+    if use_cython:
+        cython_setup(modules, language, includes)
+    else:
+        exts = [c_ext(mn, language, includes)
+                for mn in modules]
+        run_setup(exts)
+
+MOD_NAMES = ['spacy.parts_of_speech', 'spacy.strings',
+             'spacy.lexeme', 'spacy.vocab', 'spacy.attrs',
+             'spacy.morphology', 'spacy.tagger',
+             'spacy.syntax.stateclass',
+             'spacy._ml', 'spacy._theano',
+             'spacy.tokenizer',
+             'spacy.syntax.parser',
+             'spacy.syntax.transition_system',
+             'spacy.syntax.arc_eager',
+             'spacy.syntax._parse_features',
+             'spacy.gold', 'spacy.orth',
+             'spacy.tokens.doc', 'spacy.tokens.spans', 'spacy.tokens.token',
+             'spacy.serialize.packer', 'spacy.serialize.huffman', 'spacy.serialize.bits',
+             'spacy.cfile', 'spacy.matcher',
+             'spacy.syntax.ner',
+             'spacy.symbols']
+
+
+if __name__ == '__main__':
+    if sys.argv[1] == 'clean':
+        clean(MOD_NAMES)
+    else:
+        use_cython = sys.argv[1] == 'build_ext'
+        main(MOD_NAMES, use_cython)
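
Note (not part of the diff): the build_ext_options / build_ext_subclass machinery near the top of setup.py defers the choice of compiler flags until build_extensions() runs, because the concrete compiler type is only known after finalize_options. A minimal standalone sketch of that selection logic; pick_flags is an invented helper name used purely for illustration:

    compile_options = {'msvc':  ['/Ox', '/EHsc'],
                       'other': ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']}

    def pick_flags(compiler_type):
        # MSVC gets its own switches; any other compiler type falls back to 'other'.
        return compile_options.get(compiler_type, compile_options['other'])

    print(pick_flags('msvc'))  # ['/Ox', '/EHsc']
    print(pick_flags('unix'))  # ['-O3', '-Wno-strict-prototypes', '-Wno-unused-function']
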
diff --git a/spacy/tests/spans/test_merge.py b/spacy/tests/spans/test_merge.py
index 2360a0839..315757a0b 100644
--- a/spacy/tests/spans/test_merge.py
+++ b/spacy/tests/spans/test_merge.py
@@ -1,7 +1,6 @@
 from __future__ import unicode_literals
 import pytest
 
-@pytest.mark.models
 def test_merge_tokens(EN):
     tokens = EN(u'Los Angeles start.')
     assert len(tokens) == 4
@@ -13,7 +12,6 @@ def test_merge_tokens(EN):
     assert tokens[0].head.orth_ == 'start'
 
 
-@pytest.mark.models
 def test_merge_heads(EN):
     tokens = EN(u'I found a pilates class near work.')
     assert len(tokens) == 8
@@ -32,7 +30,6 @@ def test_issue_54(EN):
     text = u'Talks given by women had a slightly higher number of questions asked (3.2$\pm$0.2) than talks given by men (2.6$\pm$0.1).'
     tokens = EN(text)
 
-@pytest.mark.models
 def test_np_merges(EN):
     text = u'displaCy is a parse tool built with Javascript'
     tokens = EN(text)
@@ -47,3 +44,27 @@ def test_np_merges(EN):
         merged = tokens.merge(start, end, label, lemma, label)
         assert merged != None, (start, end, label, lemma)
 
+def test_entity_merge(EN):
+    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
+    assert(len(tokens) == 15)
+    for ent in tokens.ents:
+        label, lemma, type_ = (ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent))
+        ent.merge(label, lemma, type_)
+    # check looping is ok
+    assert(len(tokens) == 13)
+
+def test_sentence_update_after_merge(EN):
+    tokens = EN(u'Stewart Lee is a stand up comedian. He lives in England and loves Joe Pasquale')
+    sent1, sent2 = list(tokens.sents)
+    init_len = len(sent1)
+    merge_me = tokens[0:2]
+    merge_me.merge(u'none', u'none', u'none')
+    assert(len(sent1) == init_len - 1)
+
+def test_subtree_size_check(EN):
+    tokens = EN(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')
+    sent1 = list(tokens.sents)[0]
+    init_len = len(list(sent1.root.subtree))
+    merge_me = tokens[0:2]
+    merge_me.merge(u'none', u'none', u'none')
+    assert(len(list(sent1.root.subtree)) == init_len - 1)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index d11054e35..555528a33 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -459,7 +459,6 @@ cdef class Doc:
     def range_from_indices(self, int start_idx, int end_idx):
         """ Get tuple - span of token indices which correspond to
         character indices (start_idx, end_idx) if such a span exists"""
-        assert start_idx < end_idx
         cdef int i
         cdef int start = -1
         cdef int end = -1
@@ -490,8 +489,6 @@ cdef class Doc:
         cdef const LexemeC* lex = self.vocab.get(self.mem, new_orth)
         # House the new merged token where it starts
         cdef TokenC* token = &self.data[start]
-        # Update fields
-        token.lex = lex
         token.spacy = self.data[end-1].spacy
         # What to do about morphology??
         # TODO: token.morph = ???
@@ -509,6 +506,10 @@ cdef class Doc:
         # bridges over the entity. Here the alignment of the tokens changes.
         span_root = span.root.i
         token.dep = span.root.dep
+        # We update token.lex after keeping span root and dep, since
+        # setting token.lex will change span.start and span.end properties
+        # as it modifies the character offsets in the doc
+        token.lex = lex
         for i in range(self.length):
             self.data[i].head += i
         # Set the head of the merged token, and its dep relation, from the Span
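
Note (not part of the diff): a minimal sketch of the merge behaviour the new tests exercise, written against the 0.x API the tests themselves use. The spacy.en.English entry point and the three-argument Span.merge signature are taken from the test file above; it assumes the English model data is installed, the same precondition as the EN fixture:

    from spacy.en import English

    nlp = English()
    doc = nlp(u'Stewart Lee is a stand up comedian who lives in England and loves Joe Pasquale')

    # Merge each entity span into a single token, as test_entity_merge does.
    for ent in doc.ents:
        label, lemma, type_ = ent.root.tag_, ent.root.lemma_, max(w.ent_type_ for w in ent)
        ent.merge(label, lemma, type_)

    # 'Stewart Lee' and 'Joe Pasquale' each collapse to one token (15 -> 13), and
    # sentence and subtree sizes seen through pre-existing Span objects shrink with them,
    # which is what test_sentence_update_after_merge and test_subtree_size_check assert.
    print(len(doc), [w.orth_ for w in doc])
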