Merge pull request #248 from henningpeters/cythonize

Cythonize
2025-12-23 18:13:13 +03:00 · 2016-02-06 01:44:42 +11:00 · 2016-02-06 01:44:42 +11:00 · 52b2542e79
commit 52b2542e79
parent 8bd16ce8f7 3a50448bf3
1 changed files with 95 additions and 139 deletions
--- a/bin/cythonize.py
+++ b/bin/cythonize.py
@ -1,62 +1,50 @@
 #!/usr/bin/env python
-""" cythonize
+""" cythonize.py
-Cythonize pyx files into C files as needed.
+Cythonize pyx files into C++ files as needed.
-Usage: cythonize [root_dir]
+Usage: cythonize.py [root]
 Default [root_dir] is 'spacy'.
 Checks pyx files to see if they have been changed relative to their
-corresponding C files.  If they have, then runs cython on these files to
+corresponding C++ files. If they have, then runs cython on these files to
-recreate the C files.
+recreate the C++ files.
-The script thinks that the pyx files have changed relative to the C files
+Additionally, checks pxd files and setup.py if they have been changed. If
-by comparing hashes stored in a database file.
+they have, rebuilds everything.
-Simple script to invoke Cython (and Tempita) on all .pyx (.pyx.in)
+Change detection based on file hashes stored in JSON format.
 files; while waiting for a proper build system. Uses file hashes to
 figure out if rebuild is needed.
 For now, this script should be run by developers when changing Cython files
-only, and the resulting C files checked in, so that end-users (and Python-only
+and the resulting C++ files checked in, so that end-users (and Python-only
-developers) do not get the Cython/Tempita dependencies.
+developers) do not get the Cython dependencies.
-Originally written by Dag Sverre Seljebotn, and copied here from:
+Based upon:
 https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
 https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
-Note: this script does not check any of the dependent C libraries; it only
+Note: this script does not check any of the dependent C++ libraries.
 operates on the Cython .pyx files.
 """
-
+from __future__ import print_function
 from __future__ import division, print_function, absolute_import
 import os
 import re
 import sys
 import json
 import hashlib
 import subprocess
 import argparse
 HASH_FILE = 'cythonize.dat'
 DEFAULT_ROOT = 'spacy'
 VENDOR = 'spaCy'
-# WindowsError is not defined on unix systems
+HASH_FILE = 'cythonize.json'
-try:
+
    WindowsError
 except NameError:
    WindowsError = None
 #
 # Rules
 #
 def process_pyx(fromfile, tofile):
    print('Processing %s' % fromfile)
    try:
        from Cython.Compiler.Version import version as cython_version
        from distutils.version import LooseVersion
        if LooseVersion(cython_version) < LooseVersion('0.19'):
-            raise Exception('Building %s requires Cython >= 0.19' % VENDOR)
+            raise Exception('Require Cython >= 0.19')
    except ImportError:
        pass
@ -67,133 +55,101 @@ def process_pyx(fromfile, tofile):
    try:
        try:
-            r = subprocess.call(['cython'] + flags + ["-o", tofile, fromfile])
+            r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile])
            if r != 0:
                raise Exception('Cython failed')
        except OSError:
            # There are ways of installing Cython that don't result in a cython
            # executable on the path, see gh-2397.
            r = subprocess.call([sys.executable, '-c',
-                                 'import sys; from Cython.Compiler.Main import '
+                                'import sys; from Cython.Compiler.Main import '
-                                 'setuptools_main as main; sys.exit(main())'] + flags +
+                                'setuptools_main as main; sys.exit(main())'] + flags +
-                                 ["-o", tofile, fromfile])
+                                ['-o', tofile, fromfile])
            if r != 0:
                raise Exception('Cython failed')
    except OSError:
        raise OSError('Cython needs to be installed')
 def process_tempita_pyx(fromfile, tofile):
    try:
        try:
            from Cython import Tempita as tempita
        except ImportError:
            import tempita
    except ImportError:
        raise Exception('Building %s requires Tempita: '
                        'pip install --user Tempita' % VENDOR)
    with open(fromfile, "r") as f:
        tmpl = f.read()
    pyxcontent = tempita.sub(tmpl)
    assert fromfile.endswith('.pyx.in')
    pyxfile = fromfile[:-len('.pyx.in')] + '.pyx'
    with open(pyxfile, "w") as f:
        f.write(pyxcontent)
    process_pyx(pyxfile, tofile)
 rules = {
    # fromext : function
    '.pyx' : process_pyx,
    '.pyx.in' : process_tempita_pyx
    }
 #
 # Hash db
 #
 def load_hashes(filename):
    # Return { filename : (sha1 of input, sha1 of output) }
    if os.path.isfile(filename):
        hashes = {}
        with open(filename, 'r') as f:
            for line in f:
                filename, inhash, outhash = line.split()
                hashes[filename] = (inhash, outhash)
    else:
        hashes = {}
    return hashes
 def save_hashes(hash_db, filename):
    with open(filename, 'w') as f:
        for key, value in sorted(hash_db.items()):
            f.write("%s %s %s\n" % (key, value[0], value[1]))
 def sha1_of_file(filename):
    h = hashlib.sha1()
    with open(filename, "rb") as f:
        h.update(f.read())
    return h.hexdigest()
 #
 # Main program
 #
 def normpath(path):
    path = path.replace(os.sep, '/')
    if path.startswith('./'):
        path = path[2:]
    return path
 def get_hash(frompath, topath):
    from_hash = sha1_of_file(frompath)
    to_hash = sha1_of_file(topath) if os.path.exists(topath) else None
    return (from_hash, to_hash)
 def process(path, fromfile, tofile, processor_function, hash_db):
    fullfrompath = os.path.join(path, fromfile)
    fulltopath = os.path.join(path, tofile)
    current_hash = get_hash(fullfrompath, fulltopath)
    if current_hash == hash_db.get(normpath(fullfrompath), None):
        print('%s has not changed' % fullfrompath)
        return
 def preserve_cwd(path, func, *args):
    orig_cwd = os.getcwd()
    try:
        os.chdir(path)
-        print('Processing %s' % fullfrompath)
+        func(*args)
        processor_function(fromfile, tofile)
    finally:
        os.chdir(orig_cwd)
    # changed target file, recompute hash
    current_hash = get_hash(fullfrompath, fulltopath)
    # store hash in db
    hash_db[normpath(fullfrompath)] = current_hash
-def find_process_files(root_dir):
+def load_hashes(filename):
    hash_db = load_hashes(HASH_FILE)
    for cur_dir, dirs, files in os.walk(root_dir):
        for filename in files:
            in_file = os.path.join(cur_dir, filename + ".in")
            if filename.endswith('.pyx') and os.path.isfile(in_file):
                continue
            for fromext, function in rules.items():
                if filename.endswith(fromext):
                    toext = ".cpp"
                    # with open(os.path.join(cur_dir, filename), 'rb') as f:
                    #     data = f.read()
                    #     m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M)
                    #     if m:
                    #         toext = ".cxx"
                    fromfile = filename
                    tofile = filename[:-len(fromext)] + toext
                    process(cur_dir, fromfile, tofile, function, hash_db)
                    save_hashes(hash_db, HASH_FILE)
 def main():
    try:
-        root_dir = sys.argv[1]
+        return json.load(open(filename))
-    except IndexError:
+    except (ValueError, IOError):
-        root_dir = DEFAULT_ROOT
+        return {}
-    find_process_files(root_dir)
+
 def save_hashes(hash_db, filename):
    json.dump(hash_db, open(filename, 'wb'))
 def get_hash(path):
    return hashlib.md5(open(path, 'rb').read()).hexdigest()
 def hash_changed(base, path, db):
    full_path = os.path.normpath(os.path.join(base, path))
    return not get_hash(full_path) == db.get(full_path)
 def hash_add(base, path, db):
    full_path = os.path.normpath(os.path.join(base, path))
    db[full_path] = get_hash(full_path)
 def process(base, filename, db):
    root, ext = os.path.splitext(filename)
    if ext in ['.pyx', '.cpp']:
        if hash_changed(base, filename, db):
            preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
            hash_add(base, root + '.cpp', db)
            hash_add(base, root + '.pyx', db)
 def check_changes(root, db):
    res = False
    new_db = {}
    setup_filename = 'setup.py'
    hash_add('.', setup_filename, new_db)
    if hash_changed('.', setup_filename, db):
        res = True
    for base, _, files in os.walk(root):
        for filename in files:
            if filename.endswith('.pxd'):
                hash_add(base, filename, new_db)
                if hash_changed(base, filename, db):
                    res = True
    if res:
        db.clear()
        db.update(new_db)
    return res
 def run(root):
    db = load_hashes(HASH_FILE)
    try:
        check_changes(root, db)
        for base, _, files in os.walk(root):
            for filename in files:
                process(base, filename, db)
    finally:
        save_hashes(db, HASH_FILE)
 if __name__ == '__main__':
-    main()
+    parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed')
    parser.add_argument('root', help='root directory')
    args = parser.parse_args()
    run(args.root)