mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 01:46:28 +03:00
commit
52b2542e79
234
bin/cythonize.py
234
bin/cythonize.py
|
@ -1,62 +1,50 @@
|
||||||
#!/usr/bin/env python
|
#!/usr/bin/env python
|
||||||
""" cythonize
|
""" cythonize.py
|
||||||
|
|
||||||
Cythonize pyx files into C files as needed.
|
Cythonize pyx files into C++ files as needed.
|
||||||
|
|
||||||
Usage: cythonize [root_dir]
|
Usage: cythonize.py [root]
|
||||||
|
|
||||||
Default [root_dir] is 'spacy'.
|
|
||||||
|
|
||||||
Checks pyx files to see if they have been changed relative to their
|
Checks pyx files to see if they have been changed relative to their
|
||||||
corresponding C files. If they have, then runs cython on these files to
|
corresponding C++ files. If they have, then runs cython on these files to
|
||||||
recreate the C files.
|
recreate the C++ files.
|
||||||
|
|
||||||
The script thinks that the pyx files have changed relative to the C files
|
Additionally, checks pxd files and setup.py if they have been changed. If
|
||||||
by comparing hashes stored in a database file.
|
they have, rebuilds everything.
|
||||||
|
|
||||||
Simple script to invoke Cython (and Tempita) on all .pyx (.pyx.in)
|
Change detection based on file hashes stored in JSON format.
|
||||||
files; while waiting for a proper build system. Uses file hashes to
|
|
||||||
figure out if rebuild is needed.
|
|
||||||
|
|
||||||
For now, this script should be run by developers when changing Cython files
|
For now, this script should be run by developers when changing Cython files
|
||||||
only, and the resulting C files checked in, so that end-users (and Python-only
|
and the resulting C++ files checked in, so that end-users (and Python-only
|
||||||
developers) do not get the Cython/Tempita dependencies.
|
developers) do not get the Cython dependencies.
|
||||||
|
|
||||||
Originally written by Dag Sverre Seljebotn, and copied here from:
|
Based upon:
|
||||||
|
|
||||||
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
|
https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py
|
||||||
|
https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py
|
||||||
|
|
||||||
Note: this script does not check any of the dependent C libraries; it only
|
Note: this script does not check any of the dependent C++ libraries.
|
||||||
operates on the Cython .pyx files.
|
|
||||||
"""
|
"""
|
||||||
|
from __future__ import print_function
|
||||||
from __future__ import division, print_function, absolute_import
|
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import re
|
|
||||||
import sys
|
import sys
|
||||||
|
import json
|
||||||
import hashlib
|
import hashlib
|
||||||
import subprocess
|
import subprocess
|
||||||
|
import argparse
|
||||||
|
|
||||||
HASH_FILE = 'cythonize.dat'
|
|
||||||
DEFAULT_ROOT = 'spacy'
|
|
||||||
VENDOR = 'spaCy'
|
|
||||||
|
|
||||||
# WindowsError is not defined on unix systems
|
HASH_FILE = 'cythonize.json'
|
||||||
try:
|
|
||||||
WindowsError
|
|
||||||
except NameError:
|
|
||||||
WindowsError = None
|
|
||||||
|
|
||||||
#
|
|
||||||
# Rules
|
|
||||||
#
|
|
||||||
def process_pyx(fromfile, tofile):
|
def process_pyx(fromfile, tofile):
|
||||||
|
print('Processing %s' % fromfile)
|
||||||
try:
|
try:
|
||||||
from Cython.Compiler.Version import version as cython_version
|
from Cython.Compiler.Version import version as cython_version
|
||||||
from distutils.version import LooseVersion
|
from distutils.version import LooseVersion
|
||||||
if LooseVersion(cython_version) < LooseVersion('0.19'):
|
if LooseVersion(cython_version) < LooseVersion('0.19'):
|
||||||
raise Exception('Building %s requires Cython >= 0.19' % VENDOR)
|
raise Exception('Require Cython >= 0.19')
|
||||||
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
pass
|
pass
|
||||||
|
@ -67,133 +55,101 @@ def process_pyx(fromfile, tofile):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
try:
|
try:
|
||||||
r = subprocess.call(['cython'] + flags + ["-o", tofile, fromfile])
|
r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile])
|
||||||
if r != 0:
|
if r != 0:
|
||||||
raise Exception('Cython failed')
|
raise Exception('Cython failed')
|
||||||
except OSError:
|
except OSError:
|
||||||
# There are ways of installing Cython that don't result in a cython
|
# There are ways of installing Cython that don't result in a cython
|
||||||
# executable on the path, see gh-2397.
|
# executable on the path, see gh-2397.
|
||||||
r = subprocess.call([sys.executable, '-c',
|
r = subprocess.call([sys.executable, '-c',
|
||||||
'import sys; from Cython.Compiler.Main import '
|
'import sys; from Cython.Compiler.Main import '
|
||||||
'setuptools_main as main; sys.exit(main())'] + flags +
|
'setuptools_main as main; sys.exit(main())'] + flags +
|
||||||
["-o", tofile, fromfile])
|
['-o', tofile, fromfile])
|
||||||
if r != 0:
|
if r != 0:
|
||||||
raise Exception('Cython failed')
|
raise Exception('Cython failed')
|
||||||
except OSError:
|
except OSError:
|
||||||
raise OSError('Cython needs to be installed')
|
raise OSError('Cython needs to be installed')
|
||||||
|
|
||||||
def process_tempita_pyx(fromfile, tofile):
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
from Cython import Tempita as tempita
|
|
||||||
except ImportError:
|
|
||||||
import tempita
|
|
||||||
except ImportError:
|
|
||||||
raise Exception('Building %s requires Tempita: '
|
|
||||||
'pip install --user Tempita' % VENDOR)
|
|
||||||
with open(fromfile, "r") as f:
|
|
||||||
tmpl = f.read()
|
|
||||||
pyxcontent = tempita.sub(tmpl)
|
|
||||||
assert fromfile.endswith('.pyx.in')
|
|
||||||
pyxfile = fromfile[:-len('.pyx.in')] + '.pyx'
|
|
||||||
with open(pyxfile, "w") as f:
|
|
||||||
f.write(pyxcontent)
|
|
||||||
process_pyx(pyxfile, tofile)
|
|
||||||
|
|
||||||
rules = {
|
|
||||||
# fromext : function
|
|
||||||
'.pyx' : process_pyx,
|
|
||||||
'.pyx.in' : process_tempita_pyx
|
|
||||||
}
|
|
||||||
#
|
|
||||||
# Hash db
|
|
||||||
#
|
|
||||||
def load_hashes(filename):
|
|
||||||
# Return { filename : (sha1 of input, sha1 of output) }
|
|
||||||
if os.path.isfile(filename):
|
|
||||||
hashes = {}
|
|
||||||
with open(filename, 'r') as f:
|
|
||||||
for line in f:
|
|
||||||
filename, inhash, outhash = line.split()
|
|
||||||
hashes[filename] = (inhash, outhash)
|
|
||||||
else:
|
|
||||||
hashes = {}
|
|
||||||
return hashes
|
|
||||||
|
|
||||||
def save_hashes(hash_db, filename):
|
|
||||||
with open(filename, 'w') as f:
|
|
||||||
for key, value in sorted(hash_db.items()):
|
|
||||||
f.write("%s %s %s\n" % (key, value[0], value[1]))
|
|
||||||
|
|
||||||
def sha1_of_file(filename):
|
|
||||||
h = hashlib.sha1()
|
|
||||||
with open(filename, "rb") as f:
|
|
||||||
h.update(f.read())
|
|
||||||
return h.hexdigest()
|
|
||||||
|
|
||||||
#
|
|
||||||
# Main program
|
|
||||||
#
|
|
||||||
|
|
||||||
def normpath(path):
|
|
||||||
path = path.replace(os.sep, '/')
|
|
||||||
if path.startswith('./'):
|
|
||||||
path = path[2:]
|
|
||||||
return path
|
|
||||||
|
|
||||||
def get_hash(frompath, topath):
|
|
||||||
from_hash = sha1_of_file(frompath)
|
|
||||||
to_hash = sha1_of_file(topath) if os.path.exists(topath) else None
|
|
||||||
return (from_hash, to_hash)
|
|
||||||
|
|
||||||
def process(path, fromfile, tofile, processor_function, hash_db):
|
|
||||||
fullfrompath = os.path.join(path, fromfile)
|
|
||||||
fulltopath = os.path.join(path, tofile)
|
|
||||||
current_hash = get_hash(fullfrompath, fulltopath)
|
|
||||||
if current_hash == hash_db.get(normpath(fullfrompath), None):
|
|
||||||
print('%s has not changed' % fullfrompath)
|
|
||||||
return
|
|
||||||
|
|
||||||
|
def preserve_cwd(path, func, *args):
|
||||||
orig_cwd = os.getcwd()
|
orig_cwd = os.getcwd()
|
||||||
try:
|
try:
|
||||||
os.chdir(path)
|
os.chdir(path)
|
||||||
print('Processing %s' % fullfrompath)
|
func(*args)
|
||||||
processor_function(fromfile, tofile)
|
|
||||||
finally:
|
finally:
|
||||||
os.chdir(orig_cwd)
|
os.chdir(orig_cwd)
|
||||||
# changed target file, recompute hash
|
|
||||||
current_hash = get_hash(fullfrompath, fulltopath)
|
|
||||||
# store hash in db
|
|
||||||
hash_db[normpath(fullfrompath)] = current_hash
|
|
||||||
|
|
||||||
|
|
||||||
def find_process_files(root_dir):
|
def load_hashes(filename):
|
||||||
hash_db = load_hashes(HASH_FILE)
|
|
||||||
for cur_dir, dirs, files in os.walk(root_dir):
|
|
||||||
for filename in files:
|
|
||||||
in_file = os.path.join(cur_dir, filename + ".in")
|
|
||||||
if filename.endswith('.pyx') and os.path.isfile(in_file):
|
|
||||||
continue
|
|
||||||
for fromext, function in rules.items():
|
|
||||||
if filename.endswith(fromext):
|
|
||||||
toext = ".cpp"
|
|
||||||
# with open(os.path.join(cur_dir, filename), 'rb') as f:
|
|
||||||
# data = f.read()
|
|
||||||
# m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M)
|
|
||||||
# if m:
|
|
||||||
# toext = ".cxx"
|
|
||||||
fromfile = filename
|
|
||||||
tofile = filename[:-len(fromext)] + toext
|
|
||||||
process(cur_dir, fromfile, tofile, function, hash_db)
|
|
||||||
save_hashes(hash_db, HASH_FILE)
|
|
||||||
|
|
||||||
def main():
|
|
||||||
try:
|
try:
|
||||||
root_dir = sys.argv[1]
|
return json.load(open(filename))
|
||||||
except IndexError:
|
except (ValueError, IOError):
|
||||||
root_dir = DEFAULT_ROOT
|
return {}
|
||||||
find_process_files(root_dir)
|
|
||||||
|
|
||||||
|
def save_hashes(hash_db, filename):
|
||||||
|
json.dump(hash_db, open(filename, 'wb'))
|
||||||
|
|
||||||
|
|
||||||
|
def get_hash(path):
|
||||||
|
return hashlib.md5(open(path, 'rb').read()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def hash_changed(base, path, db):
|
||||||
|
full_path = os.path.normpath(os.path.join(base, path))
|
||||||
|
return not get_hash(full_path) == db.get(full_path)
|
||||||
|
|
||||||
|
|
||||||
|
def hash_add(base, path, db):
|
||||||
|
full_path = os.path.normpath(os.path.join(base, path))
|
||||||
|
db[full_path] = get_hash(full_path)
|
||||||
|
|
||||||
|
|
||||||
|
def process(base, filename, db):
|
||||||
|
root, ext = os.path.splitext(filename)
|
||||||
|
if ext in ['.pyx', '.cpp']:
|
||||||
|
if hash_changed(base, filename, db):
|
||||||
|
preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp')
|
||||||
|
hash_add(base, root + '.cpp', db)
|
||||||
|
hash_add(base, root + '.pyx', db)
|
||||||
|
|
||||||
|
|
||||||
|
def check_changes(root, db):
|
||||||
|
res = False
|
||||||
|
new_db = {}
|
||||||
|
|
||||||
|
setup_filename = 'setup.py'
|
||||||
|
hash_add('.', setup_filename, new_db)
|
||||||
|
if hash_changed('.', setup_filename, db):
|
||||||
|
res = True
|
||||||
|
|
||||||
|
for base, _, files in os.walk(root):
|
||||||
|
for filename in files:
|
||||||
|
if filename.endswith('.pxd'):
|
||||||
|
hash_add(base, filename, new_db)
|
||||||
|
if hash_changed(base, filename, db):
|
||||||
|
res = True
|
||||||
|
|
||||||
|
if res:
|
||||||
|
db.clear()
|
||||||
|
db.update(new_db)
|
||||||
|
return res
|
||||||
|
|
||||||
|
|
||||||
|
def run(root):
|
||||||
|
db = load_hashes(HASH_FILE)
|
||||||
|
|
||||||
|
try:
|
||||||
|
check_changes(root, db)
|
||||||
|
for base, _, files in os.walk(root):
|
||||||
|
for filename in files:
|
||||||
|
process(base, filename, db)
|
||||||
|
finally:
|
||||||
|
save_hashes(db, HASH_FILE)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
main()
|
parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed')
|
||||||
|
parser.add_argument('root', help='root directory')
|
||||||
|
args = parser.parse_args()
|
||||||
|
run(args.root)
|
||||||
|
|
Loading…
Reference in New Issue
Block a user