mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-31 16:07:41 +03:00 
			
		
		
		
	Merge branch 'master' of ssh://github.com/honnibal/spaCy
This commit is contained in:
		
						commit
						198f9aa034
					
				
							
								
								
									
										235
									
								
								bin/cythonize.py
									
									
									
									
									
								
							
							
						
						
									
										235
									
								
								bin/cythonize.py
									
									
									
									
									
								
							|  | @ -1,62 +1,50 @@ | |||
| #!/usr/bin/env python | ||||
| """ cythonize | ||||
| """ cythonize.py | ||||
| 
 | ||||
| Cythonize pyx files into C files as needed. | ||||
| Cythonize pyx files into C++ files as needed. | ||||
| 
 | ||||
| Usage: cythonize [root_dir] | ||||
| 
 | ||||
| Default [root_dir] is 'spacy'. | ||||
| Usage: cythonize.py [root] | ||||
| 
 | ||||
| Checks pyx files to see if they have been changed relative to their | ||||
| corresponding C files.  If they have, then runs cython on these files to | ||||
| recreate the C files. | ||||
| corresponding C++ files. If they have, then runs cython on these files to | ||||
| recreate the C++ files. | ||||
| 
 | ||||
| The script thinks that the pyx files have changed relative to the C files | ||||
| by comparing hashes stored in a database file. | ||||
| Additionally, checks pxd files and setup.py if they have been changed. If | ||||
| they have, rebuilds everything. | ||||
| 
 | ||||
| Simple script to invoke Cython (and Tempita) on all .pyx (.pyx.in) | ||||
| files; while waiting for a proper build system. Uses file hashes to | ||||
| figure out if rebuild is needed. | ||||
| Change detection based on file hashes stored in JSON format. | ||||
| 
 | ||||
| For now, this script should be run by developers when changing Cython files | ||||
| only, and the resulting C files checked in, so that end-users (and Python-only | ||||
| developers) do not get the Cython/Tempita dependencies. | ||||
| and the resulting C++ files checked in, so that end-users (and Python-only | ||||
| developers) do not get the Cython dependencies. | ||||
| 
 | ||||
| Originally written by Dag Sverre Seljebotn, and copied here from: | ||||
| Based upon: | ||||
| 
 | ||||
| https://raw.github.com/dagss/private-scipy-refactor/cythonize/cythonize.py | ||||
| https://raw.githubusercontent.com/numpy/numpy/master/tools/cythonize.py | ||||
| 
 | ||||
| Note: this script does not check any of the dependent C libraries; it only | ||||
| operates on the Cython .pyx files. | ||||
| Note: this script does not check any of the dependent C++ libraries. | ||||
| """ | ||||
| 
 | ||||
| from __future__ import division, print_function, absolute_import | ||||
| from __future__ import print_function | ||||
| 
 | ||||
| import os | ||||
| import re | ||||
| import sys | ||||
| import json | ||||
| import hashlib | ||||
| import subprocess | ||||
| import argparse | ||||
| 
 | ||||
| HASH_FILE = 'cythonize.dat' | ||||
| DEFAULT_ROOT = 'spacy' | ||||
| VENDOR = 'spaCy' | ||||
| 
 | ||||
| # WindowsError is not defined on unix systems | ||||
| try: | ||||
|     WindowsError | ||||
| except NameError: | ||||
|     WindowsError = None | ||||
| HASH_FILE = 'cythonize.json' | ||||
| 
 | ||||
| 
 | ||||
| # | ||||
| # Rules | ||||
| # | ||||
| def process_pyx(fromfile, tofile): | ||||
|     print('Processing %s' % fromfile) | ||||
|     try: | ||||
|         from Cython.Compiler.Version import version as cython_version | ||||
|         from distutils.version import LooseVersion | ||||
|         if LooseVersion(cython_version) < LooseVersion('0.19'): | ||||
|             raise Exception('Building %s requires Cython >= 0.19' % VENDOR) | ||||
|             raise Exception('Require Cython >= 0.19') | ||||
| 
 | ||||
|     except ImportError: | ||||
|         pass | ||||
|  | @ -67,133 +55,102 @@ def process_pyx(fromfile, tofile): | |||
| 
 | ||||
|     try: | ||||
|         try: | ||||
|             r = subprocess.call(['cython'] + flags + ["-o", tofile, fromfile]) | ||||
|             r = subprocess.call(['cython'] + flags + ['-o', tofile, fromfile]) | ||||
|             if r != 0: | ||||
|                 raise Exception('Cython failed') | ||||
|         except OSError: | ||||
|             # There are ways of installing Cython that don't result in a cython | ||||
|             # executable on the path, see gh-2397. | ||||
|             r = subprocess.call([sys.executable, '-c', | ||||
|                                  'import sys; from Cython.Compiler.Main import ' | ||||
|                                  'setuptools_main as main; sys.exit(main())'] + flags + | ||||
|                                  ["-o", tofile, fromfile]) | ||||
|                                 'import sys; from Cython.Compiler.Main import ' | ||||
|                                 'setuptools_main as main; sys.exit(main())'] + flags + | ||||
|                                 ['-o', tofile, fromfile]) | ||||
|             if r != 0: | ||||
|                 raise Exception('Cython failed') | ||||
|     except OSError: | ||||
|         raise OSError('Cython needs to be installed') | ||||
| 
 | ||||
| def process_tempita_pyx(fromfile, tofile): | ||||
|     try: | ||||
|         try: | ||||
|             from Cython import Tempita as tempita | ||||
|         except ImportError: | ||||
|             import tempita | ||||
|     except ImportError: | ||||
|         raise Exception('Building %s requires Tempita: ' | ||||
|                         'pip install --user Tempita' % VENDOR) | ||||
|     with open(fromfile, "r") as f: | ||||
|         tmpl = f.read() | ||||
|     pyxcontent = tempita.sub(tmpl) | ||||
|     assert fromfile.endswith('.pyx.in') | ||||
|     pyxfile = fromfile[:-len('.pyx.in')] + '.pyx' | ||||
|     with open(pyxfile, "w") as f: | ||||
|         f.write(pyxcontent) | ||||
|     process_pyx(pyxfile, tofile) | ||||
| 
 | ||||
| rules = { | ||||
|     # fromext : function | ||||
|     '.pyx' : process_pyx, | ||||
|     '.pyx.in' : process_tempita_pyx | ||||
|     } | ||||
| # | ||||
| # Hash db | ||||
| # | ||||
| def load_hashes(filename): | ||||
|     # Return { filename : (sha1 of input, sha1 of output) } | ||||
|     if os.path.isfile(filename): | ||||
|         hashes = {} | ||||
|         with open(filename, 'r') as f: | ||||
|             for line in f: | ||||
|                 filename, inhash, outhash = line.split() | ||||
|                 hashes[filename] = (inhash, outhash) | ||||
|     else: | ||||
|         hashes = {} | ||||
|     return hashes | ||||
| 
 | ||||
| def save_hashes(hash_db, filename): | ||||
|     with open(filename, 'w') as f: | ||||
|         for key, value in sorted(hash_db.items()): | ||||
|             f.write("%s %s %s\n" % (key, value[0], value[1])) | ||||
| 
 | ||||
| def sha1_of_file(filename): | ||||
|     h = hashlib.sha1() | ||||
|     with open(filename, "rb") as f: | ||||
|         h.update(f.read()) | ||||
|     return h.hexdigest() | ||||
| 
 | ||||
| # | ||||
| # Main program | ||||
| # | ||||
| 
 | ||||
| def normpath(path): | ||||
|     path = path.replace(os.sep, '/') | ||||
|     if path.startswith('./'): | ||||
|         path = path[2:] | ||||
|     return path | ||||
| 
 | ||||
| def get_hash(frompath, topath): | ||||
|     from_hash = sha1_of_file(frompath) | ||||
|     to_hash = sha1_of_file(topath) if os.path.exists(topath) else None | ||||
|     return (from_hash, to_hash) | ||||
| 
 | ||||
| def process(path, fromfile, tofile, processor_function, hash_db): | ||||
|     fullfrompath = os.path.join(path, fromfile) | ||||
|     fulltopath = os.path.join(path, tofile) | ||||
|     current_hash = get_hash(fullfrompath, fulltopath) | ||||
|     if current_hash == hash_db.get(normpath(fullfrompath), None): | ||||
|         print('%s has not changed' % fullfrompath) | ||||
|         return | ||||
| 
 | ||||
| def preserve_cwd(path, func, *args): | ||||
|     orig_cwd = os.getcwd() | ||||
|     try: | ||||
|         os.chdir(path) | ||||
|         print('Processing %s' % fullfrompath) | ||||
|         processor_function(fromfile, tofile) | ||||
|         func(*args) | ||||
|     finally: | ||||
|         os.chdir(orig_cwd) | ||||
|     # changed target file, recompute hash | ||||
|     current_hash = get_hash(fullfrompath, fulltopath) | ||||
|     # store hash in db | ||||
|     hash_db[normpath(fullfrompath)] = current_hash | ||||
| 
 | ||||
| 
 | ||||
| def find_process_files(root_dir): | ||||
|     hash_db = load_hashes(HASH_FILE) | ||||
|     for cur_dir, dirs, files in os.walk(root_dir): | ||||
|         for filename in files: | ||||
|             in_file = os.path.join(cur_dir, filename + ".in") | ||||
|             if filename.endswith('.pyx') and os.path.isfile(in_file): | ||||
|                 continue | ||||
|             for fromext, function in rules.items(): | ||||
|                 if filename.endswith(fromext): | ||||
|                     toext = ".cpp" | ||||
|                     # with open(os.path.join(cur_dir, filename), 'rb') as f: | ||||
|                     #     data = f.read() | ||||
|                     #     m = re.search(br"^\s*#\s*distutils:\s*language\s*=\s*c\+\+\s*$", data, re.I|re.M) | ||||
|                     #     if m: | ||||
|                     #         toext = ".cxx" | ||||
|                     fromfile = filename | ||||
|                     tofile = filename[:-len(fromext)] + toext | ||||
|                     process(cur_dir, fromfile, tofile, function, hash_db) | ||||
|                     save_hashes(hash_db, HASH_FILE) | ||||
| 
 | ||||
| def main(): | ||||
| def load_hashes(filename): | ||||
|     try: | ||||
|         root_dir = sys.argv[1] | ||||
|     except IndexError: | ||||
|         root_dir = DEFAULT_ROOT | ||||
|     find_process_files(root_dir) | ||||
|         return json.load(open(filename)) | ||||
|     except (ValueError, IOError): | ||||
|         return {} | ||||
| 
 | ||||
| 
 | ||||
| def save_hashes(hash_db, filename): | ||||
|     with open(filename, 'w') as f: | ||||
|         f.write(json.dumps(hash_db)) | ||||
| 
 | ||||
| 
 | ||||
| def get_hash(path): | ||||
|     return hashlib.md5(open(path, 'rb').read()).hexdigest() | ||||
| 
 | ||||
| 
 | ||||
| def hash_changed(base, path, db): | ||||
|     full_path = os.path.normpath(os.path.join(base, path)) | ||||
|     return not get_hash(full_path) == db.get(full_path) | ||||
| 
 | ||||
| 
 | ||||
| def hash_add(base, path, db): | ||||
|     full_path = os.path.normpath(os.path.join(base, path)) | ||||
|     db[full_path] = get_hash(full_path) | ||||
| 
 | ||||
| 
 | ||||
| def process(base, filename, db): | ||||
|     root, ext = os.path.splitext(filename) | ||||
|     if ext in ['.pyx', '.cpp']: | ||||
|         if hash_changed(base, filename, db) or not os.path.isfile(os.path.join(base, root + '.cpp')): | ||||
|             preserve_cwd(base, process_pyx, root + '.pyx', root + '.cpp') | ||||
|             hash_add(base, root + '.cpp', db) | ||||
|             hash_add(base, root + '.pyx', db) | ||||
| 
 | ||||
| 
 | ||||
| def check_changes(root, db): | ||||
|     res = False | ||||
|     new_db = {} | ||||
| 
 | ||||
|     setup_filename = 'setup.py' | ||||
|     hash_add('.', setup_filename, new_db) | ||||
|     if hash_changed('.', setup_filename, db): | ||||
|         res = True | ||||
| 
 | ||||
|     for base, _, files in os.walk(root): | ||||
|         for filename in files: | ||||
|             if filename.endswith('.pxd'): | ||||
|                 hash_add(base, filename, new_db) | ||||
|                 if hash_changed(base, filename, db): | ||||
|                     res = True | ||||
| 
 | ||||
|     if res: | ||||
|         db.clear() | ||||
|         db.update(new_db) | ||||
|     return res | ||||
| 
 | ||||
| 
 | ||||
| def run(root): | ||||
|     db = load_hashes(HASH_FILE) | ||||
| 
 | ||||
|     try: | ||||
|         check_changes(root, db) | ||||
|         for base, _, files in os.walk(root): | ||||
|             for filename in files: | ||||
|                 process(base, filename, db) | ||||
|     finally: | ||||
|         save_hashes(db, HASH_FILE) | ||||
| 
 | ||||
| 
 | ||||
| if __name__ == '__main__': | ||||
|     main() | ||||
|     parser = argparse.ArgumentParser(description='Cythonize pyx files into C++ files as needed') | ||||
|     parser.add_argument('root', help='root directory') | ||||
|     args = parser.parse_args() | ||||
|     run(args.root) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user