spaCy/setup.py

#!/usr/bin/env python
from __future__ import print_function
import io
import os
import subprocess
import sys
import contextlib
from distutils.command.build_ext import build_ext
from distutils.sysconfig import get_python_inc
import distutils.util
from distutils import ccompiler, msvccompiler
from setuptools import Extension, setup, find_packages


def is_new_osx():
    """Check whether we're on OSX >= 10.7"""
    name = distutils.util.get_platform()
    if sys.platform != "darwin":
        return False
    elif name.startswith("macosx-10"):
        minor_version = int(name.split("-")[1].split(".")[1])
        if minor_version >= 7:
            return True
        else:
            return False
    else:
        return False


PACKAGES = find_packages()


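# Cython modules that are compiled into C++ extensions (see ext_modules below).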
MOD_NAMES = [
    "spacy._align",
    "spacy.parts_of_speech",
    "spacy.strings",
    "spacy.lexeme",
    "spacy.vocab",
    "spacy.attrs",
    "spacy.kb",
    "spacy.morphology",
    "spacy.pipeline.pipes",
    "spacy.pipeline.morphologizer",
    "spacy.syntax.stateclass",
    "spacy.syntax._state",
    "spacy.tokenizer",
    "spacy.syntax.nn_parser",
    "spacy.syntax._parser_model",
    "spacy.syntax._beam_utils",
    "spacy.syntax.nonproj",
    "spacy.syntax.transition_system",
    "spacy.syntax.arc_eager",
    "spacy.gold",
    "spacy.tokens.doc",
    "spacy.tokens.span",
    "spacy.tokens.token",
    "spacy.tokens.morphanalysis",
    "spacy.tokens._retokenize",
    "spacy.matcher.matcher",
    "spacy.matcher.phrasematcher",
    "spacy.matcher.dependencymatcher",
    "spacy.syntax.ner",
    "spacy.symbols",
    "spacy.vectors",
]


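# Compiler- and linker-flag sets keyed by compiler type; build_ext_options.build_options()
# below picks the matching entry for each extension, falling back to "other".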
COMPILE_OPTIONS = {
    "msvc": ["/Ox", "/EHsc"],
    "mingw32": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
    "other": ["-O2", "-Wno-strict-prototypes", "-Wno-unused-function"],
}

LINK_OPTIONS = {"msvc": [], "mingw32": [], "other": []}


if is_new_osx():
    # On Mac, use libc++ because Apple deprecated use of libstdc++
    COMPILE_OPTIONS["other"].append("-stdlib=libc++")
    LINK_OPTIONS["other"].append("-lc++")
    # g++ (used by the unix compiler on Mac) links to libstdc++ as a default lib.
    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
    LINK_OPTIONS["other"].append("-nodefaultlibs")


# By overriding build_extensions we can inspect the compiler that will actually
# be used, which is only known after finalize_options has run.
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
class build_ext_options:
    def build_options(self):
        for e in self.extensions:
            e.extra_compile_args += COMPILE_OPTIONS.get(
                self.compiler.compiler_type, COMPILE_OPTIONS["other"]
            )
        for e in self.extensions:
            e.extra_link_args += LINK_OPTIONS.get(
                self.compiler.compiler_type, LINK_OPTIONS["other"]
            )


class build_ext_subclass(build_ext, build_ext_options):
    def build_extensions(self):
        build_ext_options.build_options(self)
        build_ext.build_extensions(self)


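# Regenerate the .cpp sources from the Cython .pyx files via bin/cythonize.py.
# This is only run when building from a git checkout, not from an sdist.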
def generate_cython(root, source):
    print("Cythonizing sources")
    p = subprocess.call(
        [sys.executable, os.path.join(root, "bin", "cythonize.py"), source],
        env=os.environ,
    )
    if p != 0:
        raise RuntimeError("Running cythonize failed")


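# Compress the JSON language data to .json.gz next to each source file,
# skipping files whose .gz is already newer than the .json.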
def gzip_language_data(root, source):
    print("Compressing language data")
    import srsly
    from pathlib import Path

    base = Path(root) / source
    for jsonfile in base.glob("**/*.json"):
        outfile = jsonfile.with_suffix(jsonfile.suffix + ".gz")
        if outfile.is_file() and outfile.stat().st_mtime > jsonfile.stat().st_mtime:
            # If the gz is newer it doesn't need updating
            print("Skipping {}, already compressed".format(jsonfile))
            continue
        data = srsly.read_json(jsonfile)
        srsly.write_gzip_json(outfile, data)
        print("Compressed {}".format(jsonfile))


def is_source_release(path):
    return os.path.exists(os.path.join(path, "PKG-INFO"))


def clean(path):
    for name in MOD_NAMES:
        name = name.replace(".", "/")
        for ext in [".so", ".html", ".cpp", ".c"]:
            file_path = os.path.join(path, name + ext)
            if os.path.exists(file_path):
                os.unlink(file_path)


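# Context manager: temporarily switch into new_dir and put it on sys.path.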
@contextlib.contextmanager
def chdir(new_dir):
    old_dir = os.getcwd()
    try:
        os.chdir(new_dir)
        sys.path.insert(0, new_dir)
        yield
    finally:
        del sys.path[0]
        os.chdir(old_dir)


def setup_package():
    root = os.path.abspath(os.path.dirname(__file__))

    if len(sys.argv) > 1 and sys.argv[1] == "clean":
        return clean(root)

    with chdir(root):
        with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
            about = {}
            exec(f.read(), about)

        include_dirs = [
            get_python_inc(plat_specific=True),
            os.path.join(root, "include"),
        ]

        if (
            ccompiler.new_compiler().compiler_type == "msvc"
            and msvccompiler.get_build_version() == 9
        ):
            include_dirs.append(os.path.join(root, "include", "msvc9"))

        ext_modules = []
        for mod_name in MOD_NAMES:
            mod_path = mod_name.replace(".", "/") + ".cpp"
            extra_link_args = []
            # ???
            # Imported from patch from @mikepb
            # See Issue #267. Running blind here...
            if sys.platform == "darwin":
                dylib_path = [".." for _ in range(mod_name.count("."))]
                dylib_path = "/".join(dylib_path)
                dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
                extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
            ext_modules.append(
                Extension(
                    mod_name,
                    [mod_path],
                    language="c++",
                    include_dirs=include_dirs,
                    extra_link_args=extra_link_args,
                )
            )

        if not is_source_release(root):
            generate_cython(root, "spacy")

        gzip_language_data(root, "spacy/lang")

        setup(
            name="spacy",
            packages=PACKAGES,
            version=about["__version__"],
            ext_modules=ext_modules,
            cmdclass={"build_ext": build_ext_subclass},
        )


if __name__ == "__main__":
    setup_package()
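

# Typical local invocations (a sketch; assumes Cython is installed so that
# generate_cython() can run when building from a git checkout):
#
#   python setup.py build_ext --inplace   # compile the extension modules in place
#   python setup.py clean                 # delete generated .cpp/.so/.html files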