2014-07-05 22:49:34 +04:00
|
|
|
#!/usr/bin/env python
|
2016-01-15 20:57:01 +03:00
|
|
|
from __future__ import print_function
|
2016-10-19 01:27:57 +03:00
|
|
|
import io
|
2015-12-13 13:49:17 +03:00
|
|
|
import os
|
|
|
|
import subprocess
|
2014-07-05 22:49:34 +04:00
|
|
|
import sys
|
2015-12-14 01:32:23 +03:00
|
|
|
import contextlib
|
2015-10-13 01:31:59 +03:00
|
|
|
from distutils.command.build_ext import build_ext
|
2015-12-13 13:49:17 +03:00
|
|
|
from distutils.sysconfig import get_python_inc
|
2018-12-04 02:06:42 +03:00
|
|
|
import distutils.util
|
2016-04-28 23:10:43 +03:00
|
|
|
from distutils import ccompiler, msvccompiler
|
2017-05-03 21:10:59 +03:00
|
|
|
from setuptools import Extension, setup, find_packages
|
2015-12-13 13:49:17 +03:00
|
|
|
|
|
|
|
|
2018-12-04 02:06:42 +03:00
|
|
|
def is_new_osx():
    """Check whether we're building on macOS >= 10.7.

    The version is parsed from ``distutils.util.get_platform()``, which this
    code expects to look like ``macosx-<major>.<minor>-<arch>``.

    Returns:
        bool: False when ``sys.platform`` is not "darwin" or the platform
        string cannot be parsed; otherwise True when the reported version is
        10.7 or later. Versions 11 and newer also count as "new"; the previous
        implementation only recognised the ``macosx-10`` prefix and returned
        False for them.
    """
    if sys.platform != "darwin":
        return False
    name = distutils.util.get_platform()
    if not name.startswith("macosx-"):
        return False
    version = name.split("-")[1].split(".")
    try:
        major = int(version[0])
        # A bare major version such as "11" carries no minor component.
        minor = int(version[1]) if len(version) > 1 else 0
    except ValueError:
        return False
    return (major, minor) >= (10, 7)
|
|
|
|
|
|
|
|
|
2018-12-19 15:54:02 +03:00
|
|
|
# Glob patterns handed to setup() as ``package_data``.
PACKAGE_DATA = {"": ["*.pyx", "*.pxd", "*.txt", "*.tokens", "*.json"]}

# Package list handed to setup() as ``packages``. Both objects may be extended
# further down when OpenMP is enabled on darwin.
PACKAGES = find_packages()
|
2015-12-13 13:49:17 +03:00
|
|
|
|
|
|
|
|
|
|
|
# Dotted names of the extension modules, relative to the "spacy" package.
# Each one is built from the matching .cpp file; the order is preserved.
_MOD_SUFFIXES = (
    "_align",
    "parts_of_speech",
    "strings",
    "lexeme",
    "vocab",
    "attrs",
    "kb",
    "morphology",
    "pipeline.pipes",
    "pipeline.morphologizer",
    "syntax.stateclass",
    "syntax._state",
    "tokenizer",
    "syntax.nn_parser",
    "syntax._parser_model",
    "syntax._beam_utils",
    "syntax.nonproj",
    "syntax.transition_system",
    "syntax.arc_eager",
    "gold",
    "tokens.doc",
    "tokens.span",
    "tokens.token",
    "tokens.morphanalysis",
    "tokens._retokenize",
    "matcher.matcher",
    "matcher.phrasematcher",
    "matcher.dependencymatcher",
    "syntax.ner",
    "symbols",
    "vectors",
)
MOD_NAMES = list("spacy." + suffix for suffix in _MOD_SUFFIXES)

# Extra compiler flags keyed by distutils compiler type; "other" is the
# fallback used for any compiler type that has no entry of its own.
_GCC_STYLE_FLAGS = ("-O2", "-Wno-strict-prototypes", "-Wno-unused-function")
COMPILE_OPTIONS = {
    "msvc": ["/Ox", "/EHsc"],
    # Separate list objects: the entries are appended to independently later.
    "mingw32": list(_GCC_STYLE_FLAGS),
    "other": list(_GCC_STYLE_FLAGS),
}

# Extra linker flags, same keys as COMPILE_OPTIONS, initially empty.
LINK_OPTIONS = dict((compiler_type, []) for compiler_type in COMPILE_OPTIONS)
|
2016-02-05 16:43:52 +03:00
|
|
|
|
2016-12-20 13:05:06 +03:00
|
|
|
|
2018-12-04 02:06:42 +03:00
|
|
|
if is_new_osx():
    # On Mac, build against libc++ because Apple deprecated use of libstdc++.
    COMPILE_OPTIONS["other"].extend(["-stdlib=libc++"])
    # g++ (used by the unix compiler on mac) links to libstdc++ as a default
    # lib, so link libc++ explicitly and switch the default libs off.
    # See: https://stackoverflow.com/questions/1653047/avoid-linking-to-libstdc
    LINK_OPTIONS["other"].extend(["-lc++", "-nodefaultlibs"])
|
2016-02-05 16:43:52 +03:00
|
|
|
|
2016-12-20 13:05:06 +03:00
|
|
|
|
2018-11-27 00:04:35 +03:00
|
|
|
# Fallback for the USE_OPENMP environment variable when it is not set.
if sys.platform == "darwin":
    USE_OPENMP_DEFAULT = None
else:
    USE_OPENMP_DEFAULT = "0"

# OpenMP is only switched on when USE_OPENMP is exactly "1".
if os.environ.get("USE_OPENMP", USE_OPENMP_DEFAULT) == "1":
    if sys.platform == "win32":
        COMPILE_OPTIONS["msvc"].append("/openmp")
    else:
        COMPILE_OPTIONS["other"].append("-fopenmp")
        LINK_OPTIONS["other"].append("-fopenmp")
        if sys.platform == "darwin":
            # Also ship the .dylib files from spacy/platform/darwin/lib.
            PACKAGE_DATA["spacy.platform.darwin.lib"] = ["*.dylib"]
            PACKAGES.append("spacy.platform.darwin.lib")
|
2015-01-06 04:34:55 +03:00
|
|
|
|
2017-05-07 19:36:35 +03:00
|
|
|
|
2016-04-19 20:50:42 +03:00
|
|
|
# The compiler that will actually be used is only known after
# finalize_options(), so the per-compiler flags are applied from a build_ext
# subclass, where it is available as self.compiler.
# http://stackoverflow.com/questions/724664/python-distutils-how-to-get-a-compiler-that-is-going-to-be-used
class build_ext_options:
    """Mixin that adds the compiler-specific flags to every extension."""

    def build_options(self):
        """Extend each extension's compile and link args in place.

        The flags are looked up in COMPILE_OPTIONS / LINK_OPTIONS by
        ``self.compiler.compiler_type``; unknown compiler types fall back to
        the "other" entry.
        """
        for ext in self.extensions:
            compile_flags = COMPILE_OPTIONS.get(
                self.compiler.compiler_type, COMPILE_OPTIONS["other"]
            )
            ext.extra_compile_args += compile_flags
        for ext in self.extensions:
            link_flags = LINK_OPTIONS.get(
                self.compiler.compiler_type, LINK_OPTIONS["other"]
            )
            ext.extra_link_args += link_flags
|
2015-01-17 08:19:54 +03:00
|
|
|
|
|
|
|
|
2015-12-13 13:49:17 +03:00
|
|
|
class build_ext_subclass(build_ext, build_ext_options):
    """build_ext command that applies the per-compiler flags before building."""

    def build_extensions(self):
        """Add the compiler-specific options, then run the stock build."""
        # build_options is inherited from the build_ext_options mixin.
        self.build_options()
        build_ext.build_extensions(self)
|
2015-01-17 08:19:54 +03:00
|
|
|
|
|
|
|
|
2015-12-14 01:32:23 +03:00
|
|
|
def generate_cython(root, source):
    """Run ``<root>/bin/cythonize.py`` on ``source`` with the current interpreter.

    Args:
        root: Directory that contains the ``bin/cythonize.py`` script.
        source: Argument passed through to the script.

    Raises:
        RuntimeError: If the script exits with a non-zero status.
    """
    print("Cythonizing sources")
    script = os.path.join(root, "bin", "cythonize.py")
    command = [sys.executable, script, source]
    exit_code = subprocess.call(command, env=os.environ)
    if exit_code == 0:
        return
    raise RuntimeError("Running cythonize failed")
|
2015-12-13 13:49:17 +03:00
|
|
|
|
|
|
|
|
Reduce size of language data (#4141)
* Move Turkish lemmas to a json file
Rather than a large dict in Python source, the data is now a big json
file. This includes a method for loading the json file, falling back to
a compressed file, and an update to MANIFEST.in that excludes json in
the spacy/lang directory.
This focuses on Turkish specifically because it has the most language
data in core.
* Transition all lemmatizer.py files to json
This covers all lemmatizer.py files of a significant size (>500k or so).
Small files were left alone.
None of the affected files have logic, so this was pretty
straightforward.
One unusual thing is that the lemma data for Urdu doesn't seem to be
used anywhere. That may require further investigation.
* Move large lang data to json for fr/nb/nl/sv
These are the languages that use a lemmatizer directory (rather than a
single file) and are larger than English.
For most of these languages there were many language data files, in
which case only the large ones (>500k or so) were converted to json. It
may or may not be a good idea to migrate the remaining Python files to
json in the future.
* Fix id lemmas.json
The contents of this file were originally just copied from the Python
source, but that used single quotes, so it had to be properly converted
to json first.
* Add .json.gz to gitignore
This covers the json.gz files built as part of distribution.
* Add language data gzip to build process
Currently this gzips the data on every build; it works, but it should be
changed to only gzip when the source file has been updated.
* Remove Danish lemmatizer.py
Missed this when I added the json.
* Update to match latest explosion/srsly#9
The way gzipped json is loaded/saved in srsly changed a bit.
* Only compress language data if necessary
If a .json.gz file exists and is newer than the corresponding json file,
it's not recompressed.
* Move en/el language data to json
This only affected files >500kb, which was nouns for both languages and
the generic lookup table for English.
* Remove empty files in Norwegian tokenizer
It's unclear why, but the Norwegian (nb) tokenizer had empty files for
adj/adv/noun/verb lemmas. This may have been a result of copying the
structure of the English lemmatizer.
This removed the files, but still creates the empty sets in the
lemmatizer. That may not actually be necessary.
* Remove dubious entries in English lookup.json
" furthest" and " skilled" - both prefixed with a space - were in the
English lookup table. That seems obviously wrong so I have removed them.
* Fix small issues with en/fr lemmatizers
The en tokenizer was including the removed _nouns.py file, so that's
removed.
The fr tokenizer is unusual in that it has a lemmatizer directory with
both __init__.py and lemmatizer.py. lemmatizer.py had not been converted
to load the json language data, so that was fixed.
* Auto-format
* Auto-format
* Update srsly pin
* Consistently use pathlib paths
2019-08-20 15:54:11 +03:00
|
|
|
def gzip_language_data(root, source):
    """Write a ``.json.gz`` copy next to every ``.json`` file under ``root/source``.

    A file is skipped when its ``.json.gz`` already exists and has a newer
    modification time than the ``.json`` it was made from.

    Args:
        root: Base directory.
        source: Path below ``root`` that is searched recursively.
    """
    print("Compressing language data")
    import srsly
    from pathlib import Path

    search_dir = Path(root) / source
    for json_path in search_dir.glob("**/*.json"):
        gz_path = json_path.with_suffix(json_path.suffix + ".gz")
        up_to_date = (
            gz_path.is_file()
            and gz_path.stat().st_mtime > json_path.stat().st_mtime
        )
        if not up_to_date:
            srsly.write_gzip_json(gz_path, srsly.read_json(json_path))
            print("Compressed {}".format(json_path))
        else:
            # The gz is newer than its source, so it doesn't need updating.
            print("Skipping {}, already compressed".format(json_path))
|
|
|
|
|
|
|
|
|
2015-12-14 01:32:23 +03:00
|
|
|
def is_source_release(path):
    """Return True if ``path`` contains an entry named ``PKG-INFO``.

    NOTE(review): presumably PKG-INFO marks an unpacked sdist; confirm.
    """
    marker = os.path.join(path, "PKG-INFO")
    return os.path.exists(marker)
|
2015-12-14 01:32:23 +03:00
|
|
|
|
|
|
|
|
|
|
|
def clean(path):
    """Delete the generated files of every module in MOD_NAMES under ``path``.

    For each module, the ``.so``, ``.html``, ``.cpp`` and ``.c`` files that
    exist at the module's location are removed; missing files are ignored.
    """
    suffixes = (".so", ".html", ".cpp", ".c")
    for mod_name in MOD_NAMES:
        stem = os.path.join(path, mod_name.replace(".", "/"))
        for suffix in suffixes:
            candidate = stem + suffix
            if os.path.exists(candidate):
                os.unlink(candidate)
|
2015-01-25 06:49:10 +03:00
|
|
|
|
|
|
|
|
2015-12-14 01:32:23 +03:00
|
|
|
@contextlib.contextmanager
def chdir(new_dir):
    """Temporarily switch the working directory and prepend it to ``sys.path``.

    On exit (normal or via an exception) the ``sys.path`` entry is removed and
    the previous working directory is restored.

    Args:
        new_dir: Directory to change into.

    Raises:
        OSError: If ``new_dir`` cannot be entered. In that case nothing has
        been modified, so nothing is undone.
    """
    old_dir = os.getcwd()
    # Enter the directory before the try block: if this fails there is no
    # sys.path entry of ours to remove. The previous version ran
    # `del sys.path[0]` regardless and dropped an unrelated entry.
    os.chdir(new_dir)
    sys.path.insert(0, new_dir)
    try:
        yield
    finally:
        # Remove our own entry by value; the body may have prepended others.
        try:
            sys.path.remove(new_dir)
        except ValueError:
            pass
        os.chdir(old_dir)
|
|
|
|
|
|
|
|
|
|
|
|
def setup_package():
    """Assemble the build configuration and call ``setup()``.

    When the first command-line argument is ``clean``, only the generated
    files are removed (see ``clean``) and nothing is built. Otherwise, from
    inside the directory containing this file:

    * package metadata is read by executing ``spacy/about.py``;
    * ``README.md`` is used as the long description;
    * one C++ ``Extension`` is declared per entry in ``MOD_NAMES``;
    * unless this is a source release, the Cython sources are regenerated and
      the language data is gzipped;
    * ``setup()`` is invoked.
    """
    root = os.path.abspath(os.path.dirname(__file__))

    if len(sys.argv) > 1 and sys.argv[1] == "clean":
        return clean(root)

    with chdir(root):
        # about.py is executed (not imported) to collect the metadata names.
        with io.open(os.path.join(root, "spacy", "about.py"), encoding="utf8") as f:
            about = {}
            exec(f.read(), about)

        with io.open(os.path.join(root, "README.md"), encoding="utf8") as f:
            readme = f.read()

        include_dirs = [
            get_python_inc(plat_specific=True),
            os.path.join(root, "include"),
        ]

        # MSVC build version 9 gets an additional include directory.
        if (
            ccompiler.new_compiler().compiler_type == "msvc"
            and msvccompiler.get_build_version() == 9
        ):
            include_dirs.append(os.path.join(root, "include", "msvc9"))

        ext_modules = []
        for mod_name in MOD_NAMES:
            mod_path = mod_name.replace(".", "/") + ".cpp"
            extra_link_args = []
            # Imported from patch from @mikepb, see Issue #267.
            # On darwin, add an rpath that leads from the extension's own
            # location to spacy/platform/darwin/lib: one ".." per dot in the
            # module name, relative to @loader_path.
            if sys.platform == "darwin":
                dylib_path = [".." for _ in range(mod_name.count("."))]
                dylib_path = "/".join(dylib_path)
                dylib_path = "@loader_path/%s/spacy/platform/darwin/lib" % dylib_path
                extra_link_args.append("-Wl,-rpath,%s" % dylib_path)
            ext_modules.append(
                Extension(
                    mod_name,
                    [mod_path],
                    language="c++",
                    include_dirs=include_dirs,
                    extra_link_args=extra_link_args,
                )
            )

        if not is_source_release(root):
            generate_cython(root, "spacy")
            gzip_language_data(root, "spacy/lang")

        setup(
            name="spacy",
            zip_safe=False,
            packages=PACKAGES,
            package_data=PACKAGE_DATA,
            description=about["__summary__"],
            long_description=readme,
            long_description_content_type="text/markdown",
            author=about["__author__"],
            author_email=about["__email__"],
            version=about["__version__"],
            url=about["__uri__"],
            license=about["__license__"],
            ext_modules=ext_modules,
            scripts=["bin/spacy"],
            install_requires=[
                "numpy>=1.15.0",
                "murmurhash>=0.28.0,<1.1.0",
                "cymem>=2.0.2,<2.1.0",
                "preshed>=3.0.0,<3.1.0",
                "thinc>=7.1.1,<7.2.0",
                "blis>=0.4.0,<0.5.0",
                "plac<1.0.0,>=0.9.6",
                "requests>=2.13.0,<3.0.0",
                "wasabi>=0.2.0,<1.1.0",
                "srsly>=0.1.0,<1.1.0",
                'pathlib==1.0.1; python_version < "3.4"',
            ],
            setup_requires=["wheel"],
            extras_require={
                "cuda": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy>=5.0.0b4"],
                "cuda80": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda80>=5.0.0b4"],
                "cuda90": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda90>=5.0.0b4"],
                "cuda91": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda91>=5.0.0b4"],
                "cuda92": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda92>=5.0.0b4"],
                "cuda100": ["thinc_gpu_ops>=0.0.1,<0.1.0", "cupy-cuda100>=5.0.0b4"],
                # Language tokenizers with external dependencies
                "ja": ["mecab-python3==0.7"],
                "ko": ["natto-py==0.9.0"],
                "th": ["pythainlp>=2.0"],
            },
            python_requires=">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*",
            classifiers=[
                "Development Status :: 5 - Production/Stable",
                "Environment :: Console",
                "Intended Audience :: Developers",
                "Intended Audience :: Science/Research",
                "License :: OSI Approved :: MIT License",
                "Operating System :: POSIX :: Linux",
                "Operating System :: MacOS :: MacOS X",
                "Operating System :: Microsoft :: Windows",
                "Programming Language :: Cython",
                "Programming Language :: Python :: 2",
                "Programming Language :: Python :: 2.7",
                "Programming Language :: Python :: 3",
                "Programming Language :: Python :: 3.4",
                "Programming Language :: Python :: 3.5",
                "Programming Language :: Python :: 3.6",
                "Programming Language :: Python :: 3.7",
                "Topic :: Scientific/Engineering",
            ],
            cmdclass={"build_ext": build_ext_subclass},
        )
|
2015-01-04 21:30:56 +03:00
|
|
|
|
|
|
|
|
2018-11-27 00:04:35 +03:00
|
|
|
# Run the build only when this file is executed as a script.
if __name__ == "__main__":
    setup_package()
|