From 2c37e0ccf6d8704bb242e63889be411f2a7b44aa Mon Sep 17 00:00:00 2001
From: Matthew Honnibal <honnibal+gh@gmail.com>
Date: Tue, 27 Nov 2018 00:44:04 +0100
Subject: [PATCH] =?UTF-8?q?=F0=9F=92=AB=20Use=20Blis=20for=20matrix=20mult?=
 =?UTF-8?q?iplications=20(#2966)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Our epic matrix multiplication odyssey is drawing to a close...

I've now finally got the Blis linear algebra routines in a self-contained Python package, with wheels for Windows, Linux and OSX. The only missing platform at the moment is Windows Python 2.7. The result is at https://github.com/explosion/cython-blis

Thinc v7.0.0 will make the change to Blis. I've put a Thinc v7.0.0.dev0 up on PyPI so that we can test these changes with the CI, and even get them out to spacy-nightly, before Thinc v7.0.0 is released. This PR also updates the other dependencies to be in line with the current versions master is using. I've also resolved the msgpack deprecation problems, and gotten spaCy and Thinc up to date with the latest Cython.

The point of switching to Blis is to have control of how our matrix multiplications are executed across platforms. When we were using numpy for this, the underlying library depended on how numpy was installed: pip and conda would each use a different library, OSX would use Accelerate, etc. This would open up different bugs and performance problems on each setup, especially when multi-threading was introduced.

With the change to Blis, we now strictly single-thread the matrix multiplications. This will make it much easier to use multiprocessing to parallelise the runtime, since we won't have nested parallelism problems to deal with.

* Use blis

* Use -2 arg to Cython

* Update dependencies

* Fix requirements

* Update setup dependencies

* Fix requirement typo

* Fix msgpack errors

* Remove Python27 test from Appveyor, until Blis works there

* Auto-format setup.py

* Fix murmurhash version
---
 .appveyor.yml                  |  5 +++--
 bin/cythonize.py               |  4 ++--
 requirements.txt               | 13 +++++++------
 setup.py                       | 12 +++++++-----
 spacy/pipeline.pyx             | 12 ++++++------
 spacy/syntax/_parser_model.pyx | 18 +++++++++++-------
 spacy/syntax/nn_parser.pyx     |  1 -
 spacy/util.py                  |  2 +-
 8 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/.appveyor.yml b/.appveyor.yml
index 80cbbee4f..237e7b10e 100644
--- a/.appveyor.yml
+++ b/.appveyor.yml
@@ -5,14 +5,15 @@ environment:
     # For Python versions available on Appveyor, see
     # http://www.appveyor.com/docs/installed-software#python
 
-    - PYTHON: "C:\\Python27-x64"
+      #- PYTHON: "C:\\Python27-x64"
     #- PYTHON: "C:\\Python34"
     #- PYTHON: "C:\\Python35"
     #- DISTUTILS_USE_SDK: "1"
     #- PYTHON: "C:\\Python34-x64"
     #- DISTUTILS_USE_SDK: "1"
-    #- PYTHON: "C:\\Python35-x64"
+    - PYTHON: "C:\\Python35-x64"
     - PYTHON: "C:\\Python36-x64"
+    - PYTHON: "C:\\Python37-x64"
 
 install:
   # We need wheel installed to build wheels
diff --git a/bin/cythonize.py b/bin/cythonize.py
index 47f3b23fc..fcc2922eb 100755
--- a/bin/cythonize.py
+++ b/bin/cythonize.py
@@ -38,7 +38,7 @@ import argparse
 HASH_FILE = 'cythonize.json'
 
 
-def process_pyx(fromfile, tofile):
+def process_pyx(fromfile, tofile, language_level='-2'):
     print('Processing %s' % fromfile)
     try:
         from Cython.Compiler.Version import version as cython_version
@@ -49,7 +49,7 @@ def process_pyx(fromfile, tofile):
     except ImportError:
         pass
 
-    flags = ['--fast-fail']
+    flags = ['--fast-fail', language_level]
     if tofile.endswith('.cpp'):
         flags += ['--cplus']
 
diff --git a/requirements.txt b/requirements.txt
index e71c2b547..cabd28c3b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,14 +1,15 @@
-cython>=0.24,<0.28.0
+cython>=0.25
 numpy>=1.15.0
-cymem>=1.30,<1.32
-preshed>=1.0.0,<2.0.0
-thinc>=6.11.2,<6.12.0
-murmurhash>=0.28,<0.29
+cymem>=2.0.2,<2.1.0
+preshed>=2.0.1,<2.1.0
+thinc==7.0.0.dev0
+blis>=0.2.2,<0.3.0
+murmurhash>=0.28.0,<1.1.0
 cytoolz>=0.9.0,<0.10.0
 plac<1.0.0,>=0.9.6
 ujson>=1.35
 dill>=0.2,<0.3
-regex==2017.4.5
+regex==2018.01.10
 requests>=2.13.0,<3.0.0
 pytest>=3.6.0,<4.0.0
 pytest-timeout>=1.3.0,<2.0.0
diff --git a/setup.py b/setup.py
index 34e270a44..0bf48f709 100755
--- a/setup.py
+++ b/setup.py
@@ -197,13 +197,14 @@ def setup_package():
             scripts=["bin/spacy"],
             install_requires=[
                 "numpy>=1.15.0",
-                "murmurhash>=0.28,<0.29",
-                "cymem>=1.30,<1.32",
-                "preshed>=1.0.0,<2.0.0",
-                "thinc>=6.11.2,<6.12.0",
+                "murmurhash>=0.28.0,<1.1.0",
+                "cymem>=2.0.2,<2.1.0",
+                "preshed>=2.0.1,<2.1.0",
+                "thinc==7.0.0.dev0",
+                "blis>=0.2.2,<0.3.0",
                 "plac<1.0.0,>=0.9.6",
                 "ujson>=1.35",
-                "regex==2017.4.5",
+                "regex==2018.01.10",
                 "dill>=0.2,<0.3",
                 "requests>=2.13.0,<3.0.0",
                 'pathlib==1.0.1; python_version < "3.4"',
@@ -214,6 +215,7 @@ def setup_package():
                 "cuda80": ["cupy-cuda80>=4.0"],
                 "cuda90": ["cupy-cuda90>=4.0"],
                 "cuda91": ["cupy-cuda91>=4.0"],
+                "cuda92": ["cupy-cuda92>=4.0"],
             },
             classifiers=[
                 "Development Status :: 5 - Production/Stable",
diff --git a/spacy/pipeline.pyx b/spacy/pipeline.pyx
index 2ce3bbb27..f7c4ec4e0 100644
--- a/spacy/pipeline.pyx
+++ b/spacy/pipeline.pyx
@@ -234,7 +234,7 @@ class EntityRuler(object):
         **kwargs: Other config paramters, mostly for consistency.
         RETURNS (EntityRuler): The loaded entity ruler.
         """
-        patterns = msgpack.loads(patterns_bytes)
+        patterns = msgpack.loads(patterns_bytes, raw=False)
         self.add_patterns(patterns)
         return self
 
@@ -243,7 +243,7 @@ class EntityRuler(object):
 
         RETURNS (bytes): The serialized patterns.
         """
-        return msgpack.dumps(self.patterns)
+        return msgpack.dumps(self.patterns, use_bin_type=True)
 
     def from_disk(self, path, **kwargs):
         """Load the entity ruler from a file. Expects a file containing
@@ -747,7 +747,7 @@ class Tagger(Pipe):
         serialize['cfg'] = lambda: ujson.dumps(self.cfg)
         tag_map = OrderedDict(sorted(self.vocab.morphology.tag_map.items()))
         serialize['tag_map'] = lambda: msgpack.dumps(
-            tag_map, use_bin_type=True, encoding='utf8')
+            tag_map, use_bin_type=True)
         return util.to_bytes(serialize, exclude)
 
     def from_bytes(self, bytes_data, **exclude):
@@ -765,7 +765,7 @@ class Tagger(Pipe):
             self.model.from_bytes(b)
 
         def load_tag_map(b):
-            tag_map = msgpack.loads(b, encoding='utf8')
+            tag_map = msgpack.loads(b, raw=False)
             self.vocab.morphology = Morphology(
                 self.vocab.strings, tag_map=tag_map,
                 lemmatizer=self.vocab.morphology.lemmatizer,
@@ -785,7 +785,7 @@ class Tagger(Pipe):
         serialize = OrderedDict((
             ('vocab', lambda p: self.vocab.to_disk(p)),
             ('tag_map', lambda p: p.open('wb').write(msgpack.dumps(
-                tag_map, use_bin_type=True, encoding='utf8'))),
+                tag_map, use_bin_type=True))),
             ('model', lambda p: p.open('wb').write(self.model.to_bytes())),
             ('cfg', lambda p: p.open('w').write(json_dumps(self.cfg)))
         ))
@@ -803,7 +803,7 @@ class Tagger(Pipe):
 
         def load_tag_map(p):
             with p.open('rb') as file_:
-                tag_map = msgpack.loads(file_.read(), encoding='utf8')
+                tag_map = msgpack.loads(file_.read(), raw=False)
             self.vocab.morphology = Morphology(
                 self.vocab.strings, tag_map=tag_map,
                 lemmatizer=self.vocab.morphology.lemmatizer,
diff --git a/spacy/syntax/_parser_model.pyx b/spacy/syntax/_parser_model.pyx
index 0ff001523..cfaa8ddf0 100644
--- a/spacy/syntax/_parser_model.pyx
+++ b/spacy/syntax/_parser_model.pyx
@@ -25,8 +25,7 @@ from thinc.misc import LayerNorm
 from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
-from thinc cimport openblas
-
+cimport blis.cy
 
 from .._ml import zero_init, PrecomputableAffine, Tok2Vec, flatten
 from .._ml import link_vectors_to_models, create_default_optimizer
@@ -107,10 +106,14 @@ cdef void predict_states(ActivationsC* A, StateC** states,
             which = Vec.arg_max(&A.unmaxed[index], n.pieces)
             A.hiddens[i*n.hiddens + j] = A.unmaxed[index + which]
     memset(A.scores, 0, n.states * n.classes * sizeof(float))
+    cdef double one = 1.0
     # Compute hidden-to-output
-    openblas.simple_gemm(A.scores, n.states, n.classes,
-        A.hiddens, n.states, n.hiddens,
-        W.hidden_weights, n.classes, n.hiddens, 0, 1)
+    blis.cy.gemm(blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
+        n.states, n.classes, n.hiddens, one,
+        <float*>A.hiddens, n.hiddens, 1,
+        <float*>W.hidden_weights, n.hiddens, 1,
+        one,
+        <float*>A.scores, n.classes, 1)
     # Add bias
     for i in range(n.states):
         VecVec.add_i(&A.scores[i*n.classes],
@@ -132,8 +135,9 @@ cdef void sum_state_features(float* output,
             else:
                 idx = token_ids[f] * id_stride + f*O
                 feature = &cached[idx]
-            openblas.simple_axpy(&output[b*O], O,
-                feature, one)
+            blis.cy.axpyv(blis.cy.NO_CONJUGATE, O, one,
+                <float*>feature, 1,
+                &output[b*O], 1)
         token_ids += F
 
 
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index f421520ce..82e87ae61 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -27,7 +27,6 @@ from thinc.misc import LayerNorm
 from thinc.neural.ops import CupyOps
 from thinc.neural.util import get_array_module
 from thinc.linalg cimport Vec, VecVec
-from thinc cimport openblas
 
 from ._parser_model cimport resize_activations, predict_states, arg_max_if_valid
 from ._parser_model cimport WeightsC, ActivationsC, SizesC, cpu_log_loss
diff --git a/spacy/util.py b/spacy/util.py
index d0d112c91..e83fd3a11 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -566,7 +566,7 @@ def to_bytes(getters, exclude):
     for key, getter in getters.items():
         if key not in exclude:
             serialized[key] = getter()
-    return msgpack.dumps(serialized, use_bin_type=True, encoding='utf8')
+    return msgpack.dumps(serialized, use_bin_type=True)
 
 
 def from_bytes(bytes_data, setters, exclude):