Merge pull request #9951 from explosion/master

Update develop branch with master
2026-02-17 20:50:55 +03:00 · 2021-12-29 10:11:43 +01:00 · 2021-12-29 10:11:43 +01:00 · b8106e0f95
commit b8106e0f95
parent 7ec1452f5f 9d63dfacfc
6 changed files with 31 additions and 26 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,11 +1,8 @@
-recursive-include include *.h
 recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
 include LICENSE
 include README.md
 include pyproject.toml
 include spacy/py.typed
-recursive-exclude spacy/lang *.json
-recursive-include spacy/lang *.json.gz
-recursive-include spacy/cli *.json *.yml
+recursive-include spacy/cli *.yml
 recursive-include licenses *
 recursive-exclude spacy *.cpp
--- a/setup.cfg
+++ b/setup.cfg
@ -77,31 +77,33 @@ transformers =
 ray =
    spacy_ray>=0.1.0,<1.0.0
 cuda =
-    cupy>=5.0.0b4,<10.0.0
+    cupy>=5.0.0b4,<11.0.0
 cuda80 =
-    cupy-cuda80>=5.0.0b4,<10.0.0
+    cupy-cuda80>=5.0.0b4,<11.0.0
 cuda90 =
-    cupy-cuda90>=5.0.0b4,<10.0.0
+    cupy-cuda90>=5.0.0b4,<11.0.0
 cuda91 =
-    cupy-cuda91>=5.0.0b4,<10.0.0
+    cupy-cuda91>=5.0.0b4,<11.0.0
 cuda92 =
-    cupy-cuda92>=5.0.0b4,<10.0.0
+    cupy-cuda92>=5.0.0b4,<11.0.0
 cuda100 =
-    cupy-cuda100>=5.0.0b4,<10.0.0
+    cupy-cuda100>=5.0.0b4,<11.0.0
 cuda101 =
-    cupy-cuda101>=5.0.0b4,<10.0.0
+    cupy-cuda101>=5.0.0b4,<11.0.0
 cuda102 =
-    cupy-cuda102>=5.0.0b4,<10.0.0
+    cupy-cuda102>=5.0.0b4,<11.0.0
 cuda110 =
-    cupy-cuda110>=5.0.0b4,<10.0.0
+    cupy-cuda110>=5.0.0b4,<11.0.0
 cuda111 =
-    cupy-cuda111>=5.0.0b4,<10.0.0
+    cupy-cuda111>=5.0.0b4,<11.0.0
 cuda112 =
-    cupy-cuda112>=5.0.0b4,<10.0.0
+    cupy-cuda112>=5.0.0b4,<11.0.0
 cuda113 =
-    cupy-cuda113>=5.0.0b4,<10.0.0
+    cupy-cuda113>=5.0.0b4,<11.0.0
 cuda114 =
-    cupy-cuda114>=5.0.0b4,<10.0.0
+    cupy-cuda114>=5.0.0b4,<11.0.0
+cuda115 =
+    cupy-cuda115>=5.0.0b4,<11.0.0
 apple =
    thinc-apple-ops>=0.0.4,<1.0.0
 # Language tokenizers with external dependencies
--- a/spacy/ml/models/tok2vec.py
+++ b/spacy/ml/models/tok2vec.py
@ -123,7 +123,7 @@ def MultiHashEmbed(
    attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
    account some subword information, without constructing a fully character-based
    representation. If pretrained vectors are available, they can be included in
-    the representation as well, with the vectors table will be kept static
+    the representation as well, with the vectors table kept static
    (i.e. it's not updated).

    The `width` parameter specifies the output width of the layer and the widths
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -1,5 +1,5 @@
 cimport numpy as np
-from libc.stdint cimport uint32_t
+from libc.stdint cimport uint32_t, uint64_t
 from cython.operator cimport dereference as deref
 from libcpp.set cimport set as cppset
 from murmurhash.mrmr cimport hash128_x64
@ -353,12 +353,18 @@ cdef class Vectors:
        key (str): The string key.
        RETURNS: A list of the integer hashes.
        """
-        cdef uint32_t[4] out
+        # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
+        cdef uint64_t[2] out
        chars = s.encode("utf8")
        cdef char* utf8_string = chars
        hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
-        rows = [out[i] for i in range(min(self.hash_count, 4))]
-        return rows
+        rows = [
+            out[0] & 0xffffffffu,
+            out[0] >> 32,
+            out[1] & 0xffffffffu,
+            out[1] >> 32,
+        ]
+        return rows[:min(self.hash_count, 4)]

    def _get_ngrams(self, unicode key):
        """Get all padded ngram strings using the ngram settings.
--- a/website/UNIVERSE.md
+++ b/website/UNIVERSE.md
@ -44,7 +44,7 @@ markup is correct.
    "id": "unique-project-id",
    "title": "Project title",
    "slogan": "A short summary",
-    "description": "A longer description – *Mardown allowed!*",
+    "description": "A longer description – *Markdown allowed!*",
    "github": "user/repo",
    "pip": "package-name",
    "code_example": [
--- a/website/docs/api/architectures.md
+++ b/website/docs/api/architectures.md
@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`,
 `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
 subword information, without constructing a fully character-based
 representation. If pretrained vectors are available, they can be included in the
-representation as well, with the vectors table will be kept static (i.e. it's
+representation as well, with the vectors table kept static (i.e. it's
 not updated).

 | Name                     | Description                                                                                                                                                                                                                                                                                                                                                                                                                                        |
@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are
 mapped to a zero vector. See the documentation on
 [static vectors](/usage/embeddings-transformers#static-vectors) for details.

-| Name        |  Description                                                                                                                                                                                                            |
+| Name        | Description                                                                                                                                                                                                             |
 | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `nO`        | The output width of the layer, after the linear projection. ~~Optional[int]~~                                                                                                                                           |
 | `nM`        | The width of the static vectors. ~~Optional[int]~~                                                                                                                                                                      |
@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on
 Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
 of feature names to extract, which should refer to token attributes.

-| Name        |  Description                                                             |
+| Name        | Description                                                              |
 | ----------- | ------------------------------------------------------------------------ |
 | `columns`   | The token attributes to extract. ~~List[Union[int, str]]~~               |
 | **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |