From 18e5638af04f0f8f4e61735f75715b6ca65bd4b2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 20 Dec 2021 15:48:35 +0100 Subject: [PATCH 1/5] Extend cupy to v10.x (#9911) * Add extra for `cupy-cuda115` --- setup.cfg | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/setup.cfg b/setup.cfg index 72f4b39da..50e982cbf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,31 +77,33 @@ transformers = ray = spacy_ray>=0.1.0,<1.0.0 cuda = - cupy>=5.0.0b4,<10.0.0 + cupy>=5.0.0b4,<11.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<10.0.0 + cupy-cuda80>=5.0.0b4,<11.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<10.0.0 + cupy-cuda90>=5.0.0b4,<11.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<10.0.0 + cupy-cuda91>=5.0.0b4,<11.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<10.0.0 + cupy-cuda92>=5.0.0b4,<11.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<10.0.0 + cupy-cuda100>=5.0.0b4,<11.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<10.0.0 + cupy-cuda101>=5.0.0b4,<11.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<10.0.0 + cupy-cuda102>=5.0.0b4,<11.0.0 cuda110 = - cupy-cuda110>=5.0.0b4,<10.0.0 + cupy-cuda110>=5.0.0b4,<11.0.0 cuda111 = - cupy-cuda111>=5.0.0b4,<10.0.0 + cupy-cuda111>=5.0.0b4,<11.0.0 cuda112 = - cupy-cuda112>=5.0.0b4,<10.0.0 + cupy-cuda112>=5.0.0b4,<11.0.0 cuda113 = - cupy-cuda113>=5.0.0b4,<10.0.0 + cupy-cuda113>=5.0.0b4,<11.0.0 cuda114 = - cupy-cuda114>=5.0.0b4,<10.0.0 + cupy-cuda114>=5.0.0b4,<11.0.0 +cuda115 = + cupy-cuda115>=5.0.0b4,<11.0.0 apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies From 11630737562118beeb575ba06bf440fb9b013bfb Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 20 Dec 2021 16:40:20 +0100 Subject: [PATCH 2/5] Remove outdated patterns MANIFEST.in (#9912) --- MANIFEST.in | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index c1524d460..b7826e456 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,8 @@ -recursive-include include *.h recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml include LICENSE include README.md include pyproject.toml include spacy/py.typed -recursive-exclude spacy/lang *.json -recursive-include spacy/lang *.json.gz -recursive-include spacy/cli *.json *.yml +recursive-include spacy/cli *.yml recursive-include licenses * recursive-exclude spacy *.cpp From 837d241b686a8fa71fb79a5cbdaf65c178554772 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 20 Dec 2021 17:11:31 +0100 Subject: [PATCH 3/5] Make floret murmurhash endian-neutral (#9735) --- spacy/vectors.pyx | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 1b985a638..345e8df68 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,5 +1,5 @@ cimport numpy as np -from libc.stdint cimport uint32_t +from libc.stdint cimport uint32_t, uint64_t from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset from murmurhash.mrmr cimport hash128_x64 @@ -353,12 +353,18 @@ cdef class Vectors: key (str): The string key. RETURNS: A list of the integer hashes. """ - cdef uint32_t[4] out + # MurmurHash3_x64_128 returns an array of 2 uint64_t values. + cdef uint64_t[2] out chars = s.encode("utf8") cdef char* utf8_string = chars hash128_x64(utf8_string, len(chars), self.hash_seed, &out) - rows = [out[i] for i in range(min(self.hash_count, 4))] - return rows + rows = [ + out[0] & 0xffffffffu, + out[0] >> 32, + out[1] & 0xffffffffu, + out[1] >> 32, + ] + return rows[:min(self.hash_count, 4)] def _get_ngrams(self, unicode key): """Get all padded ngram strings using the ngram settings. From 72abf9e1021df4300d0f776b80980442247d6a9a Mon Sep 17 00:00:00 2001 From: Peter Baumgartner <5107405+pmbaumgartner@users.noreply.github.com> Date: Mon, 27 Dec 2021 05:18:08 -0500 Subject: [PATCH 4/5] MultiHashEmbed vector docs correction (#9918) --- spacy/ml/models/tok2vec.py | 2 +- website/docs/api/architectures.md | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 44ab50e85..ecdf6be27 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -123,7 +123,7 @@ def MultiHashEmbed( attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into account some subword information, without constructing a fully character-based representation. If pretrained vectors are available, they can be included in - the representation as well, with the vectors table will be kept static + the representation as well, with the vectors table kept static (i.e. it's not updated). The `width` parameter specifies the output width of the layer and the widths diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 44ba94d9e..07b76393f 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the -representation as well, with the vectors table will be kept static (i.e. it's +representation as well, with the vectors table kept static (i.e. it's not updated). | Name | Description | @@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are mapped to a zero vector. See the documentation on [static vectors](/usage/embeddings-transformers#static-vectors) for details. -| Name |  Description | +| Name | Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ | | `nM` | The width of the static vectors. ~~Optional[int]~~ | @@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list of feature names to extract, which should refer to token attributes. -| Name |  Description | +| Name | Description | | ----------- | ------------------------------------------------------------------------ | | `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ | | **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ | From 9d63dfacfc85e7cd6db7190bd742dfe240205de5 Mon Sep 17 00:00:00 2001 From: Yoav Vollansky <4323333+yoavxyoav@users.noreply.github.com> Date: Mon, 27 Dec 2021 14:46:04 +0200 Subject: [PATCH 5/5] Update UNIVERSE.md (#9941) typo --- website/UNIVERSE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md index d37c4561a..770bbde13 100644 --- a/website/UNIVERSE.md +++ b/website/UNIVERSE.md @@ -44,7 +44,7 @@ markup is correct. "id": "unique-project-id", "title": "Project title", "slogan": "A short summary", - "description": "A longer description – *Mardown allowed!*", + "description": "A longer description – *Markdown allowed!*", "github": "user/repo", "pip": "package-name", "code_example": [