diff --git a/MANIFEST.in b/MANIFEST.in index c1524d460..b7826e456 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,11 +1,8 @@ -recursive-include include *.h recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml include LICENSE include README.md include pyproject.toml include spacy/py.typed -recursive-exclude spacy/lang *.json -recursive-include spacy/lang *.json.gz -recursive-include spacy/cli *.json *.yml +recursive-include spacy/cli *.yml recursive-include licenses * recursive-exclude spacy *.cpp diff --git a/setup.cfg b/setup.cfg index 72f4b39da..50e982cbf 100644 --- a/setup.cfg +++ b/setup.cfg @@ -77,31 +77,33 @@ transformers = ray = spacy_ray>=0.1.0,<1.0.0 cuda = - cupy>=5.0.0b4,<10.0.0 + cupy>=5.0.0b4,<11.0.0 cuda80 = - cupy-cuda80>=5.0.0b4,<10.0.0 + cupy-cuda80>=5.0.0b4,<11.0.0 cuda90 = - cupy-cuda90>=5.0.0b4,<10.0.0 + cupy-cuda90>=5.0.0b4,<11.0.0 cuda91 = - cupy-cuda91>=5.0.0b4,<10.0.0 + cupy-cuda91>=5.0.0b4,<11.0.0 cuda92 = - cupy-cuda92>=5.0.0b4,<10.0.0 + cupy-cuda92>=5.0.0b4,<11.0.0 cuda100 = - cupy-cuda100>=5.0.0b4,<10.0.0 + cupy-cuda100>=5.0.0b4,<11.0.0 cuda101 = - cupy-cuda101>=5.0.0b4,<10.0.0 + cupy-cuda101>=5.0.0b4,<11.0.0 cuda102 = - cupy-cuda102>=5.0.0b4,<10.0.0 + cupy-cuda102>=5.0.0b4,<11.0.0 cuda110 = - cupy-cuda110>=5.0.0b4,<10.0.0 + cupy-cuda110>=5.0.0b4,<11.0.0 cuda111 = - cupy-cuda111>=5.0.0b4,<10.0.0 + cupy-cuda111>=5.0.0b4,<11.0.0 cuda112 = - cupy-cuda112>=5.0.0b4,<10.0.0 + cupy-cuda112>=5.0.0b4,<11.0.0 cuda113 = - cupy-cuda113>=5.0.0b4,<10.0.0 + cupy-cuda113>=5.0.0b4,<11.0.0 cuda114 = - cupy-cuda114>=5.0.0b4,<10.0.0 + cupy-cuda114>=5.0.0b4,<11.0.0 +cuda115 = + cupy-cuda115>=5.0.0b4,<11.0.0 apple = thinc-apple-ops>=0.0.4,<1.0.0 # Language tokenizers with external dependencies diff --git a/spacy/ml/models/tok2vec.py b/spacy/ml/models/tok2vec.py index 44ab50e85..ecdf6be27 100644 --- a/spacy/ml/models/tok2vec.py +++ b/spacy/ml/models/tok2vec.py @@ -123,7 +123,7 @@ def MultiHashEmbed( attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into account some subword information, without constructing a fully character-based representation. If pretrained vectors are available, they can be included in - the representation as well, with the vectors table will be kept static + the representation as well, with the vectors table kept static (i.e. it's not updated). The `width` parameter specifies the output width of the layer and the widths diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index 1b985a638..345e8df68 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -1,5 +1,5 @@ cimport numpy as np -from libc.stdint cimport uint32_t +from libc.stdint cimport uint32_t, uint64_t from cython.operator cimport dereference as deref from libcpp.set cimport set as cppset from murmurhash.mrmr cimport hash128_x64 @@ -353,12 +353,18 @@ cdef class Vectors: key (str): The string key. RETURNS: A list of the integer hashes. """ - cdef uint32_t[4] out + # MurmurHash3_x64_128 returns an array of 2 uint64_t values. + cdef uint64_t[2] out chars = s.encode("utf8") cdef char* utf8_string = chars hash128_x64(utf8_string, len(chars), self.hash_seed, &out) - rows = [out[i] for i in range(min(self.hash_count, 4))] - return rows + rows = [ + out[0] & 0xffffffffu, + out[0] >> 32, + out[1] & 0xffffffffu, + out[1] >> 32, + ] + return rows[:min(self.hash_count, 4)] def _get_ngrams(self, unicode key): """Get all padded ngram strings using the ngram settings. diff --git a/website/UNIVERSE.md b/website/UNIVERSE.md index d37c4561a..770bbde13 100644 --- a/website/UNIVERSE.md +++ b/website/UNIVERSE.md @@ -44,7 +44,7 @@ markup is correct. "id": "unique-project-id", "title": "Project title", "slogan": "A short summary", - "description": "A longer description – *Mardown allowed!*", + "description": "A longer description – *Markdown allowed!*", "github": "user/repo", "pip": "package-name", "code_example": [ diff --git a/website/docs/api/architectures.md b/website/docs/api/architectures.md index 44ba94d9e..07b76393f 100644 --- a/website/docs/api/architectures.md +++ b/website/docs/api/architectures.md @@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`, `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some subword information, without construction a fully character-based representation. If pretrained vectors are available, they can be included in the -representation as well, with the vectors table will be kept static (i.e. it's +representation as well, with the vectors table kept static (i.e. it's not updated). | Name | Description | @@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are mapped to a zero vector. See the documentation on [static vectors](/usage/embeddings-transformers#static-vectors) for details. -| Name |  Description | +| Name | Description | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ | | `nM` | The width of the static vectors. ~~Optional[int]~~ | @@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list of feature names to extract, which should refer to token attributes. -| Name |  Description | +| Name | Description | | ----------- | ------------------------------------------------------------------------ | | `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ | | **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |