Merge pull request #9951 from explosion/master

Update develop branch with master
This commit is contained in:
Sofie Van Landeghem 2021-12-29 10:11:43 +01:00 committed by GitHub
commit b8106e0f95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 31 additions and 26 deletions

View File

@ -1,11 +1,8 @@
recursive-include include *.h
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE include LICENSE
include README.md include README.md
include pyproject.toml include pyproject.toml
include spacy/py.typed include spacy/py.typed
recursive-exclude spacy/lang *.json recursive-include spacy/cli *.yml
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include licenses * recursive-include licenses *
recursive-exclude spacy *.cpp recursive-exclude spacy *.cpp

View File

@ -77,31 +77,33 @@ transformers =
ray = ray =
spacy_ray>=0.1.0,<1.0.0 spacy_ray>=0.1.0,<1.0.0
cuda = cuda =
cupy>=5.0.0b4,<10.0.0 cupy>=5.0.0b4,<11.0.0
cuda80 = cuda80 =
cupy-cuda80>=5.0.0b4,<10.0.0 cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 = cuda90 =
cupy-cuda90>=5.0.0b4,<10.0.0 cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 = cuda91 =
cupy-cuda91>=5.0.0b4,<10.0.0 cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 = cuda92 =
cupy-cuda92>=5.0.0b4,<10.0.0 cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 = cuda100 =
cupy-cuda100>=5.0.0b4,<10.0.0 cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 = cuda101 =
cupy-cuda101>=5.0.0b4,<10.0.0 cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 = cuda102 =
cupy-cuda102>=5.0.0b4,<10.0.0 cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 = cuda110 =
cupy-cuda110>=5.0.0b4,<10.0.0 cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 = cuda111 =
cupy-cuda111>=5.0.0b4,<10.0.0 cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 = cuda112 =
cupy-cuda112>=5.0.0b4,<10.0.0 cupy-cuda112>=5.0.0b4,<11.0.0
cuda113 = cuda113 =
cupy-cuda113>=5.0.0b4,<10.0.0 cupy-cuda113>=5.0.0b4,<11.0.0
cuda114 = cuda114 =
cupy-cuda114>=5.0.0b4,<10.0.0 cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
apple = apple =
thinc-apple-ops>=0.0.4,<1.0.0 thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies # Language tokenizers with external dependencies

View File

@ -123,7 +123,7 @@ def MultiHashEmbed(
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
account some subword information, without constructing a fully character-based account some subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in representation. If pretrained vectors are available, they can be included in
the representation as well, with the vectors table will be kept static the representation as well, with the vectors table kept static
(i.e. it's not updated). (i.e. it's not updated).
The `width` parameter specifies the output width of the layer and the widths The `width` parameter specifies the output width of the layer and the widths

View File

@ -1,5 +1,5 @@
cimport numpy as np cimport numpy as np
from libc.stdint cimport uint32_t from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64 from murmurhash.mrmr cimport hash128_x64
@ -353,12 +353,18 @@ cdef class Vectors:
key (str): The string key. key (str): The string key.
RETURNS: A list of the integer hashes. RETURNS: A list of the integer hashes.
""" """
cdef uint32_t[4] out # MurmurHash3_x64_128 returns an array of 2 uint64_t values.
cdef uint64_t[2] out
chars = s.encode("utf8") chars = s.encode("utf8")
cdef char* utf8_string = chars cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out) hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))] rows = [
return rows out[0] & 0xffffffffu,
out[0] >> 32,
out[1] & 0xffffffffu,
out[1] >> 32,
]
return rows[:min(self.hash_count, 4)]
def _get_ngrams(self, unicode key): def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings. """Get all padded ngram strings using the ngram settings.

View File

@ -44,7 +44,7 @@ markup is correct.
"id": "unique-project-id", "id": "unique-project-id",
"title": "Project title", "title": "Project title",
"slogan": "A short summary", "slogan": "A short summary",
"description": "A longer description *Mardown allowed!*", "description": "A longer description *Markdown allowed!*",
"github": "user/repo", "github": "user/repo",
"pip": "package-name", "pip": "package-name",
"code_example": [ "code_example": [

View File

@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`,
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some `PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
subword information, without constructing a fully character-based subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in the representation. If pretrained vectors are available, they can be included in the
representation as well, with the vectors table will be kept static (i.e. it's representation as well, with the vectors table kept static (i.e. it's
not updated). not updated).
| Name | Description | | Name | Description |
@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are
mapped to a zero vector. See the documentation on mapped to a zero vector. See the documentation on
[static vectors](/usage/embeddings-transformers#static-vectors) for details. [static vectors](/usage/embeddings-transformers#static-vectors) for details.
| Name |  Description | | Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ | | `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ |
| `nM` | The width of the static vectors. ~~Optional[int]~~ | | `nM` | The width of the static vectors. ~~Optional[int]~~ |
@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes. of feature names to extract, which should refer to token attributes.
| Name |  Description | | Name | Description |
| ----------- | ------------------------------------------------------------------------ | | ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ | | `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ | | **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |