Merge pull request #9951 from explosion/master

Update develop branch with master
This commit is contained in:
Sofie Van Landeghem 2021-12-29 10:11:43 +01:00 committed by GitHub
commit b8106e0f95
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 31 additions and 26 deletions

View File

@ -1,11 +1,8 @@
recursive-include include *.h
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
include LICENSE
include README.md
include pyproject.toml
include spacy/py.typed
recursive-exclude spacy/lang *.json
recursive-include spacy/lang *.json.gz
recursive-include spacy/cli *.json *.yml
recursive-include spacy/cli *.yml
recursive-include licenses *
recursive-exclude spacy *.cpp

View File

@ -77,31 +77,33 @@ transformers =
ray =
spacy_ray>=0.1.0,<1.0.0
cuda =
cupy>=5.0.0b4,<10.0.0
cupy>=5.0.0b4,<11.0.0
cuda80 =
cupy-cuda80>=5.0.0b4,<10.0.0
cupy-cuda80>=5.0.0b4,<11.0.0
cuda90 =
cupy-cuda90>=5.0.0b4,<10.0.0
cupy-cuda90>=5.0.0b4,<11.0.0
cuda91 =
cupy-cuda91>=5.0.0b4,<10.0.0
cupy-cuda91>=5.0.0b4,<11.0.0
cuda92 =
cupy-cuda92>=5.0.0b4,<10.0.0
cupy-cuda92>=5.0.0b4,<11.0.0
cuda100 =
cupy-cuda100>=5.0.0b4,<10.0.0
cupy-cuda100>=5.0.0b4,<11.0.0
cuda101 =
cupy-cuda101>=5.0.0b4,<10.0.0
cupy-cuda101>=5.0.0b4,<11.0.0
cuda102 =
cupy-cuda102>=5.0.0b4,<10.0.0
cupy-cuda102>=5.0.0b4,<11.0.0
cuda110 =
cupy-cuda110>=5.0.0b4,<10.0.0
cupy-cuda110>=5.0.0b4,<11.0.0
cuda111 =
cupy-cuda111>=5.0.0b4,<10.0.0
cupy-cuda111>=5.0.0b4,<11.0.0
cuda112 =
cupy-cuda112>=5.0.0b4,<10.0.0
cupy-cuda112>=5.0.0b4,<11.0.0
cuda113 =
cupy-cuda113>=5.0.0b4,<10.0.0
cupy-cuda113>=5.0.0b4,<11.0.0
cuda114 =
cupy-cuda114>=5.0.0b4,<10.0.0
cupy-cuda114>=5.0.0b4,<11.0.0
cuda115 =
cupy-cuda115>=5.0.0b4,<11.0.0
apple =
thinc-apple-ops>=0.0.4,<1.0.0
# Language tokenizers with external dependencies

View File

@ -123,7 +123,7 @@ def MultiHashEmbed(
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
account some subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in
the representation as well, with the vectors table will be kept static
the representation as well, with the vectors table kept static
(i.e. it's not updated).
The `width` parameter specifies the output width of the layer and the widths

View File

@ -1,5 +1,5 @@
cimport numpy as np
from libc.stdint cimport uint32_t
from libc.stdint cimport uint32_t, uint64_t
from cython.operator cimport dereference as deref
from libcpp.set cimport set as cppset
from murmurhash.mrmr cimport hash128_x64
@ -353,12 +353,18 @@ cdef class Vectors:
key (str): The string key.
RETURNS: A list of the integer hashes.
"""
cdef uint32_t[4] out
# MurmurHash3_x64_128 returns an array of 2 uint64_t values.
cdef uint64_t[2] out
chars = s.encode("utf8")
cdef char* utf8_string = chars
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
rows = [out[i] for i in range(min(self.hash_count, 4))]
return rows
rows = [
out[0] & 0xffffffffu,
out[0] >> 32,
out[1] & 0xffffffffu,
out[1] >> 32,
]
return rows[:min(self.hash_count, 4)]
def _get_ngrams(self, unicode key):
"""Get all padded ngram strings using the ngram settings.

View File

@ -44,7 +44,7 @@ markup is correct.
"id": "unique-project-id",
"title": "Project title",
"slogan": "A short summary",
"description": "A longer description *Mardown allowed!*",
"description": "A longer description *Markdown allowed!*",
"github": "user/repo",
"pip": "package-name",
"code_example": [

View File

@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`,
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
subword information, without constructing a fully character-based
representation. If pretrained vectors are available, they can be included in the
representation as well, with the vectors table will be kept static (i.e. it's
representation as well, with the vectors table kept static (i.e. it's
not updated).
| Name | Description |
@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are
mapped to a zero vector. See the documentation on
[static vectors](/usage/embeddings-transformers#static-vectors) for details.
| Name |  Description |
| Name | Description |
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ |
| `nM` | The width of the static vectors. ~~Optional[int]~~ |
@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
of feature names to extract, which should refer to token attributes.
| Name |  Description |
| Name | Description |
| ----------- | ------------------------------------------------------------------------ |
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |