mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-11 17:56:30 +03:00
Merge pull request #9951 from explosion/master
Update develop branch with master
This commit is contained in:
commit
b8106e0f95
|
@ -1,11 +1,8 @@
|
|||
recursive-include include *.h
|
||||
recursive-include spacy *.pyi *.pyx *.pxd *.txt *.cfg *.jinja *.toml
|
||||
include LICENSE
|
||||
include README.md
|
||||
include pyproject.toml
|
||||
include spacy/py.typed
|
||||
recursive-exclude spacy/lang *.json
|
||||
recursive-include spacy/lang *.json.gz
|
||||
recursive-include spacy/cli *.json *.yml
|
||||
recursive-include spacy/cli *.yml
|
||||
recursive-include licenses *
|
||||
recursive-exclude spacy *.cpp
|
||||
|
|
28
setup.cfg
28
setup.cfg
|
@ -77,31 +77,33 @@ transformers =
|
|||
ray =
|
||||
spacy_ray>=0.1.0,<1.0.0
|
||||
cuda =
|
||||
cupy>=5.0.0b4,<10.0.0
|
||||
cupy>=5.0.0b4,<11.0.0
|
||||
cuda80 =
|
||||
cupy-cuda80>=5.0.0b4,<10.0.0
|
||||
cupy-cuda80>=5.0.0b4,<11.0.0
|
||||
cuda90 =
|
||||
cupy-cuda90>=5.0.0b4,<10.0.0
|
||||
cupy-cuda90>=5.0.0b4,<11.0.0
|
||||
cuda91 =
|
||||
cupy-cuda91>=5.0.0b4,<10.0.0
|
||||
cupy-cuda91>=5.0.0b4,<11.0.0
|
||||
cuda92 =
|
||||
cupy-cuda92>=5.0.0b4,<10.0.0
|
||||
cupy-cuda92>=5.0.0b4,<11.0.0
|
||||
cuda100 =
|
||||
cupy-cuda100>=5.0.0b4,<10.0.0
|
||||
cupy-cuda100>=5.0.0b4,<11.0.0
|
||||
cuda101 =
|
||||
cupy-cuda101>=5.0.0b4,<10.0.0
|
||||
cupy-cuda101>=5.0.0b4,<11.0.0
|
||||
cuda102 =
|
||||
cupy-cuda102>=5.0.0b4,<10.0.0
|
||||
cupy-cuda102>=5.0.0b4,<11.0.0
|
||||
cuda110 =
|
||||
cupy-cuda110>=5.0.0b4,<10.0.0
|
||||
cupy-cuda110>=5.0.0b4,<11.0.0
|
||||
cuda111 =
|
||||
cupy-cuda111>=5.0.0b4,<10.0.0
|
||||
cupy-cuda111>=5.0.0b4,<11.0.0
|
||||
cuda112 =
|
||||
cupy-cuda112>=5.0.0b4,<10.0.0
|
||||
cupy-cuda112>=5.0.0b4,<11.0.0
|
||||
cuda113 =
|
||||
cupy-cuda113>=5.0.0b4,<10.0.0
|
||||
cupy-cuda113>=5.0.0b4,<11.0.0
|
||||
cuda114 =
|
||||
cupy-cuda114>=5.0.0b4,<10.0.0
|
||||
cupy-cuda114>=5.0.0b4,<11.0.0
|
||||
cuda115 =
|
||||
cupy-cuda115>=5.0.0b4,<11.0.0
|
||||
apple =
|
||||
thinc-apple-ops>=0.0.4,<1.0.0
|
||||
# Language tokenizers with external dependencies
|
||||
|
|
|
@ -123,7 +123,7 @@ def MultiHashEmbed(
|
|||
attributes are NORM, PREFIX, SUFFIX and SHAPE. This lets the model take into
|
||||
account some subword information, without constructing a fully character-based
|
||||
representation. If pretrained vectors are available, they can be included in
|
||||
the representation as well, with the vectors table will be kept static
|
||||
the representation as well, with the vectors table kept static
|
||||
(i.e. it's not updated).
|
||||
|
||||
The `width` parameter specifies the output width of the layer and the widths
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
cimport numpy as np
|
||||
from libc.stdint cimport uint32_t
|
||||
from libc.stdint cimport uint32_t, uint64_t
|
||||
from cython.operator cimport dereference as deref
|
||||
from libcpp.set cimport set as cppset
|
||||
from murmurhash.mrmr cimport hash128_x64
|
||||
|
@ -353,12 +353,18 @@ cdef class Vectors:
|
|||
key (str): The string key.
|
||||
RETURNS: A list of the integer hashes.
|
||||
"""
|
||||
cdef uint32_t[4] out
|
||||
# MurmurHash3_x64_128 returns an array of 2 uint64_t values.
|
||||
cdef uint64_t[2] out
|
||||
chars = s.encode("utf8")
|
||||
cdef char* utf8_string = chars
|
||||
hash128_x64(utf8_string, len(chars), self.hash_seed, &out)
|
||||
rows = [out[i] for i in range(min(self.hash_count, 4))]
|
||||
return rows
|
||||
rows = [
|
||||
out[0] & 0xffffffffu,
|
||||
out[0] >> 32,
|
||||
out[1] & 0xffffffffu,
|
||||
out[1] >> 32,
|
||||
]
|
||||
return rows[:min(self.hash_count, 4)]
|
||||
|
||||
def _get_ngrams(self, unicode key):
|
||||
"""Get all padded ngram strings using the ngram settings.
|
||||
|
|
|
@ -44,7 +44,7 @@ markup is correct.
|
|||
"id": "unique-project-id",
|
||||
"title": "Project title",
|
||||
"slogan": "A short summary",
|
||||
"description": "A longer description – *Mardown allowed!*",
|
||||
"description": "A longer description – *Markdown allowed!*",
|
||||
"github": "user/repo",
|
||||
"pip": "package-name",
|
||||
"code_example": [
|
||||
|
|
|
@ -158,7 +158,7 @@ be configured with the `attrs` argument. The suggested attributes are `NORM`,
|
|||
`PREFIX`, `SUFFIX` and `SHAPE`. This lets the model take into account some
|
||||
subword information, without constructing a fully character-based
|
||||
representation. If pretrained vectors are available, they can be included in the
|
||||
representation as well, with the vectors table will be kept static (i.e. it's
|
||||
representation as well, with the vectors table kept static (i.e. it's
|
||||
not updated).
|
||||
|
||||
| Name | Description |
|
||||
|
@ -296,7 +296,7 @@ learned linear projection to control the dimensionality. Unknown tokens are
|
|||
mapped to a zero vector. See the documentation on
|
||||
[static vectors](/usage/embeddings-transformers#static-vectors) for details.
|
||||
|
||||
| Name | Description |
|
||||
| Name | Description |
|
||||
| ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `nO` | The output width of the layer, after the linear projection. ~~Optional[int]~~ |
|
||||
| `nM` | The width of the static vectors. ~~Optional[int]~~ |
|
||||
|
@ -318,7 +318,7 @@ mapped to a zero vector. See the documentation on
|
|||
Extract arrays of input features from [`Doc`](/api/doc) objects. Expects a list
|
||||
of feature names to extract, which should refer to token attributes.
|
||||
|
||||
| Name | Description |
|
||||
| Name | Description |
|
||||
| ----------- | ------------------------------------------------------------------------ |
|
||||
| `columns` | The token attributes to extract. ~~List[Union[int, str]]~~ |
|
||||
| **CREATES** | The created feature extraction layer. ~~Model[List[Doc], List[Ints2d]]~~ |
|
||||
|
|
Loading…
Reference in New Issue
Block a user