mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-24 17:06:29 +03:00
Remove Vectors.from_glove (#5209)
This commit is contained in:
parent
828acffc12
commit
d88a377bed
|
@ -355,44 +355,6 @@ cdef class Vectors:
|
||||||
for i in range(len(queries)) ], dtype="uint64")
|
for i in range(len(queries)) ], dtype="uint64")
|
||||||
return (keys, best_rows, scores)
|
return (keys, best_rows, scores)
|
||||||
|
|
||||||
def from_glove(self, path):
|
|
||||||
"""Load GloVe vectors from a directory. Assumes binary format,
|
|
||||||
that the vocab is in a vocab.txt, and that vectors are named
|
|
||||||
vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
|
|
||||||
vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
|
|
||||||
By default GloVe outputs 64-bit vectors.
|
|
||||||
|
|
||||||
path (unicode / Path): The path to load the GloVe vectors from.
|
|
||||||
RETURNS: A `StringStore` object, holding the key-to-string mapping.
|
|
||||||
|
|
||||||
DOCS: https://spacy.io/api/vectors#from_glove
|
|
||||||
"""
|
|
||||||
path = util.ensure_path(path)
|
|
||||||
width = None
|
|
||||||
for name in path.iterdir():
|
|
||||||
if name.parts[-1].startswith("vectors"):
|
|
||||||
_, dims, dtype, _2 = name.parts[-1].split('.')
|
|
||||||
width = int(dims)
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
raise IOError(Errors.E061.format(filename=path))
|
|
||||||
bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype)
|
|
||||||
xp = get_array_module(self.data)
|
|
||||||
self.data = None
|
|
||||||
with bin_loc.open("rb") as file_:
|
|
||||||
self.data = xp.fromfile(file_, dtype=dtype)
|
|
||||||
if dtype != "float32":
|
|
||||||
self.data = xp.ascontiguousarray(self.data, dtype="float32")
|
|
||||||
if self.data.ndim == 1:
|
|
||||||
self.data = self.data.reshape((self.data.size//width, width))
|
|
||||||
n = 0
|
|
||||||
strings = StringStore()
|
|
||||||
with (path / "vocab.txt").open("r") as file_:
|
|
||||||
for i, line in enumerate(file_):
|
|
||||||
key = strings.add(line.strip())
|
|
||||||
self.add(key, row=i)
|
|
||||||
return strings
|
|
||||||
|
|
||||||
def to_disk(self, path, **kwargs):
|
def to_disk(self, path, **kwargs):
|
||||||
"""Save the current state to a directory.
|
"""Save the current state to a directory.
|
||||||
|
|
||||||
|
|
|
@ -326,25 +326,6 @@ performed in chunks, to avoid consuming too much memory. You can set the
|
||||||
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. |
|
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. |
|
||||||
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. |
|
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. |
|
||||||
|
|
||||||
## Vectors.from_glove {#from_glove tag="method"}
|
|
||||||
|
|
||||||
Load [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory.
|
|
||||||
Assumes binary format, that the vocab is in a `vocab.txt`, and that vectors are
|
|
||||||
named `vectors.{size}.[fd].bin`, e.g. `vectors.128.f.bin` for 128d float32
|
|
||||||
vectors, `vectors.300.d.bin` for 300d float64 (double) vectors, etc. By default
|
|
||||||
GloVe outputs 64-bit vectors.
|
|
||||||
|
|
||||||
> #### Example
|
|
||||||
>
|
|
||||||
> ```python
|
|
||||||
> vectors = Vectors()
|
|
||||||
> vectors.from_glove("/path/to/glove_vectors")
|
|
||||||
> ```
|
|
||||||
|
|
||||||
| Name | Type | Description |
|
|
||||||
| ------ | ---------------- | ---------------------------------------- |
|
|
||||||
| `path` | unicode / `Path` | The path to load the GloVe vectors from. |
|
|
||||||
|
|
||||||
## Vectors.to_disk {#to_disk tag="method"}
|
## Vectors.to_disk {#to_disk tag="method"}
|
||||||
|
|
||||||
Save the current state to a directory.
|
Save the current state to a directory.
|
||||||
|
|
|
@ -177,37 +177,6 @@ for word, vector in vector_data.items():
|
||||||
vocab.set_vector(word, vector)
|
vocab.set_vector(word, vector)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Loading GloVe vectors {#custom-loading-glove new="2"}
|
|
||||||
|
|
||||||
spaCy comes with built-in support for loading
|
|
||||||
[GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. The
|
|
||||||
[`Vectors.from_glove`](/api/vectors#from_glove) method assumes a binary format,
|
|
||||||
the vocab provided in a `vocab.txt`, and the naming scheme of
|
|
||||||
`vectors.{size}.[fd].bin`. For example:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
### Directory structure
|
|
||||||
└── vectors
|
|
||||||
├── vectors.128.f.bin # vectors file
|
|
||||||
└── vocab.txt # vocabulary
|
|
||||||
```
|
|
||||||
|
|
||||||
| File name | Dimensions | Data type |
|
|
||||||
| ------------------- | ---------- | ---------------- |
|
|
||||||
| `vectors.128.f.bin` | 128 | float32 |
|
|
||||||
| `vectors.300.d.bin` | 300 | float64 (double) |
|
|
||||||
|
|
||||||
```python
|
|
||||||
nlp = spacy.load("en_core_web_sm")
|
|
||||||
nlp.vocab.vectors.from_glove("/path/to/vectors")
|
|
||||||
```
|
|
||||||
|
|
||||||
If your instance of `Language` already contains vectors, they will be
|
|
||||||
overwritten. To create your own GloVe vectors model package like spaCy's
|
|
||||||
[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call
|
|
||||||
[`nlp.to_disk`](/api/language#to_disk), and then package the model using the
|
|
||||||
[`package`](/api/cli#package) command.
|
|
||||||
|
|
||||||
### Using custom similarity methods {#custom-similarity}
|
### Using custom similarity methods {#custom-similarity}
|
||||||
|
|
||||||
By default, [`Token.vector`](/api/token#vector) returns the vector for its
|
By default, [`Token.vector`](/api/token#vector) returns the vector for its
|
||||||
|
|
Loading…
Reference in New Issue
Block a user