mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-26 09:14:32 +03:00
Remove Vectors.from_glove (#5209)
This commit is contained in:
parent
828acffc12
commit
d88a377bed
|
@ -355,44 +355,6 @@ cdef class Vectors:
|
|||
for i in range(len(queries)) ], dtype="uint64")
|
||||
return (keys, best_rows, scores)
|
||||
|
||||
def from_glove(self, path):
    """Load GloVe vectors from a directory. Assumes binary format,
    that the vocab is in a vocab.txt, and that vectors are named
    vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32
    vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc.
    By default GloVe outputs 64-bit vectors.

    The current `self.data` array is discarded and replaced by the loaded
    table, which is stored as float32 regardless of the on-disk precision.

    path (unicode / Path): The path to load the GloVe vectors from.
    RETURNS: A `StringStore` object, holding the key-to-string mapping.
    RAISES (IOError): If the directory holds no file whose name starts
        with "vectors".

    DOCS: https://spacy.io/api/vectors#from_glove
    """
    path = util.ensure_path(path)
    # Find the first "vectors*" file; its name encodes the width and the
    # one-letter dtype code ("f" or "d").
    for name in path.iterdir():
        if name.parts[-1].startswith("vectors"):
            _, dims, dtype, _2 = name.parts[-1].split(".")
            width = int(dims)
            break
    else:
        # for/else: the loop finished without finding a vectors file.
        raise IOError(Errors.E061.format(filename=path))
    bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype)
    # Pick the array module before dropping the old table.
    xp = get_array_module(self.data)
    # Release the old table first so both are not held in memory at once.
    self.data = None
    with bin_loc.open("rb") as file_:
        self.data = xp.fromfile(file_, dtype=dtype)
    # The dtype taken from the file name is a one-letter code, so compare
    # against "f" as well; only non-float32 data needs converting.
    if dtype not in ("f", "float32"):
        self.data = xp.ascontiguousarray(self.data, dtype="float32")
    # The binary file is a flat buffer; fold it into (n_rows, width).
    if self.data.ndim == 1:
        self.data = self.data.reshape((self.data.size // width, width))
    strings = StringStore()
    # Read the vocab as UTF-8 explicitly rather than relying on the
    # platform's default encoding. Line i of vocab.txt maps to row i.
    with (path / "vocab.txt").open("r", encoding="utf8") as file_:
        for i, line in enumerate(file_):
            key = strings.add(line.strip())
            self.add(key, row=i)
    return strings
|
||||
|
||||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
|
|
|
@ -326,25 +326,6 @@ performed in chunks, to avoid consuming too much memory. You can set the
|
|||
| `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. |
|
||||
| **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. |
|
||||
|
||||
## Vectors.from_glove {#from_glove tag="method"}
|
||||
|
||||
Load [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory.
|
||||
Assumes binary format, that the vocab is in a `vocab.txt`, and that vectors are
|
||||
named `vectors.{size}.[fd].bin`, e.g. `vectors.128.f.bin` for 128d float32
|
||||
vectors, `vectors.300.d.bin` for 300d float64 (double) vectors, etc. By default
|
||||
GloVe outputs 64-bit vectors.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
> ```python
|
||||
> vectors = Vectors()
|
||||
> vectors.from_glove("/path/to/glove_vectors")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | ---------------------------------------- |
|
||||
| `path` | unicode / `Path` | The path to load the GloVe vectors from. |
|
||||
|
||||
## Vectors.to_disk {#to_disk tag="method"}
|
||||
|
||||
Save the current state to a directory.
|
||||
|
|
|
@ -177,37 +177,6 @@ for word, vector in vector_data.items():
|
|||
vocab.set_vector(word, vector)
|
||||
```
|
||||
|
||||
### Loading GloVe vectors {#custom-loading-glove new="2"}
|
||||
|
||||
spaCy comes with built-in support for loading
|
||||
[GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. The
|
||||
[`Vectors.from_glove`](/api/vectors#from_glove) method assumes a binary format,
|
||||
the vocab provided in a `vocab.txt`, and the naming scheme of
|
||||
`vectors.{size}.[fd].bin`. For example:
|
||||
|
||||
```yaml
|
||||
### Directory structure
|
||||
└── vectors
|
||||
├── vectors.128.f.bin # vectors file
|
||||
└── vocab.txt # vocabulary
|
||||
```
|
||||
|
||||
| File name | Dimensions | Data type |
|
||||
| ------------------- | ---------- | ---------------- |
|
||||
| `vectors.128.f.bin` | 128 | float32 |
|
||||
| `vectors.300.d.bin` | 300 | float64 (double) |
|
||||
|
||||
```python
|
||||
nlp = spacy.load("en_core_web_sm")
|
||||
nlp.vocab.vectors.from_glove("/path/to/vectors")
|
||||
```
|
||||
|
||||
If your instance of `Language` already contains vectors, they will be
|
||||
overwritten. To create your own GloVe vectors model package like spaCy's
|
||||
[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call
|
||||
[`nlp.to_disk`](/api/language#to_disk), and then package the model using the
|
||||
[`package`](/api/cli#package) command.
|
||||
|
||||
### Using custom similarity methods {#custom-similarity}
|
||||
|
||||
By default, [`Token.vector`](/api/token#vector) returns the vector for its
|
||||
|
|
Loading…
Reference in New Issue
Block a user