From d88a377bed122018dd54b4228f48b73bee6881b1 Mon Sep 17 00:00:00 2001 From: adrianeboyd Date: Thu, 26 Mar 2020 10:45:47 +0100 Subject: [PATCH] Remove Vectors.from_glove (#5209) --- spacy/vectors.pyx | 38 ------------------------ website/docs/api/vectors.md | 19 ------------ website/docs/usage/vectors-similarity.md | 31 ------------------- 3 files changed, 88 deletions(-) diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx index c6526b89d..f8643640a 100644 --- a/spacy/vectors.pyx +++ b/spacy/vectors.pyx @@ -355,44 +355,6 @@ cdef class Vectors: for i in range(len(queries)) ], dtype="uint64") return (keys, best_rows, scores) - def from_glove(self, path): - """Load GloVe vectors from a directory. Assumes binary format, - that the vocab is in a vocab.txt, and that vectors are named - vectors.{size}.[fd].bin, e.g. vectors.128.f.bin for 128d float32 - vectors, vectors.300.d.bin for 300d float64 (double) vectors, etc. - By default GloVe outputs 64-bit vectors. - - path (unicode / Path): The path to load the GloVe vectors from. - RETURNS: A `StringStore` object, holding the key-to-string mapping. - - DOCS: https://spacy.io/api/vectors#from_glove - """ - path = util.ensure_path(path) - width = None - for name in path.iterdir(): - if name.parts[-1].startswith("vectors"): - _, dims, dtype, _2 = name.parts[-1].split('.') - width = int(dims) - break - else: - raise IOError(Errors.E061.format(filename=path)) - bin_loc = path / "vectors.{dims}.{dtype}.bin".format(dims=dims, dtype=dtype) - xp = get_array_module(self.data) - self.data = None - with bin_loc.open("rb") as file_: - self.data = xp.fromfile(file_, dtype=dtype) - if dtype != "float32": - self.data = xp.ascontiguousarray(self.data, dtype="float32") - if self.data.ndim == 1: - self.data = self.data.reshape((self.data.size//width, width)) - n = 0 - strings = StringStore() - with (path / "vocab.txt").open("r") as file_: - for i, line in enumerate(file_): - key = strings.add(line.strip()) - self.add(key, row=i) - return strings - def to_disk(self, path, **kwargs): """Save the current state to a directory. diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md index 3588672db..93e747c1e 100644 --- a/website/docs/api/vectors.md +++ b/website/docs/api/vectors.md @@ -326,25 +326,6 @@ performed in chunks, to avoid consuming too much memory. You can set the | `sort` | bool | Whether to sort the entries returned by score. Defaults to `True`. | | **RETURNS** | tuple | The most similar entries as a `(keys, best_rows, scores)` tuple. | -## Vectors.from_glove {#from_glove tag="method"} - -Load [GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. -Assumes binary format, that the vocab is in a `vocab.txt`, and that vectors are -named `vectors.{size}.[fd.bin]`, e.g. `vectors.128.f.bin` for 128d float32 -vectors, `vectors.300.d.bin` for 300d float64 (double) vectors, etc. By default -GloVe outputs 64-bit vectors. - -> #### Example -> -> ```python -> vectors = Vectors() -> vectors.from_glove("/path/to/glove_vectors") -> ``` - -| Name | Type | Description | -| ------ | ---------------- | ---------------------------------------- | -| `path` | unicode / `Path` | The path to load the GloVe vectors from. | - ## Vectors.to_disk {#to_disk tag="method"} Save the current state to a directory. diff --git a/website/docs/usage/vectors-similarity.md b/website/docs/usage/vectors-similarity.md index 93ba67704..9b65bb80a 100644 --- a/website/docs/usage/vectors-similarity.md +++ b/website/docs/usage/vectors-similarity.md @@ -177,37 +177,6 @@ for word, vector in vector_data.items(): vocab.set_vector(word, vector) ``` -### Loading GloVe vectors {#custom-loading-glove new="2"} - -spaCy comes with built-in support for loading -[GloVe](https://nlp.stanford.edu/projects/glove/) vectors from a directory. The -[`Vectors.from_glove`](/api/vectors#from_glove) method assumes a binary format, -the vocab provided in a `vocab.txt`, and the naming scheme of -`vectors.{size}.[fd`.bin]. For example: - -```yaml -### Directory structure -└── vectors - ├── vectors.128.f.bin # vectors file - └── vocab.txt # vocabulary -``` - -| File name | Dimensions | Data type | -| ------------------- | ---------- | ---------------- | -| `vectors.128.f.bin` | 128 | float32 | -| `vectors.300.d.bin` | 300 | float64 (double) | - -```python -nlp = spacy.load("en_core_web_sm") -nlp.vocab.vectors.from_glove("/path/to/vectors") -``` - -If your instance of `Language` already contains vectors, they will be -overwritten. To create your own GloVe vectors model package like spaCy's -[`en_vectors_web_lg`](/models/en-starters#en_vectors_web_lg), you can call -[`nlp.to_disk`](/api/language#to_disk), and then package the model using the -[`package`](/api/cli#package) command. - ### Using custom similarity methods {#custom-similarity} By default, [`Token.vector`](/api/token#vector) returns the vector for its