Remove names for vectors (#12243)

* Remove names for vectors

Named vectors are basically a carry-over from v2 and aren't used for
anything.

* Format
This commit is contained in:
Adriane Boyd 2023-02-08 14:37:42 +01:00 committed by GitHub
parent 5089efa2d0
commit cf85b81f34
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 18 additions and 44 deletions

View File

@ -21,7 +21,6 @@ def init_vectors_cli(
prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"), prune: int = Opt(-1, "--prune", "-p", help="Optional number of vectors to prune to"),
truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate: int = Opt(0, "--truncate", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"), mode: str = Opt("default", "--mode", "-m", help="Vectors mode: default or floret"),
name: Optional[str] = Opt(None, "--name", "-n", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"), verbose: bool = Opt(False, "--verbose", "-V", "-VV", help="Display more information for debugging purposes"),
jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True), jsonl_loc: Optional[Path] = Opt(None, "--lexemes-jsonl", "-j", help="Location of JSONL-formatted attributes file", hidden=True),
# fmt: on # fmt: on
@ -44,7 +43,6 @@ def init_vectors_cli(
vectors_loc, vectors_loc,
truncate=truncate, truncate=truncate,
prune=prune, prune=prune,
name=name,
mode=mode, mode=mode,
) )
msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors") msg.good(f"Successfully converted {len(nlp.vocab.vectors)} vectors")

View File

@ -174,8 +174,7 @@ class Language:
if not isinstance(vocab, Vocab) and vocab is not True: if not isinstance(vocab, Vocab) and vocab is not True:
raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab))) raise ValueError(Errors.E918.format(vocab=vocab, vocab_type=type(Vocab)))
if vocab is True: if vocab is True:
vectors_name = meta.get("vectors", {}).get("name") vocab = create_vocab(self.lang, self.Defaults)
vocab = create_vocab(self.lang, self.Defaults, vectors_name=vectors_name)
else: else:
if (self.lang and vocab.lang) and (self.lang != vocab.lang): if (self.lang and vocab.lang) and (self.lang != vocab.lang):
raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang)) raise ValueError(Errors.E150.format(nlp=self.lang, vocab=vocab.lang))
@ -229,7 +228,6 @@ class Language:
"width": self.vocab.vectors_length, "width": self.vocab.vectors_length,
"vectors": len(self.vocab.vectors), "vectors": len(self.vocab.vectors),
"keys": self.vocab.vectors.n_keys, "keys": self.vocab.vectors.n_keys,
"name": self.vocab.vectors.name,
"mode": self.vocab.vectors.mode, "mode": self.vocab.vectors.mode,
} }
self._meta["labels"] = dict(self.pipe_labels) self._meta["labels"] = dict(self.pipe_labels)
@ -2197,9 +2195,6 @@ class Language:
if path.exists(): if path.exists():
data = srsly.read_json(path) data = srsly.read_json(path)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
def deserialize_vocab(path: Path) -> None: def deserialize_vocab(path: Path) -> None:
if path.exists(): if path.exists():
@ -2268,9 +2263,6 @@ class Language:
def deserialize_meta(b): def deserialize_meta(b):
data = srsly.json_loads(b) data = srsly.json_loads(b)
self.meta.update(data) self.meta.update(data)
# self.meta always overrides meta["vectors"] with the metadata
# from self.vocab.vectors, so set the name directly
self.vocab.vectors.name = data.get("vectors", {}).get("name")
deserializers: Dict[str, Callable[[bytes], Any]] = {} deserializers: Dict[str, Callable[[bytes], Any]] = {}
deserializers["config.cfg"] = lambda b: self.config.from_bytes( deserializers["config.cfg"] = lambda b: self.config.from_bytes(

View File

@ -181,7 +181,7 @@ def test_issue4042_bug2():
@pytest.mark.issue(4725) @pytest.mark.issue(4725)
def test_issue4725_1(): def test_issue4725_1():
"""Ensure the pickling of the NER goes well""" """Ensure the pickling of the NER goes well"""
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
nlp = English(vocab=vocab) nlp = English(vocab=vocab)
config = { config = {
"update_with_oracle_cut_size": 111, "update_with_oracle_cut_size": 111,

View File

@ -84,7 +84,7 @@ def test_issue1539():
@pytest.mark.issue(1807) @pytest.mark.issue(1807)
def test_issue1807(): def test_issue1807():
"""Test vocab.set_vector also adds the word to the vocab.""" """Test vocab.set_vector also adds the word to the vocab."""
vocab = Vocab(vectors_name="test_issue1807") vocab = Vocab()
assert "hello" not in vocab assert "hello" not in vocab
vocab.set_vector("hello", numpy.ones((50,), dtype="f")) vocab.set_vector("hello", numpy.ones((50,), dtype="f"))
assert "hello" in vocab assert "hello" in vocab
@ -94,13 +94,12 @@ def test_issue1807():
def test_issue2871(): def test_issue2871():
"""Test that vectors recover the correct key for spaCy reserved words.""" """Test that vectors recover the correct key for spaCy reserved words."""
words = ["dog", "cat", "SUFFIX"] words = ["dog", "cat", "SUFFIX"]
vocab = Vocab(vectors_name="test_issue2871") vocab = Vocab()
vocab.vectors.resize(shape=(3, 10)) vocab.vectors.resize(shape=(3, 10))
vector_data = numpy.zeros((3, 10), dtype="f") vector_data = numpy.zeros((3, 10), dtype="f")
for word in words: for word in words:
_ = vocab[word] # noqa: F841 _ = vocab[word] # noqa: F841
vocab.set_vector(word, vector_data[0]) vocab.set_vector(word, vector_data[0])
vocab.vectors.name = "dummy_vectors"
assert vocab["dog"].rank == 0 assert vocab["dog"].rank == 0
assert vocab["cat"].rank == 1 assert vocab["cat"].rank == 1
assert vocab["SUFFIX"].rank == 2 assert vocab["SUFFIX"].rank == 2
@ -125,7 +124,7 @@ def test_issue4725_2():
# ensures that this runs correctly and doesn't hang or crash because of the global vectors # ensures that this runs correctly and doesn't hang or crash because of the global vectors
# if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows), # if it does crash, it's usually because of calling 'spawn' for multiprocessing (e.g. on Windows),
# or because of issues with pickling the NER (cf test_issue4725_1) # or because of issues with pickling the NER (cf test_issue4725_1)
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
data = numpy.ndarray((5, 3), dtype="f") data = numpy.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -340,7 +339,7 @@ def test_vectors_doc_doc_similarity(vocab, text1, text2):
def test_vocab_add_vector(): def test_vocab_add_vector():
vocab = Vocab(vectors_name="test_vocab_add_vector") vocab = Vocab()
data = OPS.xp.ndarray((5, 3), dtype="f") data = OPS.xp.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0
@ -356,7 +355,7 @@ def test_vocab_add_vector():
def test_vocab_prune_vectors(): def test_vocab_prune_vectors():
vocab = Vocab(vectors_name="test_vocab_prune_vectors") vocab = Vocab()
_ = vocab["cat"] # noqa: F841 _ = vocab["cat"] # noqa: F841
_ = vocab["dog"] # noqa: F841 _ = vocab["dog"] # noqa: F841
_ = vocab["kitten"] # noqa: F841 _ = vocab["kitten"] # noqa: F841
@ -405,7 +404,7 @@ def test_vectors_serialize():
def test_vector_is_oov(): def test_vector_is_oov():
vocab = Vocab(vectors_name="test_vocab_is_oov") vocab = Vocab()
data = OPS.xp.ndarray((5, 3), dtype="f") data = OPS.xp.ndarray((5, 3), dtype="f")
data[0] = 1.0 data[0] = 1.0
data[1] = 2.0 data[1] = 2.0

View File

@ -202,7 +202,6 @@ def convert_vectors(
*, *,
truncate: int, truncate: int,
prune: int, prune: int,
name: Optional[str] = None,
mode: str = VectorsMode.default, mode: str = VectorsMode.default,
) -> None: ) -> None:
vectors_loc = ensure_path(vectors_loc) vectors_loc = ensure_path(vectors_loc)
@ -241,12 +240,6 @@ def convert_vectors(
strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys strings=nlp.vocab.strings, data=vectors_data, keys=vector_keys
) )
nlp.vocab.deduplicate_vectors() nlp.vocab.deduplicate_vectors()
if name is None:
# TODO: Is this correct? Does this matter?
nlp.vocab.vectors.name = f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
else:
nlp.vocab.vectors.name = name
nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
if prune >= 1 and mode != VectorsMode.floret: if prune >= 1 and mode != VectorsMode.floret:
nlp.vocab.prune_vectors(prune) nlp.vocab.prune_vectors(prune)

View File

@ -52,7 +52,6 @@ cdef class Vectors:
DOCS: https://spacy.io/api/vectors DOCS: https://spacy.io/api/vectors
""" """
cdef public object strings cdef public object strings
cdef public object name
cdef readonly object mode cdef readonly object mode
cdef public object data cdef public object data
cdef public object key2row cdef public object key2row
@ -64,14 +63,13 @@ cdef class Vectors:
cdef readonly unicode bow cdef readonly unicode bow
cdef readonly unicode eow cdef readonly unicode eow
def __init__(self, *, strings=None, shape=None, data=None, keys=None, name=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"): def __init__(self, *, strings=None, shape=None, data=None, keys=None, mode=Mode.default, minn=0, maxn=0, hash_count=1, hash_seed=0, bow="<", eow=">"):
"""Create a new vector store. """Create a new vector store.
strings (StringStore): The string store. strings (StringStore): The string store.
shape (tuple): Size of the table, as (# entries, # columns) shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray or cupy.ndarray): The vector data. data (numpy.ndarray or cupy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data. keys (iterable): A sequence of keys, aligned with the data.
name (str): A name to identify the vectors table.
mode (str): Vectors mode: "default" or "floret" (default: "default"). mode (str): Vectors mode: "default" or "floret" (default: "default").
minn (int): The floret char ngram minn (default: 0). minn (int): The floret char ngram minn (default: 0).
maxn (int): The floret char ngram maxn (default: 0). maxn (int): The floret char ngram maxn (default: 0).
@ -85,7 +83,6 @@ cdef class Vectors:
self.strings = strings self.strings = strings
if self.strings is None: if self.strings is None:
self.strings = StringStore() self.strings = StringStore()
self.name = name
if mode not in Mode.values(): if mode not in Mode.values():
raise ValueError( raise ValueError(
Errors.E202.format( Errors.E202.format(

View File

@ -11,7 +11,8 @@ from .vectors import Vectors
from pathlib import Path from pathlib import Path
def create_vocab( def create_vocab(
lang: Optional[str], defaults: Any, vectors_name: Optional[str] = ... lang: Optional[str],
defaults: Any,
) -> Vocab: ... ) -> Vocab: ...
class Vocab: class Vocab:
@ -28,7 +29,6 @@ class Vocab:
strings: Optional[Union[List[str], StringStore]] = ..., strings: Optional[Union[List[str], StringStore]] = ...,
lookups: Optional[Lookups] = ..., lookups: Optional[Lookups] = ...,
oov_prob: float = ..., oov_prob: float = ...,
vectors_name: Optional[str] = ...,
writing_system: Dict[str, Any] = ..., writing_system: Dict[str, Any] = ...,
get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ..., get_noun_chunks: Optional[Callable[[Union[Doc, Span]], Iterator[Span]]] = ...,
) -> None: ... ) -> None: ...

View File

@ -23,7 +23,7 @@ from .lang.norm_exceptions import BASE_NORMS
from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang from .lang.lex_attrs import LEX_ATTRS, is_stop, get_lang
def create_vocab(lang, defaults, vectors_name=None): def create_vocab(lang, defaults):
# If the spacy-lookups-data package is installed, we pre-populate the lookups # If the spacy-lookups-data package is installed, we pre-populate the lookups
# with lexeme data, if available # with lexeme data, if available
lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters} lex_attrs = {**LEX_ATTRS, **defaults.lex_attr_getters}
@ -39,7 +39,6 @@ def create_vocab(lang, defaults, vectors_name=None):
lex_attr_getters=lex_attrs, lex_attr_getters=lex_attrs,
writing_system=defaults.writing_system, writing_system=defaults.writing_system,
get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"), get_noun_chunks=defaults.syntax_iterators.get("noun_chunks"),
vectors_name=vectors_name,
) )
@ -51,8 +50,8 @@ cdef class Vocab:
DOCS: https://spacy.io/api/vocab DOCS: https://spacy.io/api/vocab
""" """
def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None, def __init__(self, lex_attr_getters=None, strings=tuple(), lookups=None,
oov_prob=-20., vectors_name=None, writing_system={}, oov_prob=-20., writing_system={}, get_noun_chunks=None,
get_noun_chunks=None, **deprecated_kwargs): **deprecated_kwargs):
"""Create the vocabulary. """Create the vocabulary.
lex_attr_getters (dict): A dictionary mapping attribute IDs to lex_attr_getters (dict): A dictionary mapping attribute IDs to
@ -61,7 +60,6 @@ cdef class Vocab:
vice versa. vice versa.
lookups (Lookups): Container for large lookup tables and dictionaries. lookups (Lookups): Container for large lookup tables and dictionaries.
oov_prob (float): Default OOV probability. oov_prob (float): Default OOV probability.
vectors_name (str): Optional name to identify the vectors table.
get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]): get_noun_chunks (Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]):
A function that yields base noun phrases used for Doc.noun_chunks. A function that yields base noun phrases used for Doc.noun_chunks.
""" """
@ -78,7 +76,7 @@ cdef class Vocab:
_ = self[string] _ = self[string]
self.lex_attr_getters = lex_attr_getters self.lex_attr_getters = lex_attr_getters
self.morphology = Morphology(self.strings) self.morphology = Morphology(self.strings)
self.vectors = Vectors(strings=self.strings, name=vectors_name) self.vectors = Vectors(strings=self.strings)
self.lookups = lookups self.lookups = lookups
self.writing_system = writing_system self.writing_system = writing_system
self.get_noun_chunks = get_noun_chunks self.get_noun_chunks = get_noun_chunks
@ -308,7 +306,7 @@ cdef class Vocab:
for key, row in self.vectors.key2row.items() for key, row in self.vectors.key2row.items()
} }
# replace vectors with deduplicated version # replace vectors with deduplicated version
self.vectors = Vectors(strings=self.strings, data=data, name=self.vectors.name) self.vectors = Vectors(strings=self.strings, data=data)
for key, row in key2row.items(): for key, row in key2row.items():
self.vectors.add(key, row=row) self.vectors.add(key, row=row)
@ -358,7 +356,7 @@ cdef class Vocab:
keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64") keys = xp.asarray([key for (prob, i, key) in priority], dtype="uint64")
keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]]) keep = xp.ascontiguousarray(self.vectors.data[indices[:nr_row]])
toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]]) toss = xp.ascontiguousarray(self.vectors.data[indices[nr_row:]])
self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row], name=self.vectors.name) self.vectors = Vectors(strings=self.strings, data=keep, keys=keys[:nr_row])
syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size) syn_keys, syn_rows, scores = self.vectors.most_similar(toss, batch_size=batch_size)
syn_keys = ops.to_numpy(syn_keys) syn_keys = ops.to_numpy(syn_keys)
remap = {} remap = {}

View File

@ -201,7 +201,7 @@ This functionality was previously available as part of the command `init-model`.
</Infobox> </Infobox>
```bash ```bash
$ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--name] [--verbose] $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--truncate] [--verbose]
``` ```
| Name | Description | | Name | Description |
@ -212,7 +212,6 @@ $ python -m spacy init vectors [lang] [vectors_loc] [output_dir] [--prune] [--tr
| `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ | | `--truncate`, `-t` | Number of vectors to truncate to when reading in vectors file. Defaults to `0` for no truncation. ~~int (option)~~ |
| `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ | | `--prune`, `-p` | Number of vectors to prune the vocabulary to. Defaults to `-1` for no pruning. ~~int (option)~~ |
| `--mode`, `-m` | Vectors mode: `default` or [`floret`](https://github.com/explosion/floret). Defaults to `default`. ~~str (option)~~ |
| `--name`, `-n` | Name to assign to the word vectors in the `meta.json`, e.g. `en_core_web_md.vectors`. ~~Optional[str] \(option)~~ |
| `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ | | `--verbose`, `-V` | Print additional information and explanations. ~~bool (flag)~~ |
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ | | `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | A spaCy pipeline directory containing the vocab and vectors. | | **CREATES** | A spaCy pipeline directory containing the vocab and vectors. |

View File

@ -52,7 +52,6 @@ modified later.
| `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ | | `shape` | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. ~~Tuple[int, int]~~ |
| `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ | | `data` | The vector data. ~~numpy.ndarray[ndim=2, dtype=float32]~~ |
| `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ | | `keys` | A sequence of keys aligned with the data. ~~Iterable[Union[str, int]]~~ |
| `name` | A name to identify the vectors table. ~~str~~ |
| `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ | | `mode` <Tag variant="new">3.2</Tag> | Vectors mode: `"default"` or [`"floret"`](https://github.com/explosion/floret) (default: `"default"`). ~~str~~ |
| `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ | | `minn` <Tag variant="new">3.2</Tag> | The floret char ngram minn (default: `0`). ~~int~~ |
| `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ | | `maxn` <Tag variant="new">3.2</Tag> | The floret char ngram maxn (default: `0`). ~~int~~ |

View File

@ -27,7 +27,6 @@ Create the vocabulary.
| `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ | | `strings` | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. ~~Union[List[str], StringStore]~~ |
| `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ | | `lookups` | A [`Lookups`](/api/lookups) that stores the `lexeme_norm` and other large lookup tables. Defaults to `None`. ~~Optional[Lookups]~~ |
| `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ | | `oov_prob` | The default OOV probability. Defaults to `-20.0`. ~~float~~ |
| `vectors_name` | A name to identify the vectors table. ~~str~~ |
| `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ | | `writing_system` | A dictionary describing the language's writing system. Typically provided by [`Language.Defaults`](/api/language#defaults). ~~Dict[str, Any]~~ |
| `get_noun_chunks` | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span]], Iterator[Tuple[int, int, int]]]]~~ |