c053f158c5

Add support for fasttext-bloom hash-only vectors

Overview:

* Extend `Vectors` to have two modes: `default` and `ngram`
  * `default` is the default mode and equivalent to the current `Vectors`
  * `ngram` supports the hash-only ngram tables from `fasttext-bloom`
* Extend `spacy.StaticVectors.v2` to handle both modes with no changes for `default` vectors
* Extend `spacy init vectors` to support ngram tables

The `ngram` mode **only** supports vector tables produced by this fork of fastText, which adds an option to represent all vectors using only the ngram buckets table and which uses the exact same ngram generation algorithm and hash function (`MurmurHash3_x64_128`). `fasttext-bloom` produces an additional `.hashvec` table, which can be loaded by `spacy init vectors --fasttext-bloom-vectors`:
https://github.com/adrianeboyd/fastText/tree/feature/bloom

Implementation details:

* `Vectors` now includes the `StringStore` as `Vectors.strings` so that the API can stay consistent for both `default` (which can look up from `str` or `int`) and `ngram` (which requires `str` to calculate the ngrams).
* In ngram mode, `Vectors` uses a default `Vectors` object as a cache, since ngram vector lookups are relatively expensive.
  * The default cache size is the same as the size of the provided ngram vector table.
  * Once the cache is full, no more entries are added. The user is responsible for managing the cache in cases where the initial documents are not representative of the texts.
  * The cache can be resized by setting `Vectors.ngram_cache_size` or cleared with `vectors._ngram_cache.clear()`.
* The API ends up somewhat split between methods for `default` and for `ngram`, so functions that only make sense for one mode include warnings with custom messages suggesting alternatives where possible.
* `Vocab.vectors` becomes a property so that the string stores can be synced when vectors are assigned to a vocab.
* `Vectors` serializes its own config settings as `vectors.cfg`.
* The `Vectors` serialization methods now support `exclude` so that the `Vocab` can exclude the `Vectors` strings while serializing.

Removed:

* The `minn` and `maxn` options and related code from `Vocab.get_vector`, which do not work in a meaningful way for default vector tables.
* The unused `GlobalRegistry` in `Vectors`.

Follow-up commits squashed into this change:

* Refactor to use `reduce_mean` and remove the ngram vectors cache.
* Rename `ngram` mode to `floret`, including in error messages.
* Use `--vectors-mode` in the CLI and vector init, then rename `--vectors-mode` to `--mode` in the `init vectors` CLI; fix the vectors mode in init and remove an unused variable.
* Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support both modes.
* Minor API, docstring, and documentation updates: `Vectors` docstrings, API docs for `Vectors` and the `init vectors` CLI, and types for `StaticVectors`.
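Below is a minimal usage sketch of the final floret mode described above. The table shape, the `minn`/`maxn` values, and the sample word are illustrative assumptions, not taken from this commit:

```python
from thinc.api import get_current_ops
from spacy.vectors import Vectors
from spacy.vocab import Vocab

ops = get_current_ops()
# 10 ngram hash buckets x 4 dims: in floret mode every string is mapped
# into the bucket table via its character ngram hashes, so any string
# gets a vector and there are no OOV lookups.
vectors = Vectors(
    data=ops.xp.zeros((10, 4), dtype="f"),
    mode="floret",
    minn=2,
    maxn=3,
    hash_count=1,
)
vocab = Vocab()
vocab.vectors = vectors  # the property setter syncs the string stores
assert vocab.get_vector("rats").shape == (4,)
```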
142 lines
5.2 KiB
Python
import pickle

import pytest
from thinc.api import get_current_ops

from spacy.strings import StringStore
from spacy.vectors import Vectors
from spacy.vocab import Vocab

from ..util import make_tempdir


test_strings = [([], []), (["rats", "are", "cute"], ["i", "like", "rats"])]
test_strings_attrs = [(["rats", "are", "cute"], "Hello")]


@pytest.mark.parametrize("text", ["rat"])
def test_serialize_vocab(en_vocab, text):
    text_hash = en_vocab.strings.add(text)
    vocab_bytes = en_vocab.to_bytes(exclude=["lookups"])
    new_vocab = Vocab().from_bytes(vocab_bytes)
    assert new_vocab.strings[text_hash] == text
    assert new_vocab.to_bytes(exclude=["lookups"]) == vocab_bytes


@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_vocab_roundtrip_bytes(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    vocab1_b = vocab1.to_bytes()
    vocab2_b = vocab2.to_bytes()
    if strings1 == strings2:
        assert vocab1_b == vocab2_b
    else:
        assert vocab1_b != vocab2_b
    vocab1 = vocab1.from_bytes(vocab1_b)
    assert vocab1.to_bytes() == vocab1_b
    new_vocab1 = Vocab().from_bytes(vocab1_b)
    assert new_vocab1.to_bytes() == vocab1_b
    assert len(new_vocab1.strings) == len(strings1)
    assert sorted([s for s in new_vocab1.strings]) == sorted(strings1)


@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_vocab_roundtrip_disk(strings1, strings2):
    vocab1 = Vocab(strings=strings1)
    vocab2 = Vocab(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "vocab1"
        file_path2 = d / "vocab2"
        vocab1.to_disk(file_path1)
        vocab2.to_disk(file_path2)
        vocab1_d = Vocab().from_disk(file_path1)
        vocab2_d = Vocab().from_disk(file_path2)
        # check strings rather than lexemes, which are only reloaded on demand
        assert set(strings1) == set([s for s in vocab1_d.strings])
        assert set(strings2) == set([s for s in vocab2_d.strings])
        if set(strings1) == set(strings2):
            assert [s for s in vocab1_d.strings] == [s for s in vocab2_d.strings]
        else:
            assert [s for s in vocab1_d.strings] != [s for s in vocab2_d.strings]


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_serialize_vocab_lex_attrs_bytes(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    vocab2 = vocab2.from_bytes(vocab1.to_bytes())
    assert vocab2[strings[0]].norm_ == lex_attr


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_deserialize_vocab_seen_entries(strings, lex_attr):
    # Reported in #2153
    vocab = Vocab(strings=strings)
    vocab.from_bytes(vocab.to_bytes())
    assert len(vocab.strings) == len(strings)


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_serialize_vocab_lex_attrs_disk(strings, lex_attr):
    vocab1 = Vocab(strings=strings)
    vocab2 = Vocab()
    vocab1[strings[0]].norm_ = lex_attr
    assert vocab1[strings[0]].norm_ == lex_attr
    assert vocab2[strings[0]].norm_ != lex_attr
    with make_tempdir() as d:
        file_path = d / "vocab"
        vocab1.to_disk(file_path)
        vocab2 = vocab2.from_disk(file_path)
    assert vocab2[strings[0]].norm_ == lex_attr


@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_stringstore_roundtrip_bytes(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    sstore1_b = sstore1.to_bytes()
    sstore2_b = sstore2.to_bytes()
    if set(strings1) == set(strings2):
        assert sstore1_b == sstore2_b
    else:
        assert sstore1_b != sstore2_b
    sstore1 = sstore1.from_bytes(sstore1_b)
    assert sstore1.to_bytes() == sstore1_b
    new_sstore1 = StringStore().from_bytes(sstore1_b)
    assert new_sstore1.to_bytes() == sstore1_b
    assert set(new_sstore1) == set(strings1)


@pytest.mark.parametrize("strings1,strings2", test_strings)
def test_serialize_stringstore_roundtrip_disk(strings1, strings2):
    sstore1 = StringStore(strings=strings1)
    sstore2 = StringStore(strings=strings2)
    with make_tempdir() as d:
        file_path1 = d / "strings1"
        file_path2 = d / "strings2"
        sstore1.to_disk(file_path1)
        sstore2.to_disk(file_path2)
        sstore1_d = StringStore().from_disk(file_path1)
        sstore2_d = StringStore().from_disk(file_path2)
        assert set(sstore1_d) == set(sstore1)
        assert set(sstore2_d) == set(sstore2)
        if set(strings1) == set(strings2):
            assert set(sstore1_d) == set(sstore2_d)
        else:
            assert set(sstore1_d) != set(sstore2_d)


@pytest.mark.parametrize("strings,lex_attr", test_strings_attrs)
def test_pickle_vocab(strings, lex_attr):
    vocab = Vocab(strings=strings)
    ops = get_current_ops()
    vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
    vocab.vectors = vectors
    vocab[strings[0]].norm_ = lex_attr
    vocab_pickled = pickle.dumps(vocab)
    vocab_unpickled = pickle.loads(vocab_pickled)
    assert vocab.to_bytes() == vocab_unpickled.to_bytes()
    assert vocab_unpickled.vectors.mode == "floret"
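

# Not part of the original file: a hedged companion sketch. The commit
# message states that `Vectors` serializes its config as `vectors.cfg`,
# so the floret mode should also survive a disk roundtrip of the vocab.
def test_vocab_floret_vectors_roundtrip_disk_sketch():
    ops = get_current_ops()
    vocab1 = Vocab()
    vocab1.vectors = Vectors(data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1)
    with make_tempdir() as d:
        file_path = d / "vocab"
        vocab1.to_disk(file_path)
        vocab2 = Vocab().from_disk(file_path)
    assert vocab2.vectors.mode == "floret"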