mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-26 18:06:29 +03:00
c053f158c5
* Add support for fasttext-bloom hash-only vectors

Overview:

* Extend `Vectors` to have two modes: `default` and `ngram`
  * `default` is the default mode and equivalent to the current `Vectors`
  * `ngram` supports the hash-only ngram tables from `fasttext-bloom`
* Extend `spacy.StaticVectors.v2` to handle both modes with no changes for `default` vectors
* Extend `spacy init vectors` to support ngram tables

The `ngram` mode **only** supports vector tables produced by this fork of fastText, which adds an option to represent all vectors using only the ngram buckets table and which uses the exact same ngram generation algorithm and hash function (`MurmurHash3_x64_128`). `fasttext-bloom` produces an additional `.hashvec` table, which can be loaded by `spacy init vectors --fasttext-bloom-vectors`.

https://github.com/adrianeboyd/fastText/tree/feature/bloom

Implementation details:

* `Vectors` now includes the `StringStore` as `Vectors.strings` so that the API can stay consistent for both `default` (which can look up from `str` or `int`) and `ngram` (which requires `str` to calculate the ngrams).
* In ngram mode `Vectors` uses a default `Vectors` object as a cache since the ngram vectors lookups are relatively expensive.
  * The default cache size is the same size as the provided ngram vector table.
  * Once the cache is full, no more entries are added. The user is responsible for managing the cache in cases where the initial documents are not representative of the texts.
  * The cache can be resized by setting `Vectors.ngram_cache_size` or cleared with `vectors._ngram_cache.clear()`.
* The API ends up a bit split between methods for `default` and for `ngram`, so functions that only make sense for `default` or `ngram` include warnings with custom messages suggesting alternatives where possible.
* `Vocab.vectors` becomes a property so that the string stores can be synced when assigning vectors to a vocab.
* `Vectors` serializes its own config settings as `vectors.cfg`.
* The `Vectors` serialization methods have added support for `exclude` so that the `Vocab` can exclude the `Vectors` strings while serializing.

Removed:

* The `minn` and `maxn` options and related code from `Vocab.get_vector`, which does not work in a meaningful way for default vector tables.
* The unused `GlobalRegistry` in `Vectors`.

* Refactor to use reduce_mean

Refactor to use reduce_mean and remove the ngram vectors cache.

* Rename to floret
* Rename to floret in error messages
* Use --vectors-mode in CLI, vector init
* Fix vectors mode in init
* Remove unused var
* Minor API and docstrings adjustments
  * Rename `--vectors-mode` to `--mode` in `init vectors` CLI
  * Rename `Vectors.get_floret_vectors` to `Vectors.get_batch` and support both modes.
  * Minor updates to Vectors docstrings.
* Update API docs for Vectors and init vectors CLI
* Update types for StaticVectors
338 lines
13 KiB
Python
338 lines
13 KiB
Python
from typing import Union, Dict, Optional, Any, IO, TYPE_CHECKING
|
|
from thinc.api import Config, fix_random_seed, set_gpu_allocator
|
|
from thinc.api import ConfigValidationError
|
|
from pathlib import Path
|
|
import srsly
|
|
import numpy
|
|
import tarfile
|
|
import gzip
|
|
import zipfile
|
|
import tqdm
|
|
from itertools import islice
|
|
import warnings
|
|
|
|
from .pretrain import get_tok2vec_ref
|
|
from ..lookups import Lookups
|
|
from ..vectors import Vectors, Mode as VectorsMode
|
|
from ..errors import Errors, Warnings
|
|
from ..schemas import ConfigSchemaTraining
|
|
from ..util import registry, load_model_from_config, resolve_dot_names, logger
|
|
from ..util import load_model, ensure_path, get_sourced_components
|
|
from ..util import OOV_RANK, DEFAULT_OOV_PROB
|
|
|
|
if TYPE_CHECKING:
|
|
from ..language import Language # noqa: F401
|
|
|
|
|
|
def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
    """Create an nlp object from a config and initialize its components.

    config (Config): The config to build the pipeline from. Its [training]
        block has to contain both "seed" and "gpu_allocator".
    use_gpu (int): The GPU allocator is only set if this is >= 0.
    RETURNS (Language): The nlp object after nlp.initialize has been called.
    """
    raw_config = config
    config = raw_config.interpolate()
    training_block = config["training"]
    # Both settings must be present, even if their value is None
    for required in ("seed", "gpu_allocator"):
        if required not in training_block:
            raise ValueError(Errors.E1015.format(value=f"[training] {required}"))
    seed = training_block["seed"]
    if seed is not None:
        fix_random_seed(seed)
    allocator = training_block["gpu_allocator"]
    if allocator and use_gpu >= 0:
        set_gpu_allocator(allocator)
    # Collect the sourced components from the interpolated config, before the
    # config is resolved to functions
    sourced = get_sourced_components(config)
    nlp = load_model_from_config(raw_config, auto_fill=True)
    logger.info("Set up nlp object from config")
    config = nlp.config.interpolate()
    # Resolve all training-relevant sections using the filled nlp config
    training = registry.resolve(config["training"], schema=ConfigSchemaTraining)
    # The corpora have to be given as dot names (strings) pointing into the config
    for corpus_key in ("train_corpus", "dev_corpus"):
        corpus_ref = training[corpus_key]
        if not isinstance(corpus_ref, str):
            raise ConfigValidationError(
                desc=Errors.E897.format(
                    field=f"training.{corpus_key}", type=type(corpus_ref)
                )
            )
    dot_names = [training["train_corpus"], training["dev_corpus"]]
    train_corpus, dev_corpus = resolve_dot_names(config, dot_names)
    optimizer = training["optimizer"]
    # Components that shouldn't be updated during training
    frozen_components = training["frozen_components"]
    # Sourced components that require resume_training
    resume_components = [
        component for component in sourced if component not in frozen_components
    ]
    logger.info(f"Pipeline: {nlp.pipe_names}")
    if resume_components:
        with nlp.select_pipes(enable=resume_components):
            logger.info(f"Resuming training for: {resume_components}")
            nlp.resume_training(sgd=optimizer)
    # Make sure that listeners are defined before initializing further
    nlp._link_components()
    with nlp.select_pipes(disable=[*frozen_components, *resume_components]):
        if training["max_epochs"] == -1:
            # Streamed corpus: only look at a fixed-size sample of examples
            sample_size = 100
            logger.debug(
                f"Due to streamed train corpus, using only first {sample_size} "
                f"examples for initialization. If necessary, provide all labels "
                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
            )

            def get_examples():
                return islice(train_corpus(nlp), sample_size)

        else:

            def get_examples():
                return train_corpus(nlp)

        nlp.initialize(get_examples, sgd=optimizer)
        logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
    # Detect components with listeners that are not frozen consistently
    for name, proc in nlp.pipeline:
        name_is_frozen = name in frozen_components
        # e.g. tok2vec/transformer
        for listener in getattr(proc, "listening_components", []):
            # Don't warn about components not in the pipeline
            if listener not in nlp.pipe_names:
                continue
            listener_is_frozen = listener in frozen_components
            if listener_is_frozen and not name_is_frozen:
                logger.warning(Warnings.W087.format(name=name, listener=listener))
            # We always check this regardless, in case user freezes tok2vec
            if (
                name_is_frozen
                and not listener_is_frozen
                and name not in training["annotating_components"]
            ):
                logger.warning(Warnings.W086.format(name=name, listener=listener))
    return nlp
|
|
|
|
|
|
def init_vocab(
    nlp: "Language",
    *,
    data: Optional[Path] = None,
    lookups: Optional[Lookups] = None,
    vectors: Optional[str] = None,
) -> None:
    """Populate the vocab of an nlp object with lookups, lexeme data and vectors.

    nlp (Language): The nlp object whose vocab is modified in place.
    data (Optional[Path]): Path to a JSONL file with lexical attributes. Rows
        containing a "settings" key are skipped; every other row is looked up
        by its "orth" value and applied with lexeme.set_attrs.
    lookups (Optional[Lookups]): Lookups to set as nlp.vocab.lookups.
    vectors (Optional[str]): Name or path passed on to load_vectors_into_model.
    """
    if lookups:
        nlp.vocab.lookups = lookups
        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
    data_path = ensure_path(data)
    if data_path is not None:
        lex_attrs = srsly.read_jsonl(data_path)
        # Reset all existing ranks before applying the attributes from the file
        for lexeme in nlp.vocab:
            lexeme.rank = OOV_RANK
        for attrs in lex_attrs:
            if "settings" in attrs:
                continue
            lexeme = nlp.vocab[attrs["orth"]]
            lexeme.set_attrs(**attrs)
        if len(nlp.vocab):
            # One below the lowest probability of any lexeme in the vocab
            oov_prob = min(lex.prob for lex in nlp.vocab) - 1
        else:
            oov_prob = DEFAULT_OOV_PROB
        nlp.vocab.cfg.update({"oov_prob": oov_prob})
        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
    logger.info("Created vocabulary")
    if vectors is not None:
        load_vectors_into_model(nlp, vectors)
        logger.info(f"Added vectors: {vectors}")
    # warn if source model vectors are not identical
    sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
    # Serializing the vectors table only to hash it is wasted work when there
    # is nothing to compare against, so skip it in that case
    if sourced_vectors_hashes:
        vectors_hash = hash(nlp.vocab.vectors.to_bytes())
        for sourced_component, sourced_vectors_hash in sourced_vectors_hashes.items():
            if vectors_hash != sourced_vectors_hash:
                warnings.warn(Warnings.W113.format(name=sourced_component))
    logger.info("Finished initializing nlp object")
|
|
|
|
|
|
def load_vectors_into_model(
    nlp: "Language", name: Union[str, Path], *, add_strings: bool = True
) -> None:
    """Load word vectors from an installed model or path into a model instance.

    nlp (Language): The nlp object that receives the vectors.
    name (Union[str, Path]): Package name or path handed to load_model.
    add_strings (bool): If False, "strings" is excluded when loading.
    """
    # Exclude lookups so they are not modified.
    exclude = ["lookups"] if add_strings else ["lookups", "strings"]
    try:
        # Load with the same vocab, which automatically adds the vectors to
        # the current nlp object.
        vectors_nlp = load_model(name, vocab=nlp.vocab, exclude=exclude)
    except ConfigValidationError as e:
        title = f"Config validation error for vectors {name}"
        desc = (
            "This typically means that there's a problem in the config.cfg included "
            "with the packaged vectors. Make sure that the vectors package you're "
            "loading is compatible with the current version of spaCy."
        )
        raise ConfigValidationError.from_error(e, title=title, desc=desc) from None

    # Warn about empty tables: in floret mode the data rows are checked, in
    # every other mode the keys are
    loaded = vectors_nlp.vocab.vectors
    if (len(loaded.keys()) == 0 and loaded.mode != VectorsMode.floret) or (
        loaded.data.shape[0] == 0 and loaded.mode == VectorsMode.floret
    ):
        logger.warning(Warnings.W112.format(name=name))

    # Each lexeme's rank becomes its row in the vectors table, or OOV_RANK
    # if the lexeme has no entry in key2row
    row_for = nlp.vocab.vectors.key2row.get  # type: ignore[attr-defined]
    for lex in nlp.vocab:
        lex.rank = row_for(lex.orth, OOV_RANK)
|
|
|
|
|
|
def init_tok2vec(
    nlp: "Language", pretrain_config: Dict[str, Any], init_config: Dict[str, Any]
) -> bool:
    """Load pretrained tok2vec weights - cf. CLI command 'pretrain'.

    nlp (Language): The nlp object containing the layer to load into.
    pretrain_config (Dict[str, Any]): Passed to get_tok2vec_ref to find the layer.
    init_config (Dict[str, Any]): Its "init_tok2vec" entry is the path of the
        weights file, or None if there are no weights to load.
    RETURNS (bool): True if weights were loaded, False if no path was set.
    RAISES (ConfigValidationError): If the configured path doesn't exist.
    """
    weights_path = ensure_path(init_config["init_tok2vec"])
    if weights_path is None:
        # Nothing configured, nothing to load
        return False
    if not weights_path.exists():
        err = f"can't find pretrained tok2vec: {weights_path}"
        errors = [{"loc": ["initialize", "init_tok2vec"], "msg": err}]
        raise ConfigValidationError(config=nlp.config, errors=errors)
    with weights_path.open("rb") as file_:
        weights_data = file_.read()
    layer = get_tok2vec_ref(nlp, pretrain_config)
    layer.from_bytes(weights_data)
    logger.info(f"Loaded pretrained weights from {weights_path}")
    return True
|
|
|
|
|
|
def convert_vectors(
    nlp: "Language",
    vectors_loc: Optional[Path],
    *,
    truncate: int,
    prune: int,
    name: Optional[str] = None,
    mode: str = VectorsMode.default,
) -> None:
    """Read a vectors file and install it as nlp.vocab.vectors.

    nlp (Language): The nlp object whose vocab receives the vectors.
    vectors_loc (Optional[Path]): Location of the vectors. A file name ending
        in ".npz" is loaded with numpy.load, anything else with read_vectors.
    truncate (int): Passed on to read_vectors.
    prune (int): If >= 1 (and not in floret mode), passed to
        nlp.vocab.prune_vectors.
    name (Optional[str]): Name for the vectors table. If None, a name is
        built from nlp.meta["lang"] and nlp.meta["name"].
    mode (str): The vectors mode.
    """
    vectors_loc = ensure_path(vectors_loc)
    is_floret = mode == VectorsMode.floret
    if vectors_loc and vectors_loc.parts[-1].endswith(".npz"):
        nlp.vocab.vectors = Vectors(
            strings=nlp.vocab.strings, data=numpy.load(vectors_loc.open("rb"))
        )
        # Register every lexeme with a usable rank under that row
        for lex in nlp.vocab:
            if lex.rank and lex.rank != OOV_RANK:
                nlp.vocab.vectors.add(lex.orth, row=lex.rank)  # type: ignore[attr-defined]
    else:
        vectors_data = None
        vector_keys = None
        if vectors_loc:
            logger.info(f"Reading vectors from {vectors_loc}")
            vectors_data, vector_keys, floret_settings = read_vectors(
                vectors_loc, truncate, mode=mode
            )
            logger.info(f"Loaded vectors from {vectors_loc}")
        if vector_keys is not None and not is_floret:
            for word in vector_keys:
                if word not in nlp.vocab:
                    # Bare lookup kept for its effect on the vocab
                    # (presumably creates the missing entry)
                    nlp.vocab[word]
        if vectors_data is not None:
            # floret tables are described by their settings, default tables
            # by their keys
            table_kwargs = floret_settings if is_floret else {"keys": vector_keys}
            nlp.vocab.vectors = Vectors(
                strings=nlp.vocab.strings, data=vectors_data, **table_kwargs
            )
    # TODO: Is the generated fallback name correct? Does this matter?
    nlp.vocab.vectors.name = (
        name if name is not None else f"{nlp.meta['lang']}_{nlp.meta['name']}.vectors"
    )
    nlp.meta["vectors"]["name"] = nlp.vocab.vectors.name
    if prune >= 1 and not is_floret:
        nlp.vocab.prune_vectors(prune)
|
|
|
|
|
|
def read_vectors(
    vectors_loc: Path, truncate_vectors: int, *, mode: str = VectorsMode.default
):
    """Read a text vectors file into a numpy array plus its keys.

    vectors_loc (Path): The vectors file. It's read through ensure_shape, so
        the first line seen here is always a header.
    truncate_vectors (int): If >= 1, stop after this many rows. Not supported
        in floret mode.
    mode (str): The vectors mode. In floret mode the header must have 8
        fields (bucket dim minn maxn hash_count hash_seed BOW EOW), otherwise
        exactly 2 (rows dim).
    RETURNS (tuple): The vectors data (float array), the list of keys in row
        order, and the floret settings dict (empty unless in floret mode).
    RAISES (ValueError): On an invalid header, on truncation in floret mode
        or if a row doesn't have the expected number of values.
    """
    f = ensure_shape(vectors_loc)
    header_parts = next(f).split()
    shape = tuple(int(size) for size in header_parts[:2])
    floret_settings = {}
    if mode == VectorsMode.floret:
        if len(header_parts) != 8:
            raise ValueError(
                "Invalid header for floret vectors. "
                "Expected: bucket dim minn maxn hash_count hash_seed BOW EOW"
            )
        floret_settings = {
            "mode": "floret",
            "minn": int(header_parts[2]),
            "maxn": int(header_parts[3]),
            "hash_count": int(header_parts[4]),
            "hash_seed": int(header_parts[5]),
            "bow": header_parts[6],
            "eow": header_parts[7],
        }
        if truncate_vectors >= 1:
            raise ValueError(Errors.E860)
    else:
        # Raise explicitly instead of asserting: asserts are stripped under
        # -O, which would let a malformed header through
        if len(header_parts) != 2:
            raise ValueError("Invalid header for vectors. Expected: rows dim")
        if truncate_vectors >= 1:
            # Never allocate more rows than the file declares, otherwise a
            # large truncate value pads the table with unused zero rows
            shape = (min(truncate_vectors, shape[0]), shape[1])
    vectors_data = numpy.zeros(shape=shape, dtype="f")
    vectors_keys = []
    width = vectors_data.shape[1]
    for i, line in enumerate(tqdm.tqdm(f)):
        line = line.rstrip()
        # Split from the right so that a key containing spaces stays intact
        pieces = line.rsplit(" ", width)
        word = pieces.pop(0)
        if len(pieces) != width:
            raise ValueError(Errors.E094.format(line_num=i, loc=vectors_loc))
        vectors_data[i] = numpy.asarray(pieces, dtype="f")
        vectors_keys.append(word)
        if i == truncate_vectors - 1:
            break
    return vectors_data, vectors_keys, floret_settings
|
|
|
|
|
|
def open_file(loc: Union[str, Path]) -> IO:
    """Handle .gz, .tar.gz or unzipped files.

    loc (Union[str, Path]): The file to open.
    RETURNS (IO): For names ending in "gz" or "zip", a generator of lines
        decoded as UTF-8 (for zip archives, the lines of the first member).
        For tar archives, the opened TarFile. Otherwise the file opened in
        text mode with UTF-8 encoding.
    """
    loc = ensure_path(loc)
    if tarfile.is_tarfile(str(loc)):
        # NOTE(review): this hands back the TarFile itself, and iterating a
        # TarFile yields archive members rather than text lines - confirm
        # whether any caller can actually consume this branch.
        return tarfile.open(str(loc), "r:gz")  # type: ignore[return-value]
    file_name = loc.parts[-1]
    if file_name.endswith("gz"):
        # Open eagerly so that a missing file still fails right here
        return _iter_decoded_lines(gzip.open(str(loc), "r"))  # type: ignore[return-value]
    if file_name.endswith("zip"):
        zip_file = zipfile.ZipFile(str(loc))
        try:
            file_ = zip_file.open(zip_file.namelist()[0])
        except Exception:
            # Don't leave the archive open if it has no (readable) member
            zip_file.close()
            raise
        return _iter_decoded_lines(file_, zip_file)  # type: ignore[return-value]
    return loc.open("r", encoding="utf8")


def _iter_decoded_lines(handle, *owners):
    """Yield the lines of a binary handle decoded as UTF-8, then close it.

    The handle and any owners (e.g. the archive a member was opened from)
    are closed once the generator is exhausted or closed.
    """
    try:
        for line in handle:
            yield line.decode("utf8")
    finally:
        handle.close()
        for owner in owners:
            owner.close()
|
|
|
|
|
|
def ensure_shape(vectors_loc):
    """Ensure that the first line of the data is the vectors shape.

    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.

    vectors_loc: The vectors file, opened with open_file.
    YIELDS (str): A header line, followed by the lines of the file.
    RAISES (ValueError): If the file contains no lines at all.
    """
    lines = open_file(vectors_loc)
    # A bare next() would leak StopIteration out of this generator, which
    # Python turns into an opaque RuntimeError for the caller
    first_line = next(lines, None)
    if first_line is None:
        raise ValueError(f"Can't read vectors from empty file: {vectors_loc}")
    try:
        shape = tuple(int(size) for size in first_line.split()[:2])
    except ValueError:
        # The first line starts with something that isn't an int, so it's
        # treated as a data row rather than a header
        shape = None
    if shape is not None:
        # All good, give the data
        yield first_line
        yield from lines
    else:
        # Figure out the shape, make it the first value, and then give the
        # rest of the data.
        width = len(first_line.split()) - 1
        length = 1 + sum(1 for _ in lines)
        # The counting pass is finished, so release its handle before the
        # file is opened a second time
        if hasattr(lines, "close"):
            lines.close()
        yield f"{length} {width}"
        # Reading the lines in again from file. This to avoid having to
        # store all the results in a list in memory
        yield from open_file(vectors_loc)
|