Have logging calls use string formatting types (#12215)

* change logging call for spacy.LookupsDataLoader.v1

* substitutions in language and _util

* various more substitutions

* add string formatting guidelines to contribution guidelines
This commit is contained in:
Sofie Van Landeghem 2023-02-02 11:15:22 +01:00 committed by GitHub
parent 4c60afb946
commit 79ef6cf0f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 37 additions and 28 deletions

View File

@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
Python modules. If you've built spaCy from source, you'll already have both
tools installed.
As a general rule of thumb, we use f-strings for any formatting of strings.
One exception are calls to Python's `logging` functionality.
To avoid unnecessary string conversions in these cases, we use string formatting
templates with `%s` and `%d` etc.
**⚠️ Note that formatting and linting is currently only possible for Python
modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**

View File

@ -90,9 +90,9 @@ def parse_config_overrides(
cli_overrides = _parse_overrides(args, is_cli=True)
if cli_overrides:
keys = [k for k in cli_overrides if k not in env_overrides]
logger.debug(f"Config overrides from CLI: {keys}")
logger.debug("Config overrides from CLI: %s", keys)
if env_overrides:
logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
logger.debug("Config overrides from env variables: %s", list(env_overrides))
return {**cli_overrides, **env_overrides}

View File

@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
# in the list.
while commands:
for i, cmd in enumerate(list(commands)):
logger.debug(f"CMD: {cmd['name']}.")
logger.debug("CMD: %s.", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if all(dep.exists() for dep in deps):
cmd_hash = get_command_hash("", "", deps, cmd["script"])
for output_path in cmd.get("outputs", []):
url = storage.pull(output_path, command_hash=cmd_hash)
logger.debug(
f"URL: {url} for {output_path} with command hash {cmd_hash}"
"URL: %s for %s with command hash %s",
url,
output_path,
cmd_hash,
)
yield url, output_path
@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
commands.pop(i)
break
else:
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
else:
# If we didn't break the for loop, break the while loop.
break

View File

@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
remote = config["remotes"][remote]
storage = RemoteStorage(project_dir, remote)
for cmd in config.get("commands", []):
logger.debug(f"CMD: cmd['name']")
logger.debug("CMD: %s", cmd["name"])
deps = [project_dir / dep for dep in cmd.get("deps", [])]
if any(not dep.exists() for dep in deps):
logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
continue
cmd_hash = get_command_hash(
"", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
)
logger.debug(f"CMD_HASH: {cmd_hash}")
logger.debug("CMD_HASH: %s", cmd_hash)
for output_path in cmd.get("outputs", []):
output_loc = project_dir / output_path
if output_loc.exists() and _is_not_empty_dir(output_loc):
@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
content_hash=get_content_hash(output_loc),
)
logger.debug(
f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
"URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
)
yield output_path, url

View File

@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
@registry.misc("spacy.LookupsDataLoader.v1")
def load_lookups_data(lang, tables):
util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
lookups = load_lookups(lang=lang, tables=tables)
return lookups
@ -1969,7 +1969,7 @@ class Language:
pipe = self.get_pipe(pipe_name)
pipe_cfg = self._pipe_configs[pipe_name]
if listeners:
util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
util.logger.debug("Replacing listeners of component '%s'", pipe_name)
if len(list(listeners)) != len(pipe_listeners):
# The number of listeners defined in the component model doesn't
# match the listeners to replace, so we won't be able to update

View File

@ -46,7 +46,7 @@ def assert_sents_error(doc):
def warn_error(proc_name, proc, docs, e):
logger = logging.getLogger("spacy")
logger.warning(f"Trouble with component {proc_name}.")
logger.warning("Trouble with component %s.", proc_name)
@pytest.fixture

View File

@ -11,7 +11,7 @@ def create_copy_from_base_model(
) -> Callable[[Language], Language]:
def copy_from_base_model(nlp):
if tokenizer:
logger.info(f"Copying tokenizer from: {tokenizer}")
logger.info("Copying tokenizer from: %s", tokenizer)
base_nlp = load_model(tokenizer)
if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@ -23,7 +23,7 @@ def create_copy_from_base_model(
)
)
if vocab:
logger.info(f"Copying vocab from: {vocab}")
logger.info("Copying vocab from: %s", vocab)
# only reload if the vocab is from a different model
if tokenizer != vocab:
base_nlp = load_model(vocab)

View File

@ -29,7 +29,7 @@ def create_docbin_reader(
) -> Callable[["Language"], Iterable[Example]]:
if path is None:
raise ValueError(Errors.E913)
util.logger.debug(f"Loading corpus from path: {path}")
util.logger.debug("Loading corpus from path: %s", path)
return Corpus(
path,
gold_preproc=gold_preproc,

View File

@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
frozen_components = T["frozen_components"]
# Sourced components that require resume_training
resume_components = [p for p in sourced if p not in frozen_components]
logger.info(f"Pipeline: {nlp.pipe_names}")
logger.info("Pipeline: %s", nlp.pipe_names)
if resume_components:
with nlp.select_pipes(enable=resume_components):
logger.info(f"Resuming training for: {resume_components}")
logger.info("Resuming training for: %s", resume_components)
nlp.resume_training(sgd=optimizer)
# Make sure that listeners are defined before initializing further
nlp._link_components()
@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
if T["max_epochs"] == -1:
sample_size = 100
logger.debug(
f"Due to streamed train corpus, using only first {sample_size} "
f"examples for initialization. If necessary, provide all labels "
f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
"Due to streamed train corpus, using only first %s examples for initialization. "
"If necessary, provide all labels in [initialize]. "
"More info: https://spacy.io/api/cli#init_labels",
sample_size,
)
nlp.initialize(
lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
)
else:
nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
logger.info("Initialized pipeline components: %s", nlp.pipe_names)
# Detect components with listeners that are not frozen consistently
for name, proc in nlp.pipeline:
for listener in getattr(
@ -109,7 +110,7 @@ def init_vocab(
) -> None:
if lookups:
nlp.vocab.lookups = lookups
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
data_path = ensure_path(data)
if data_path is not None:
lex_attrs = srsly.read_jsonl(data_path)
@ -125,11 +126,11 @@ def init_vocab(
else:
oov_prob = DEFAULT_OOV_PROB
nlp.vocab.cfg.update({"oov_prob": oov_prob})
logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
logger.info("Created vocabulary")
if vectors is not None:
load_vectors_into_model(nlp, vectors)
logger.info(f"Added vectors: {vectors}")
logger.info("Added vectors: %s", vectors)
# warn if source model vectors are not identical
sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@ -191,7 +192,7 @@ def init_tok2vec(
if weights_data is not None:
layer = get_tok2vec_ref(nlp, P)
layer.from_bytes(weights_data)
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
logger.info("Loaded pretrained weights from %s", init_tok2vec)
return True
return False
@ -216,13 +217,13 @@ def convert_vectors(
nlp.vocab.deduplicate_vectors()
else:
if vectors_loc:
logger.info(f"Reading vectors from {vectors_loc}")
logger.info("Reading vectors from %s", vectors_loc)
vectors_data, vector_keys, floret_settings = read_vectors(
vectors_loc,
truncate,
mode=mode,
)
logger.info(f"Loaded vectors from {vectors_loc}")
logger.info("Loaded vectors from %s", vectors_loc)
else:
vectors_data, vector_keys = (None, None)
if vector_keys is not None and mode != VectorsMode.floret:

View File

@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
if subdir.exists():
try:
shutil.rmtree(str(subdir))
logger.debug(f"Removed existing output directory: {subdir}")
logger.debug("Removed existing output directory: %s", subdir)
except Exception as e:
raise IOError(Errors.E901.format(path=path)) from e