From 79ef6cf0f9ca75468457c86d0d6fd0d8709a9308 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 2 Feb 2023 11:15:22 +0100
Subject: [PATCH] Have logging calls use string formatting types (#12215)

* change logging call for spacy.LookupsDataLoader.v1

* substitutions in language and _util

* various more substitutions

* add string formatting guidelines to contribution guidelines

---
 CONTRIBUTING.md              |  5 +++++
 spacy/cli/_util.py           |  4 ++--
 spacy/cli/project/pull.py    |  9 ++++++---
 spacy/cli/project/push.py    |  8 ++++----
 spacy/language.py            |  4 ++--
 spacy/tests/test_language.py |  2 +-
 spacy/training/callbacks.py  |  4 ++--
 spacy/training/corpus.py     |  2 +-
 spacy/training/initialize.py | 25 +++++++++++++------------
 spacy/training/loop.py       |  2 +-
 10 files changed, 37 insertions(+), 28 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 1f396bd71..f6f6dab59 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
 Python modules. If you've built spaCy from source, you'll already have both
 tools installed.
 
+As a general rule of thumb, we use f-strings for any formatting of strings.
+One exception is calls to Python's `logging` functionality.
+To avoid unnecessary string conversions in these cases, we use string formatting
+templates with `%s`, `%d`, etc.
+
 **⚠️ Note that formatting and linting is currently only possible for Python
 modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
 
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index ba3892b1d..f104feff9 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -90,9 +90,9 @@ def parse_config_overrides(
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]
-        logger.debug(f"Config overrides from CLI: {keys}")
+        logger.debug("Config overrides from CLI: %s", keys)
     if env_overrides:
-        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+        logger.debug("Config overrides from env variables: %s", list(env_overrides))
     return {**cli_overrides, **env_overrides}
 
 
diff --git a/spacy/cli/project/pull.py b/spacy/cli/project/pull.py
index 6e3cde88c..8894baa50 100644
--- a/spacy/cli/project/pull.py
+++ b/spacy/cli/project/pull.py
@@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
     # in the list.
     while commands:
         for i, cmd in enumerate(list(commands)):
-            logger.debug(f"CMD: {cmd['name']}.")
+            logger.debug("CMD: %s.", cmd["name"])
             deps = [project_dir / dep for dep in cmd.get("deps", [])]
             if all(dep.exists() for dep in deps):
                 cmd_hash = get_command_hash("", "", deps, cmd["script"])
                 for output_path in cmd.get("outputs", []):
                     url = storage.pull(output_path, command_hash=cmd_hash)
                     logger.debug(
-                        f"URL: {url} for {output_path} with command hash {cmd_hash}"
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
                     )
                     yield url, output_path
 
@@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                 commands.pop(i)
                 break
             else:
-                logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
         else:
             # If we didn't break the for loop, break the while loop.
             break
diff --git a/spacy/cli/project/push.py b/spacy/cli/project/push.py
index bc779e9cd..a8178de21 100644
--- a/spacy/cli/project/push.py
+++ b/spacy/cli/project/push.py
@@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
     remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
     for cmd in config.get("commands", []):
-        logger.debug(f"CMD: cmd['name']")
+        logger.debug("CMD: %s", cmd["name"])
         deps = [project_dir / dep for dep in cmd.get("deps", [])]
         if any(not dep.exists() for dep in deps):
-            logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
             continue
         cmd_hash = get_command_hash(
             "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
         )
-        logger.debug(f"CMD_HASH: {cmd_hash}")
+        logger.debug("CMD_HASH: %s", cmd_hash)
         for output_path in cmd.get("outputs", []):
             output_loc = project_dir / output_path
             if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
                     content_hash=get_content_hash(output_loc),
                 )
                 logger.debug(
-                    f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                 )
                 yield output_path, url
 
diff --git a/spacy/language.py b/spacy/language.py
index e0abfd5e7..9fdcf6328 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
 
 @registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
-    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
     lookups = load_lookups(lang=lang, tables=tables)
     return lookups
 
@@ -1969,7 +1969,7 @@ class Language:
             pipe = self.get_pipe(pipe_name)
             pipe_cfg = self._pipe_configs[pipe_name]
             if listeners:
-                util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+                util.logger.debug("Replacing listeners of component '%s'", pipe_name)
                 if len(list(listeners)) != len(pipe_listeners):
                     # The number of listeners defined in the component model doesn't
                     # match the listeners to replace, so we won't be able to update
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 03790eb86..236856dad 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -46,7 +46,7 @@ def assert_sents_error(doc):
 
 def warn_error(proc_name, proc, docs, e):
     logger = logging.getLogger("spacy")
-    logger.warning(f"Trouble with component {proc_name}.")
+    logger.warning("Trouble with component %s.", proc_name)
 
 
 @pytest.fixture
diff --git a/spacy/training/callbacks.py b/spacy/training/callbacks.py
index 426fddf90..7e2494f5b 100644
--- a/spacy/training/callbacks.py
+++ b/spacy/training/callbacks.py
@@ -11,7 +11,7 @@ def create_copy_from_base_model(
 ) -> Callable[[Language], Language]:
     def copy_from_base_model(nlp):
         if tokenizer:
-            logger.info(f"Copying tokenizer from: {tokenizer}")
+            logger.info("Copying tokenizer from: %s", tokenizer)
             base_nlp = load_model(tokenizer)
             if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                 nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@@ -23,7 +23,7 @@ def create_copy_from_base_model(
                 )
             )
         if vocab:
-            logger.info(f"Copying vocab from: {vocab}")
+            logger.info("Copying vocab from: %s", vocab)
             # only reload if the vocab is from a different model
             if tokenizer != vocab:
                 base_nlp = load_model(vocab)
diff --git a/spacy/training/corpus.py b/spacy/training/corpus.py
index d626ad0e0..086ad831c 100644
--- a/spacy/training/corpus.py
+++ b/spacy/training/corpus.py
@@ -29,7 +29,7 @@ def create_docbin_reader(
 ) -> Callable[["Language"], Iterable[Example]]:
     if path is None:
         raise ValueError(Errors.E913)
-    util.logger.debug(f"Loading corpus from path: {path}")
+    util.logger.debug("Loading corpus from path: %s", path)
     return Corpus(
         path,
         gold_preproc=gold_preproc,
diff --git a/spacy/training/initialize.py b/spacy/training/initialize.py
index 6304e4a84..e90617852 100644
--- a/spacy/training/initialize.py
+++ b/spacy/training/initialize.py
@@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info("Pipeline: %s", nlp.pipe_names)
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            logger.info("Resuming training for: %s", resume_components)
             nlp.resume_training(sgd=optimizer)
     # Make sure that listeners are defined before initializing further
     nlp._link_components()
@@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         if T["max_epochs"] == -1:
             sample_size = 100
             logger.debug(
-                f"Due to streamed train corpus, using only first {sample_size} "
-                f"examples for initialization. If necessary, provide all labels "
-                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+                "Due to streamed train corpus, using only first %s examples for initialization. "
+                "If necessary, provide all labels in [initialize]. "
+                "More info: https://spacy.io/api/cli#init_labels",
+                sample_size,
             )
             nlp.initialize(
                 lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
             )
         else:
             nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    logger.info("Initialized pipeline components: %s", nlp.pipe_names)
     # Detect components with listeners that are not frozen consistently
     for name, proc in nlp.pipeline:
         for listener in getattr(
@@ -109,7 +110,7 @@ def init_vocab(
 ) -> None:
     if lookups:
         nlp.vocab.lookups = lookups
-        logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
+        logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
     data_path = ensure_path(data)
     if data_path is not None:
         lex_attrs = srsly.read_jsonl(data_path)
@@ -125,11 +126,11 @@ def init_vocab(
     else:
         oov_prob = DEFAULT_OOV_PROB
     nlp.vocab.cfg.update({"oov_prob": oov_prob})
-    logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+    logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
     logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        logger.info(f"Added vectors: {vectors}")
+        logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
     vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
@@ -191,7 +192,7 @@ def init_tok2vec(
     if weights_data is not None:
         layer = get_tok2vec_ref(nlp, P)
         layer.from_bytes(weights_data)
-        logger.info(f"Loaded pretrained weights from {init_tok2vec}")
+        logger.info("Loaded pretrained weights from %s", init_tok2vec)
         return True
     return False
 
@@ -216,13 +217,13 @@ def convert_vectors(
         nlp.vocab.deduplicate_vectors()
     else:
         if vectors_loc:
-            logger.info(f"Reading vectors from {vectors_loc}")
+            logger.info("Reading vectors from %s", vectors_loc)
             vectors_data, vector_keys, floret_settings = read_vectors(
                 vectors_loc,
                 truncate,
                 mode=mode,
             )
-            logger.info(f"Loaded vectors from {vectors_loc}")
+            logger.info("Loaded vectors from %s", vectors_loc)
         else:
             vectors_data, vector_keys = (None, None)
         if vector_keys is not None and mode != VectorsMode.floret:
diff --git a/spacy/training/loop.py b/spacy/training/loop.py
index 885257772..eca40e3d9 100644
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
         if subdir.exists():
             try:
                 shutil.rmtree(str(subdir))
-                logger.debug(f"Removed existing output directory: {subdir}")
+                logger.debug("Removed existing output directory: %s", subdir)
             except Exception as e:
                 raise IOError(Errors.E901.format(path=path)) from e