Have logging calls use string formatting types (#12215)
* change logging call for spacy.LookupsDataLoader.v1
* substitutions in language and _util
* various more substitutions
* add string formatting guidelines to contribution guidelines
This commit is contained in:
parent 4c60afb946
commit 79ef6cf0f9
@@ -173,6 +173,11 @@ formatting and [`flake8`](http://flake8.pycqa.org/en/latest/) for linting its
 Python modules. If you've built spaCy from source, you'll already have both
 tools installed.
 
+As a general rule of thumb, we use f-strings for any formatting of strings.
+One exception are calls to Python's `logging` functionality.
+To avoid unnecessary string conversions in these cases, we use string formatting
+templates with `%s` and `%d` etc.
+
 **⚠️ Note that formatting and linting is currently only possible for Python
 modules in `.py` files, not Cython modules in `.pyx` and `.pxd` files.**
 
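The guideline added in the hunk above distinguishes two cases. As a minimal sketch of the intended style (the logger name and variables here are illustrative, not taken from the commit): f-strings for general-purpose string building, %-style templates with separate arguments for logging calls.

```python
import logging

logger = logging.getLogger("spacy")

component = "tok2vec"
n_docs = 1000

# General string formatting: use an f-string.
summary = f"Processed {n_docs} docs with component '{component}'"

# Logging calls: pass a %-style template and the arguments separately.
# The logging module interpolates them only if the record is actually emitted.
logger.debug("Processed %d docs with component '%s'", n_docs, component)
```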
@@ -90,9 +90,9 @@ def parse_config_overrides(
     cli_overrides = _parse_overrides(args, is_cli=True)
     if cli_overrides:
         keys = [k for k in cli_overrides if k not in env_overrides]
-        logger.debug(f"Config overrides from CLI: {keys}")
+        logger.debug("Config overrides from CLI: %s", keys)
     if env_overrides:
-        logger.debug(f"Config overrides from env variables: {list(env_overrides)}")
+        logger.debug("Config overrides from env variables: %s", list(env_overrides))
     return {**cli_overrides, **env_overrides}
 
 
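The rationale in the guideline ("avoid unnecessary string conversions") is that `logging` defers formatting of the `%s` template until a handler actually emits the record, whereas an f-string is built before `logger.debug()` is even called. A small, self-contained demonstration of the difference (hypothetical object and logger setup, not part of this commit):

```python
import logging

logging.basicConfig(level=logging.INFO)  # DEBUG records will be dropped
logger = logging.getLogger("spacy")


class Expensive:
    """Stand-in for a value whose string conversion is costly."""

    def __str__(self) -> str:
        print("str() was called")  # visible side effect for the demo
        return "expensive value"


value = Expensive()

# f-string: str(value) runs eagerly, even though the DEBUG record is discarded.
logger.debug(f"Config overrides: {value}")

# %-template: the template and argument are stored on the record; since DEBUG
# is disabled here, str(value) is never called.
logger.debug("Config overrides: %s", value)
```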
@@ -39,14 +39,17 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
     # in the list.
     while commands:
         for i, cmd in enumerate(list(commands)):
-            logger.debug(f"CMD: {cmd['name']}.")
+            logger.debug("CMD: %s.", cmd["name"])
             deps = [project_dir / dep for dep in cmd.get("deps", [])]
             if all(dep.exists() for dep in deps):
                 cmd_hash = get_command_hash("", "", deps, cmd["script"])
                 for output_path in cmd.get("outputs", []):
                     url = storage.pull(output_path, command_hash=cmd_hash)
                     logger.debug(
-                        f"URL: {url} for {output_path} with command hash {cmd_hash}"
+                        "URL: %s for %s with command hash %s",
+                        url,
+                        output_path,
+                        cmd_hash,
                     )
                     yield url, output_path
 
@@ -58,7 +61,7 @@ def project_pull(project_dir: Path, remote: str, *, verbose: bool = False):
                 commands.pop(i)
                 break
             else:
-                logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs.")
+                logger.debug("Dependency missing. Skipping %s outputs.", cmd["name"])
         else:
             # If we didn't break the for loop, break the while loop.
             break
@@ -37,15 +37,15 @@ def project_push(project_dir: Path, remote: str):
         remote = config["remotes"][remote]
     storage = RemoteStorage(project_dir, remote)
     for cmd in config.get("commands", []):
-        logger.debug(f"CMD: cmd['name']")
+        logger.debug("CMD: %s", cmd["name"])
         deps = [project_dir / dep for dep in cmd.get("deps", [])]
         if any(not dep.exists() for dep in deps):
-            logger.debug(f"Dependency missing. Skipping {cmd['name']} outputs")
+            logger.debug("Dependency missing. Skipping %s outputs", cmd["name"])
             continue
         cmd_hash = get_command_hash(
             "", "", [project_dir / dep for dep in cmd.get("deps", [])], cmd["script"]
         )
-        logger.debug(f"CMD_HASH: {cmd_hash}")
+        logger.debug("CMD_HASH: %s", cmd_hash)
         for output_path in cmd.get("outputs", []):
             output_loc = project_dir / output_path
             if output_loc.exists() and _is_not_empty_dir(output_loc):
@@ -55,7 +55,7 @@ def project_push(project_dir: Path, remote: str):
                     content_hash=get_content_hash(output_loc),
                 )
                 logger.debug(
-                    f"URL: {url} for output {output_path} with cmd_hash {cmd_hash}"
+                    "URL: %s for output %s with cmd_hash %s", url, output_path, cmd_hash
                 )
                 yield output_path, url
 
@@ -104,7 +104,7 @@ def create_tokenizer() -> Callable[["Language"], Tokenizer]:
 
 @registry.misc("spacy.LookupsDataLoader.v1")
 def load_lookups_data(lang, tables):
-    util.logger.debug(f"Loading lookups from spacy-lookups-data: {tables}")
+    util.logger.debug("Loading lookups from spacy-lookups-data: %s", tables)
     lookups = load_lookups(lang=lang, tables=tables)
     return lookups
 
@@ -1969,7 +1969,7 @@ class Language:
         pipe = self.get_pipe(pipe_name)
         pipe_cfg = self._pipe_configs[pipe_name]
         if listeners:
-            util.logger.debug(f"Replacing listeners of component '{pipe_name}'")
+            util.logger.debug("Replacing listeners of component '%s'", pipe_name)
             if len(list(listeners)) != len(pipe_listeners):
                 # The number of listeners defined in the component model doesn't
                 # match the listeners to replace, so we won't be able to update
@@ -46,7 +46,7 @@ def assert_sents_error(doc):
 
 def warn_error(proc_name, proc, docs, e):
     logger = logging.getLogger("spacy")
-    logger.warning(f"Trouble with component {proc_name}.")
+    logger.warning("Trouble with component %s.", proc_name)
 
 
 @pytest.fixture
@@ -11,7 +11,7 @@ def create_copy_from_base_model(
 ) -> Callable[[Language], Language]:
     def copy_from_base_model(nlp):
         if tokenizer:
-            logger.info(f"Copying tokenizer from: {tokenizer}")
+            logger.info("Copying tokenizer from: %s", tokenizer)
             base_nlp = load_model(tokenizer)
             if nlp.config["nlp"]["tokenizer"] == base_nlp.config["nlp"]["tokenizer"]:
                 nlp.tokenizer.from_bytes(base_nlp.tokenizer.to_bytes(exclude=["vocab"]))
@@ -23,7 +23,7 @@ def create_copy_from_base_model(
                     )
                 )
         if vocab:
-            logger.info(f"Copying vocab from: {vocab}")
+            logger.info("Copying vocab from: %s", vocab)
             # only reload if the vocab is from a different model
             if tokenizer != vocab:
                 base_nlp = load_model(vocab)
@@ -29,7 +29,7 @@ def create_docbin_reader(
 ) -> Callable[["Language"], Iterable[Example]]:
     if path is None:
         raise ValueError(Errors.E913)
-    util.logger.debug(f"Loading corpus from path: {path}")
+    util.logger.debug("Loading corpus from path: %s", path)
     return Corpus(
         path,
         gold_preproc=gold_preproc,
@@ -62,10 +62,10 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
     frozen_components = T["frozen_components"]
     # Sourced components that require resume_training
     resume_components = [p for p in sourced if p not in frozen_components]
-    logger.info(f"Pipeline: {nlp.pipe_names}")
+    logger.info("Pipeline: %s", nlp.pipe_names)
     if resume_components:
         with nlp.select_pipes(enable=resume_components):
-            logger.info(f"Resuming training for: {resume_components}")
+            logger.info("Resuming training for: %s", resume_components)
             nlp.resume_training(sgd=optimizer)
     # Make sure that listeners are defined before initializing further
     nlp._link_components()
@@ -73,16 +73,17 @@ def init_nlp(config: Config, *, use_gpu: int = -1) -> "Language":
         if T["max_epochs"] == -1:
             sample_size = 100
             logger.debug(
-                f"Due to streamed train corpus, using only first {sample_size} "
-                f"examples for initialization. If necessary, provide all labels "
-                f"in [initialize]. More info: https://spacy.io/api/cli#init_labels"
+                "Due to streamed train corpus, using only first %s examples for initialization. "
+                "If necessary, provide all labels in [initialize]. "
+                "More info: https://spacy.io/api/cli#init_labels",
+                sample_size,
             )
             nlp.initialize(
                 lambda: islice(train_corpus(nlp), sample_size), sgd=optimizer
             )
         else:
             nlp.initialize(lambda: train_corpus(nlp), sgd=optimizer)
-    logger.info(f"Initialized pipeline components: {nlp.pipe_names}")
+    logger.info("Initialized pipeline components: %s", nlp.pipe_names)
     # Detect components with listeners that are not frozen consistently
     for name, proc in nlp.pipeline:
         for listener in getattr(
|
||||||
) -> None:
|
) -> None:
|
||||||
if lookups:
|
if lookups:
|
||||||
nlp.vocab.lookups = lookups
|
nlp.vocab.lookups = lookups
|
||||||
logger.info(f"Added vocab lookups: {', '.join(lookups.tables)}")
|
logger.info("Added vocab lookups: %s", ", ".join(lookups.tables))
|
||||||
data_path = ensure_path(data)
|
data_path = ensure_path(data)
|
||||||
if data_path is not None:
|
if data_path is not None:
|
||||||
lex_attrs = srsly.read_jsonl(data_path)
|
lex_attrs = srsly.read_jsonl(data_path)
|
||||||
|
@@ -125,11 +126,11 @@ def init_vocab(
         else:
             oov_prob = DEFAULT_OOV_PROB
         nlp.vocab.cfg.update({"oov_prob": oov_prob})
-        logger.info(f"Added {len(nlp.vocab)} lexical entries to the vocab")
+        logger.info("Added %d lexical entries to the vocab", len(nlp.vocab))
     logger.info("Created vocabulary")
     if vectors is not None:
         load_vectors_into_model(nlp, vectors)
-        logger.info(f"Added vectors: {vectors}")
+        logger.info("Added vectors: %s", vectors)
     # warn if source model vectors are not identical
     sourced_vectors_hashes = nlp.meta.pop("_sourced_vectors_hashes", {})
     vectors_hash = hash(nlp.vocab.vectors.to_bytes(exclude=["strings"]))
|
||||||
if weights_data is not None:
|
if weights_data is not None:
|
||||||
layer = get_tok2vec_ref(nlp, P)
|
layer = get_tok2vec_ref(nlp, P)
|
||||||
layer.from_bytes(weights_data)
|
layer.from_bytes(weights_data)
|
||||||
logger.info(f"Loaded pretrained weights from {init_tok2vec}")
|
logger.info("Loaded pretrained weights from %s", init_tok2vec)
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@@ -216,13 +217,13 @@ def convert_vectors(
         nlp.vocab.deduplicate_vectors()
     else:
         if vectors_loc:
-            logger.info(f"Reading vectors from {vectors_loc}")
+            logger.info("Reading vectors from %s", vectors_loc)
             vectors_data, vector_keys, floret_settings = read_vectors(
                 vectors_loc,
                 truncate,
                 mode=mode,
             )
-            logger.info(f"Loaded vectors from {vectors_loc}")
+            logger.info("Loaded vectors from %s", vectors_loc)
         else:
             vectors_data, vector_keys = (None, None)
         if vector_keys is not None and mode != VectorsMode.floret:
@@ -370,6 +370,6 @@ def clean_output_dir(path: Optional[Path]) -> None:
             if subdir.exists():
                 try:
                     shutil.rmtree(str(subdir))
-                    logger.debug(f"Removed existing output directory: {subdir}")
+                    logger.debug("Removed existing output directory: %s", subdir)
                 except Exception as e:
                     raise IOError(Errors.E901.format(path=path)) from e