From 64d2d27c5dbf8e5657187975d2c9627f30e108a2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 22 Dec 2022 10:53:16 +0100 Subject: [PATCH 1/7] Add classifier for python 3.11 (#12013) --- setup.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.cfg b/setup.cfg index cf6e6f84b..d290d706c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -22,6 +22,7 @@ classifiers = Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 + Programming Language :: Python :: 3.11 Topic :: Scientific/Engineering project_urls = Release notes = https://github.com/explosion/spaCy/releases From 90896504a5dba1babac04a2b88662179409ae006 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 23 Dec 2022 12:44:07 +0100 Subject: [PATCH 2/7] Auto-format code with black (#12019) Co-authored-by: explosion-bot --- spacy/cli/apply.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py index 9d170bc95..f0df4e757 100644 --- a/spacy/cli/apply.py +++ b/spacy/cli/apply.py @@ -53,9 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]: """ for entry in srsly.read_jsonl(path): if field not in entry: - msg.fail( - f"{path} does not contain the required '{field}' field.", exits=1 - ) + msg.fail(f"{path} does not contain the required '{field}' field.", exits=1) else: yield entry[field] @@ -118,8 +116,10 @@ def apply( paths = walk_directory(data_path) if len(paths) == 0: docbin.to_disk(output_file) - msg.warn("Did not find data to process," - f" {data_path} seems to be an empty directory.") + msg.warn( + "Did not find data to process," + f" {data_path} seems to be an empty directory." 
+ ) return nlp = load_model(model) msg.good(f"Loaded model {model}") From aa2b471a6e289d1c1bb51558df779ae028671225 Mon Sep 17 00:00:00 2001 From: Madeesh Kannan Date: Fri, 23 Dec 2022 15:21:44 +0100 Subject: [PATCH 3/7] New console logger with expanded progress tracking (#11972) * Add `ConsoleLogger.v3` This addition expands the progress bar feature to count up the training/distillation steps to either the next evaluation pass or the maximum number of steps. * Rename progress bar types * Add defaults to docs Minor fixes * Move comment * Minor punctuation fixes * Explicitly check for `None` when validating progress bar type Co-authored-by: Paul O'Leary McCann --- spacy/errors.py | 1 + spacy/training/loggers.py | 48 ++++++++++++++++++++++++++++++++--- website/docs/api/top-level.md | 34 ++++++++++++++++++++----- 3 files changed, 74 insertions(+), 9 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 0e5ef91ed..cd9281e91 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -962,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes): E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default " "knowledge base, use `InMemoryLookupKB`.") E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.") + E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py index 408ea7140..7de31822e 100644 --- a/spacy/training/loggers.py +++ b/spacy/training/loggers.py @@ -26,6 +26,8 @@ def setup_table( return final_cols, final_widths, ["r" for _ in final_widths] +# We cannot rename this method as it's directly imported +# and used by external packages such as spacy-loggers. 
@registry.loggers("spacy.ConsoleLogger.v2") def console_logger( progress_bar: bool = False, @@ -33,7 +35,27 @@ def console_logger( output_file: Optional[Union[str, Path]] = None, ): """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file. - progress_bar (bool): Whether the logger should print the progress bar. + progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass. + console_output (bool): Whether the logger should print the logs on the console. + output_file (Optional[Union[str, Path]]): The file to save the training logs to. + """ + return console_logger_v3( + progress_bar=None if progress_bar is False else "eval", + console_output=console_output, + output_file=output_file, + ) + + +@registry.loggers("spacy.ConsoleLogger.v3") +def console_logger_v3( + progress_bar: Optional[str] = None, + console_output: bool = True, + output_file: Optional[Union[str, Path]] = None, +): + """The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file. + progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values: + train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached). + eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached). console_output (bool): Whether the logger should print the logs on the console. output_file (Optional[Union[str, Path]]): The file to save the training logs to. 
""" @@ -70,6 +92,7 @@ def console_logger( for name, proc in nlp.pipeline if hasattr(proc, "is_trainable") and proc.is_trainable ] + max_steps = nlp.config["training"]["max_steps"] eval_frequency = nlp.config["training"]["eval_frequency"] score_weights = nlp.config["training"]["score_weights"] score_cols = [col for col, value in score_weights.items() if value is not None] @@ -84,6 +107,13 @@ def console_logger( write(msg.row(table_header, widths=table_widths, spacing=spacing)) write(msg.row(["-" * width for width in table_widths], spacing=spacing)) progress = None + expected_progress_types = ("train", "eval") + if progress_bar is not None and progress_bar not in expected_progress_types: + raise ValueError( + Errors.E1048.format( + unexpected=progress_bar, expected=expected_progress_types + ) + ) def log_step(info: Optional[Dict[str, Any]]) -> None: nonlocal progress @@ -141,11 +171,23 @@ def console_logger( ) ) if progress_bar: + if progress_bar == "train": + total = max_steps + desc = f"Last Eval Epoch: {info['epoch']}" + initial = info["step"] + else: + total = eval_frequency + desc = f"Epoch {info['epoch']+1}" + initial = 0 # Set disable=None, so that it disables on non-TTY progress = tqdm.tqdm( - total=eval_frequency, disable=None, leave=False, file=stderr + total=total, + disable=None, + leave=False, + file=stderr, + initial=initial, ) - progress.set_description(f"Epoch {info['epoch']+1}") + progress.set_description(desc) def finalize() -> None: if output_stream: diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 26a5d42f4..883c5e3b9 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -513,7 +513,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard. Instead of using one of the built-in loggers, you can [implement your own](/usage/training#custom-logging). 
-#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"} +#### spacy.ConsoleLogger.v2 {tag="registered function"} > #### Example config > @@ -564,11 +564,33 @@ start decreasing across epochs. -| Name | Description | -| ---------------- | --------------------------------------------------------------------- | -| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ | -| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ | -| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ | +| Name | Description | +| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- | +| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ | +| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ | +| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ | + +#### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"} + +> #### Example config +> +> ```ini +> [training.logger] +> @loggers = "spacy.ConsoleLogger.v3" +> progress_bar = "eval" +> console_output = true +> output_file = "training_log.jsonl" +> ``` + +Writes the results of a training step to the console in a tabular format and +optionally saves them to a `jsonl` file. + +| Name | Description | +| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `progress_bar` | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. | +| | The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). 
~~Optional[str]~~ | +| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ | +| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ | ## Readers {#readers} From 933b54ac798a7d64f9cde4d85b55556e84e44bd6 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Mon, 26 Dec 2022 13:26:35 +0100 Subject: [PATCH 4/7] typo fix (#11995) --- spacy/pipeline/span_ruler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py index 807a4ffe5..0e7e9ebf7 100644 --- a/spacy/pipeline/span_ruler.py +++ b/spacy/pipeline/span_ruler.py @@ -170,7 +170,7 @@ def prioritize_existing_ents_filter( @registry.misc("spacy.prioritize_existing_ents_filter.v1") -def make_preverse_existing_ents_filter(): +def make_preserve_existing_ents_filter(): return prioritize_existing_ents_filter From ef9e504eacc806162666c964bd00d152fc15f9e3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 29 Dec 2022 14:01:08 +0100 Subject: [PATCH 5/7] Rename modified textcat scorer to v2 (#11971) As a follow-up to #11696, rename the modified scorer to v2 and move the v1 scorer to `spacy-legacy`. 
--- requirements.txt | 2 +- setup.cfg | 2 +- spacy/pipeline/textcat.py | 4 ++-- spacy/tests/pipeline/test_textcat.py | 17 +++++++++++++++++ 4 files changed, 21 insertions(+), 4 deletions(-) diff --git a/requirements.txt b/requirements.txt index 0440835f2..5bc1c8684 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.10,<3.1.0 +spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 diff --git a/setup.cfg b/setup.cfg index d290d706c..cee8c0c33 100644 --- a/setup.cfg +++ b/setup.cfg @@ -42,7 +42,7 @@ setup_requires = thinc>=8.1.0,<8.2.0 install_requires = # Our libraries - spacy-legacy>=3.0.10,<3.1.0 + spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index 65121114d..650a01949 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -74,7 +74,7 @@ subword_features = true default_config={ "threshold": 0.0, "model": DEFAULT_SINGLE_TEXTCAT_MODEL, - "scorer": {"@scorers": "spacy.textcat_scorer.v1"}, + "scorer": {"@scorers": "spacy.textcat_scorer.v2"}, }, default_score_weights={ "cats_score": 1.0, @@ -117,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]: ) -@registry.scorers("spacy.textcat_scorer.v1") +@registry.scorers("spacy.textcat_scorer.v2") def make_textcat_scorer(): return textcat_score diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index 155ce99a2..eafe4c128 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -895,3 +895,20 @@ def test_textcat_multi_threshold(): scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0}) assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 + + +@pytest.mark.parametrize("component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]) +def 
test_textcat_legacy_scorers(component_name, scorer): + """Check that legacy scorers are registered and produce the expected score + keys.""" + nlp = English() + nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}}) + + train_examples = [] + for text, annotations in TRAIN_DATA_SINGLE_LABEL: + train_examples.append(Example.from_dict(nlp.make_doc(text), annotations)) + nlp.initialize(get_examples=lambda: train_examples) + + # score the model (it's not actually trained but that doesn't matter) + scores = nlp.evaluate(train_examples) + assert 0 <= scores["cats_score"] <= 1 From abb0ab109d33d2deaa6155a61fad649a25472f9c Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 2 Jan 2023 11:59:57 +0100 Subject: [PATCH 6/7] Auto-format code with black (#12035) Co-authored-by: explosion-bot --- spacy/tests/pipeline/test_textcat.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py index eafe4c128..048586cec 100644 --- a/spacy/tests/pipeline/test_textcat.py +++ b/spacy/tests/pipeline/test_textcat.py @@ -897,7 +897,9 @@ def test_textcat_multi_threshold(): assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0 -@pytest.mark.parametrize("component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]) +@pytest.mark.parametrize( + "component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")] +) def test_textcat_legacy_scorers(component_name, scorer): """Check that legacy scorers are registered and produce the expected score keys.""" From 31c1beba787446059de58a1478e6aec197fd0bbb Mon Sep 17 00:00:00 2001 From: Wannaphong Phatthiyaphaibun Date: Tue, 3 Jan 2023 15:03:59 +0700 Subject: [PATCH 7/7] Add spacy-pythainlp (#12038) * Add spacy-pythainlp * Move submission to right section * Minor cleanup * Remove extra list call * Update universe.json Co-authored-by: Paul O'Leary McCann --- 
website/meta/universe.json | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/meta/universe.json b/website/meta/universe.json index db533c3b2..99d121507 100644 --- a/website/meta/universe.json +++ b/website/meta/universe.json @@ -4062,6 +4062,33 @@ "author_links": { "github": "yasufumy" } + }, + { + "id": "spacy-pythainlp", + "title": "spaCy-PyThaiNLP", + "slogan": "PyThaiNLP for spaCy", + "description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.", + "github": "PyThaiNLP/spaCy-PyThaiNLP", + "code_example": [ + "import spacy", + "import spacy_pythainlp.core", + "", + "nlp = spacy.blank('th')", + "nlp.add_pipe('pythainlp')", + "doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')", + "", + "print(list(doc.sents))", + "# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]" + ], + "code_language": "python", + "author": "Wannaphong Phatthiyaphaibun", + "author_links": { + "twitter": "@wannaphong_p", + "github": "wannaphong", + "website": "https://iam.wannaphong.com/" + }, + "category": ["pipeline", "research"], + "tags": ["Thai"] } ],