Merge branch 'copy_master' into copy_v4

2025-11-14 23:06:01 +03:00 · 2023-01-03 13:34:05 +01:00 · 2023-01-03 13:34:05 +01:00 · 6852adc8b7
commit 6852adc8b7
parent 20b63943f5 31c1beba78
10 changed files with 131 additions and 19 deletions
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,5 @@
 # Our libraries
-spacy-legacy>=3.0.10,<3.1.0
+spacy-legacy>=3.0.11,<3.1.0
 spacy-loggers>=1.0.0,<2.0.0
 cymem>=2.0.2,<2.1.0
 preshed>=3.0.2,<3.1.0
--- a/setup.cfg
+++ b/setup.cfg
@ -22,6 +22,7 @@ classifiers =
    Programming Language :: Python :: 3.8
    Programming Language :: Python :: 3.9
    Programming Language :: Python :: 3.10
    Programming Language :: Python :: 3.11
    Topic :: Scientific/Engineering
 project_urls =
    Release notes = https://github.com/explosion/spaCy/releases
@ -33,7 +34,7 @@ include_package_data = true
 python_requires = >=3.6
 install_requires =
    # Our libraries
-    spacy-legacy>=3.0.10,<3.1.0
+    spacy-legacy>=3.0.11,<3.1.0
    spacy-loggers>=1.0.0,<2.0.0
    murmurhash>=0.28.0,<1.1.0
    cymem>=2.0.2,<2.1.0
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@ -53,9 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
    """
    for entry in srsly.read_jsonl(path):
        if field not in entry:
-            msg.fail(
-                f"{path} does not contain the required '{field}' field.", exits=1
-            )
+            msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
        else:
            yield entry[field]
@ -118,8 +116,10 @@ def apply(
    paths = walk_directory(data_path)
    if len(paths) == 0:
        docbin.to_disk(output_file)
-        msg.warn("Did not find data to process,"
-                 f" {data_path} seems to be an empty directory.")
+        msg.warn(
+            "Did not find data to process,"
+            f" {data_path} seems to be an empty directory."
+        )
        return
    nlp = load_model(model)
    msg.good(f"Loaded model {model}")
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -944,6 +944,7 @@ class Errors(metaclass=ErrorsWithCodes):
    E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
             "knowledge base, use `InMemoryLookupKB`.")
    E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
    E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
    # v4 error strings
    E4000 = ("Expected a Doc as input, but got: '{type}'")
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@ -179,7 +179,7 @@ def prioritize_existing_ents_filter(
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
-def make_preverse_existing_ents_filter():
+def make_preserve_existing_ents_filter():
    return prioritize_existing_ents_filter
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@ -77,7 +77,7 @@ subword_features = true
    default_config={
        "threshold": 0.0,
        "model": DEFAULT_SINGLE_TEXTCAT_MODEL,
-        "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+        "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
        "save_activations": False,
    },
    default_score_weights={
@ -130,7 +130,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
    )
-@registry.scorers("spacy.textcat_scorer.v1")
+@registry.scorers("spacy.textcat_scorer.v2")
 def make_textcat_scorer():
    return textcat_score
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@ -934,3 +934,22 @@ def test_save_activations_multi():
    doc = nlp("This is a test.")
    assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
    assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
@pytest.mark.parametrize(
    "component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]
 )
 def test_textcat_legacy_scorers(component_name, scorer):
    """Check that legacy scorers are registered and produce the expected score
    keys."""
    nlp = English()
    nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
    train_examples = []
    for text, annotations in TRAIN_DATA_SINGLE_LABEL:
        train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
    nlp.initialize(get_examples=lambda: train_examples)
    # score the model (it's not actually trained but that doesn't matter)
    scores = nlp.evaluate(train_examples)
    assert 0 <= scores["cats_score"] <= 1
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@ -26,6 +26,8 @@ def setup_table(
    return final_cols, final_widths, ["r" for _ in final_widths]
 # We cannot rename this method as it's directly imported
 # and used by external packages such as spacy-loggers.
@registry.loggers("spacy.ConsoleLogger.v2")
 def console_logger(
    progress_bar: bool = False,
@ -33,7 +35,27 @@ def console_logger(
    output_file: Optional[Union[str, Path]] = None,
 ):
    """The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
-    progress_bar (bool): Whether the logger should print the progress bar.
+    progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
    console_output (bool): Whether the logger should print the logs on the console.
    output_file (Optional[Union[str, Path]]): The file to save the training logs to.
    """
    return console_logger_v3(
        progress_bar=None if progress_bar is False else "eval",
        console_output=console_output,
        output_file=output_file,
    )
@registry.loggers("spacy.ConsoleLogger.v3")
 def console_logger_v3(
    progress_bar: Optional[str] = None,
    console_output: bool = True,
    output_file: Optional[Union[str, Path]] = None,
 ):
    """The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
    progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
        train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
        eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
    console_output (bool): Whether the logger should print the logs on the console.
    output_file (Optional[Union[str, Path]]): The file to save the training logs to.
    """
@ -70,6 +92,7 @@ def console_logger(
            for name, proc in nlp.pipeline
            if hasattr(proc, "is_trainable") and proc.is_trainable
        ]
        max_steps = nlp.config["training"]["max_steps"]
        eval_frequency = nlp.config["training"]["eval_frequency"]
        score_weights = nlp.config["training"]["score_weights"]
        score_cols = [col for col, value in score_weights.items() if value is not None]
@ -84,6 +107,13 @@ def console_logger(
            write(msg.row(table_header, widths=table_widths, spacing=spacing))
            write(msg.row(["-" * width for width in table_widths], spacing=spacing))
        progress = None
        expected_progress_types = ("train", "eval")
        if progress_bar is not None and progress_bar not in expected_progress_types:
            raise ValueError(
                Errors.E1048.format(
                    unexpected=progress_bar, expected=expected_progress_types
                )
            )
        def log_step(info: Optional[Dict[str, Any]]) -> None:
            nonlocal progress
@ -141,11 +171,23 @@ def console_logger(
                    )
                )
                if progress_bar:
                    if progress_bar == "train":
                        total = max_steps
                        desc = f"Last Eval Epoch: {info['epoch']}"
                        initial = info["step"]
                    else:
                        total = eval_frequency
                        desc = f"Epoch {info['epoch']+1}"
                        initial = 0
                    # Set disable=None, so that it disables on non-TTY
                    progress = tqdm.tqdm(
-                        total=eval_frequency, disable=None, leave=False, file=stderr
+                        total=total,
+                        disable=None,
+                        leave=False,
+                        file=stderr,
+                        initial=initial,
                    )
-                    progress.set_description(f"Epoch {info['epoch']+1}")
+                    progress.set_description(desc)
        def finalize() -> None:
            if output_stream:
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -513,7 +513,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
 Instead of using one of the built-in loggers, you can
 [implement your own](/usage/training#custom-logging).
-#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
+#### spacy.ConsoleLogger.v2 {tag="registered function"}
 > #### Example config
 >
@ -565,10 +565,32 @@ start decreasing across epochs.
 </Accordion>
 | Name             | Description                                                                                                                  |
-| ---------------- | --------------------------------------------------------------------- |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
-| `progress_bar`   | Whether the logger should print the progress bar ~~bool~~             |
+| `progress_bar`   | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ |
-| `console_output` | Whether the logger should print the logs on the console. ~~bool~~     |
+| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~                                          |
-| `output_file`    | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
+| `output_file`    | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~                                      |
 #### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"}
 > #### Example config
 >
 > ```ini
 > [training.logger]
 > @loggers = "spacy.ConsoleLogger.v3"
 > progress_bar = "train"
 > console_output = true
 > output_file = "training_log.jsonl"
 > ```
 Writes the results of a training step to the console in a tabular format and
 optionally saves them to a `jsonl` file.
 | Name             | Description                                                                                                                                               |
 | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
 | `progress_bar`   | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`.                                                                               |
 |                  | The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). ~~Optional[str]~~ |
 | `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~                                                                       |
 | `output_file`    | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~                                                                   |
 ## Readers {#readers}
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@ -4066,6 +4066,33 @@
            "author_links": {
                "github": "yasufumy"
            }
        },
        {
            "id": "spacy-pythainlp",
            "title": "spaCy-PyThaiNLP",
            "slogan": "PyThaiNLP for spaCy",
            "description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.",
            "github": "PyThaiNLP/spaCy-PyThaiNLP",
            "code_example": [
                "import spacy",
                "import spacy_pythainlp.core",
                "",
                "nlp = spacy.blank('th')",
                "nlp.add_pipe('pythainlp')",
                "doc = nlp('ผมเป็นคนไทย   แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน  ผมอยากไปเที่ยว')",
                "",
                "print(list(doc.sents))",
                "# output: [ผมเป็นคนไทย   แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน  , ผมอยากไปเที่ยว]"
            ],
            "code_language": "python",
            "author": "Wannaphong Phatthiyaphaibun",
            "author_links": {
                "twitter": "@wannaphong_p",
                "github": "wannaphong",
                "website": "https://iam.wannaphong.com/"
            },
            "category": ["pipeline", "research"],
            "tags": ["Thai"]
        }
    ],