mirror of
https://github.com/explosion/spaCy.git
synced 2024-12-28 02:46:35 +03:00
Merge branch 'copy_master' into copy_v4
This commit is contained in:
commit
6852adc8b7
|
@ -1,5 +1,5 @@
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.10,<3.1.0
|
spacy-legacy>=3.0.11,<3.1.0
|
||||||
spacy-loggers>=1.0.0,<2.0.0
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
preshed>=3.0.2,<3.1.0
|
preshed>=3.0.2,<3.1.0
|
||||||
|
|
|
@ -22,6 +22,7 @@ classifiers =
|
||||||
Programming Language :: Python :: 3.8
|
Programming Language :: Python :: 3.8
|
||||||
Programming Language :: Python :: 3.9
|
Programming Language :: Python :: 3.9
|
||||||
Programming Language :: Python :: 3.10
|
Programming Language :: Python :: 3.10
|
||||||
|
Programming Language :: Python :: 3.11
|
||||||
Topic :: Scientific/Engineering
|
Topic :: Scientific/Engineering
|
||||||
project_urls =
|
project_urls =
|
||||||
Release notes = https://github.com/explosion/spaCy/releases
|
Release notes = https://github.com/explosion/spaCy/releases
|
||||||
|
@ -33,7 +34,7 @@ include_package_data = true
|
||||||
python_requires = >=3.6
|
python_requires = >=3.6
|
||||||
install_requires =
|
install_requires =
|
||||||
# Our libraries
|
# Our libraries
|
||||||
spacy-legacy>=3.0.10,<3.1.0
|
spacy-legacy>=3.0.11,<3.1.0
|
||||||
spacy-loggers>=1.0.0,<2.0.0
|
spacy-loggers>=1.0.0,<2.0.0
|
||||||
murmurhash>=0.28.0,<1.1.0
|
murmurhash>=0.28.0,<1.1.0
|
||||||
cymem>=2.0.2,<2.1.0
|
cymem>=2.0.2,<2.1.0
|
||||||
|
|
|
@ -53,9 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
|
||||||
"""
|
"""
|
||||||
for entry in srsly.read_jsonl(path):
|
for entry in srsly.read_jsonl(path):
|
||||||
if field not in entry:
|
if field not in entry:
|
||||||
msg.fail(
|
msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
|
||||||
f"{path} does not contain the required '{field}' field.", exits=1
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
yield entry[field]
|
yield entry[field]
|
||||||
|
|
||||||
|
@ -118,8 +116,10 @@ def apply(
|
||||||
paths = walk_directory(data_path)
|
paths = walk_directory(data_path)
|
||||||
if len(paths) == 0:
|
if len(paths) == 0:
|
||||||
docbin.to_disk(output_file)
|
docbin.to_disk(output_file)
|
||||||
msg.warn("Did not find data to process,"
|
msg.warn(
|
||||||
f" {data_path} seems to be an empty directory.")
|
"Did not find data to process,"
|
||||||
|
f" {data_path} seems to be an empty directory."
|
||||||
|
)
|
||||||
return
|
return
|
||||||
nlp = load_model(model)
|
nlp = load_model(model)
|
||||||
msg.good(f"Loaded model {model}")
|
msg.good(f"Loaded model {model}")
|
||||||
|
|
|
@ -944,6 +944,7 @@ class Errors(metaclass=ErrorsWithCodes):
|
||||||
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
|
||||||
"knowledge base, use `InMemoryLookupKB`.")
|
"knowledge base, use `InMemoryLookupKB`.")
|
||||||
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
|
||||||
|
E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
|
||||||
|
|
||||||
# v4 error strings
|
# v4 error strings
|
||||||
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
E4000 = ("Expected a Doc as input, but got: '{type}'")
|
||||||
|
|
|
@ -179,7 +179,7 @@ def prioritize_existing_ents_filter(
|
||||||
|
|
||||||
|
|
||||||
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
|
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
|
||||||
def make_preverse_existing_ents_filter():
|
def make_preserve_existing_ents_filter():
|
||||||
return prioritize_existing_ents_filter
|
return prioritize_existing_ents_filter
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -77,7 +77,7 @@ subword_features = true
|
||||||
default_config={
|
default_config={
|
||||||
"threshold": 0.0,
|
"threshold": 0.0,
|
||||||
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
|
||||||
"scorer": {"@scorers": "spacy.textcat_scorer.v1"},
|
"scorer": {"@scorers": "spacy.textcat_scorer.v2"},
|
||||||
"save_activations": False,
|
"save_activations": False,
|
||||||
},
|
},
|
||||||
default_score_weights={
|
default_score_weights={
|
||||||
|
@ -130,7 +130,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@registry.scorers("spacy.textcat_scorer.v1")
|
@registry.scorers("spacy.textcat_scorer.v2")
|
||||||
def make_textcat_scorer():
|
def make_textcat_scorer():
|
||||||
return textcat_score
|
return textcat_score
|
||||||
|
|
||||||
|
|
|
@ -934,3 +934,22 @@ def test_save_activations_multi():
|
||||||
doc = nlp("This is a test.")
|
doc = nlp("This is a test.")
|
||||||
assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
|
assert list(doc.activations["textcat_multilabel"].keys()) == ["probabilities"]
|
||||||
assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
|
assert doc.activations["textcat_multilabel"]["probabilities"].shape == (nO,)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]
|
||||||
|
)
|
||||||
|
def test_textcat_legacy_scorers(component_name, scorer):
|
||||||
|
"""Check that legacy scorers are registered and produce the expected score
|
||||||
|
keys."""
|
||||||
|
nlp = English()
|
||||||
|
nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
|
||||||
|
|
||||||
|
train_examples = []
|
||||||
|
for text, annotations in TRAIN_DATA_SINGLE_LABEL:
|
||||||
|
train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
|
||||||
|
nlp.initialize(get_examples=lambda: train_examples)
|
||||||
|
|
||||||
|
# score the model (it's not actually trained but that doesn't matter)
|
||||||
|
scores = nlp.evaluate(train_examples)
|
||||||
|
assert 0 <= scores["cats_score"] <= 1
|
||||||
|
|
|
@ -26,6 +26,8 @@ def setup_table(
|
||||||
return final_cols, final_widths, ["r" for _ in final_widths]
|
return final_cols, final_widths, ["r" for _ in final_widths]
|
||||||
|
|
||||||
|
|
||||||
|
# We cannot rename this function as it's directly imported
|
||||||
|
# and used by external packages such as spacy-loggers.
|
||||||
@registry.loggers("spacy.ConsoleLogger.v2")
|
@registry.loggers("spacy.ConsoleLogger.v2")
|
||||||
def console_logger(
|
def console_logger(
|
||||||
progress_bar: bool = False,
|
progress_bar: bool = False,
|
||||||
|
@ -33,7 +35,27 @@ def console_logger(
|
||||||
output_file: Optional[Union[str, Path]] = None,
|
output_file: Optional[Union[str, Path]] = None,
|
||||||
):
|
):
|
||||||
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
|
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
|
||||||
progress_bar (bool): Whether the logger should print the progress bar.
|
progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
|
||||||
|
console_output (bool): Whether the logger should print the logs on the console.
|
||||||
|
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||||
|
"""
|
||||||
|
return console_logger_v3(
|
||||||
|
progress_bar=None if progress_bar is False else "eval",
|
||||||
|
console_output=console_output,
|
||||||
|
output_file=output_file,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@registry.loggers("spacy.ConsoleLogger.v3")
|
||||||
|
def console_logger_v3(
|
||||||
|
progress_bar: Optional[str] = None,
|
||||||
|
console_output: bool = True,
|
||||||
|
output_file: Optional[Union[str, Path]] = None,
|
||||||
|
):
|
||||||
|
"""The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
|
||||||
|
progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
|
||||||
|
train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
|
||||||
|
eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
|
||||||
console_output (bool): Whether the logger should print the logs on the console.
|
console_output (bool): Whether the logger should print the logs on the console.
|
||||||
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
|
||||||
"""
|
"""
|
||||||
|
@ -70,6 +92,7 @@ def console_logger(
|
||||||
for name, proc in nlp.pipeline
|
for name, proc in nlp.pipeline
|
||||||
if hasattr(proc, "is_trainable") and proc.is_trainable
|
if hasattr(proc, "is_trainable") and proc.is_trainable
|
||||||
]
|
]
|
||||||
|
max_steps = nlp.config["training"]["max_steps"]
|
||||||
eval_frequency = nlp.config["training"]["eval_frequency"]
|
eval_frequency = nlp.config["training"]["eval_frequency"]
|
||||||
score_weights = nlp.config["training"]["score_weights"]
|
score_weights = nlp.config["training"]["score_weights"]
|
||||||
score_cols = [col for col, value in score_weights.items() if value is not None]
|
score_cols = [col for col, value in score_weights.items() if value is not None]
|
||||||
|
@ -84,6 +107,13 @@ def console_logger(
|
||||||
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
write(msg.row(table_header, widths=table_widths, spacing=spacing))
|
||||||
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
|
||||||
progress = None
|
progress = None
|
||||||
|
expected_progress_types = ("train", "eval")
|
||||||
|
if progress_bar is not None and progress_bar not in expected_progress_types:
|
||||||
|
raise ValueError(
|
||||||
|
Errors.E1048.format(
|
||||||
|
unexpected=progress_bar, expected=expected_progress_types
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
def log_step(info: Optional[Dict[str, Any]]) -> None:
|
def log_step(info: Optional[Dict[str, Any]]) -> None:
|
||||||
nonlocal progress
|
nonlocal progress
|
||||||
|
@ -141,11 +171,23 @@ def console_logger(
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if progress_bar:
|
if progress_bar:
|
||||||
|
if progress_bar == "train":
|
||||||
|
total = max_steps
|
||||||
|
desc = f"Last Eval Epoch: {info['epoch']}"
|
||||||
|
initial = info["step"]
|
||||||
|
else:
|
||||||
|
total = eval_frequency
|
||||||
|
desc = f"Epoch {info['epoch']+1}"
|
||||||
|
initial = 0
|
||||||
# Set disable=None, so that it disables on non-TTY
|
# Set disable=None, so that it disables on non-TTY
|
||||||
progress = tqdm.tqdm(
|
progress = tqdm.tqdm(
|
||||||
total=eval_frequency, disable=None, leave=False, file=stderr
|
total=total,
|
||||||
|
disable=None,
|
||||||
|
leave=False,
|
||||||
|
file=stderr,
|
||||||
|
initial=initial,
|
||||||
)
|
)
|
||||||
progress.set_description(f"Epoch {info['epoch']+1}")
|
progress.set_description(desc)
|
||||||
|
|
||||||
def finalize() -> None:
|
def finalize() -> None:
|
||||||
if output_stream:
|
if output_stream:
|
||||||
|
|
|
@ -513,7 +513,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
|
||||||
Instead of using one of the built-in loggers, you can
|
Instead of using one of the built-in loggers, you can
|
||||||
[implement your own](/usage/training#custom-logging).
|
[implement your own](/usage/training#custom-logging).
|
||||||
|
|
||||||
#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
|
#### spacy.ConsoleLogger.v2 {tag="registered function"}
|
||||||
|
|
||||||
> #### Example config
|
> #### Example config
|
||||||
>
|
>
|
||||||
|
@ -565,10 +565,32 @@ start decreasing across epochs.
|
||||||
</Accordion>
|
</Accordion>
|
||||||
|
|
||||||
| Name | Description |
|
| Name | Description |
|
||||||
| ---------------- | --------------------------------------------------------------------- |
|
| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
|
||||||
| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
|
| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ |
|
||||||
| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
|
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
|
||||||
| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
|
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
|
||||||
|
|
||||||
|
#### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"}
|
||||||
|
|
||||||
|
> #### Example config
|
||||||
|
>
|
||||||
|
> ```ini
|
||||||
|
> [training.logger]
|
||||||
|
> @loggers = "spacy.ConsoleLogger.v3"
|
||||||
|
> progress_bar = "eval"
|
||||||
|
> console_output = true
|
||||||
|
> output_file = "training_log.jsonl"
|
||||||
|
> ```
|
||||||
|
|
||||||
|
Writes the results of a training step to the console in a tabular format and
|
||||||
|
optionally saves them to a `jsonl` file.
|
||||||
|
|
||||||
|
| Name | Description |
|
||||||
|
| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||||
|
| `progress_bar` | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. |
|
||||||
|
| | The bar tracks the number of steps until `training.max_steps` and `training.eval_frequency` are reached respectively (default: `None`). ~~Optional[str]~~ |
|
||||||
|
| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
|
||||||
|
| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
|
||||||
|
|
||||||
## Readers {#readers}
|
## Readers {#readers}
|
||||||
|
|
||||||
|
|
|
@ -4066,6 +4066,33 @@
|
||||||
"author_links": {
|
"author_links": {
|
||||||
"github": "yasufumy"
|
"github": "yasufumy"
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "spacy-pythainlp",
|
||||||
|
"title": "spaCy-PyThaiNLP",
|
||||||
|
"slogan": "PyThaiNLP for spaCy",
|
||||||
|
"description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.",
|
||||||
|
"github": "PyThaiNLP/spaCy-PyThaiNLP",
|
||||||
|
"code_example": [
|
||||||
|
"import spacy",
|
||||||
|
"import spacy_pythainlp.core",
|
||||||
|
"",
|
||||||
|
"nlp = spacy.blank('th')",
|
||||||
|
"nlp.add_pipe('pythainlp')",
|
||||||
|
"doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')",
|
||||||
|
"",
|
||||||
|
"print(list(doc.sents))",
|
||||||
|
"# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]"
|
||||||
|
],
|
||||||
|
"code_language": "python",
|
||||||
|
"author": "Wannaphong Phatthiyaphaibun",
|
||||||
|
"author_links": {
|
||||||
|
"twitter": "@wannaphong_p",
|
||||||
|
"github": "wannaphong",
|
||||||
|
"website": "https://iam.wannaphong.com/"
|
||||||
|
},
|
||||||
|
"category": ["pipeline", "research"],
|
||||||
|
"tags": ["Thai"]
|
||||||
}
|
}
|
||||||
|
|
||||||
],
|
],
|
||||||
|
|
Loading…
Reference in New Issue
Block a user