From afd7a2476d2491af864d0723bff96191ea61b429 Mon Sep 17 00:00:00 2001
From: Damian Romero <12145757+damian-romero@users.noreply.github.com>
Date: Thu, 1 Dec 2022 07:06:28 -0500
Subject: [PATCH 01/36] Fix typo in vocab.md table (#11908)
* Fix typo in vocab.md table
Fixes explosion/spaCy#11907
* Reformat vocab.md with Prettier
---
website/docs/api/vocab.md | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index afbd1301d..5e4de219a 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -308,14 +308,14 @@ Load state from a binary string.
> assert type(PERSON) == int
> ```
-| Name | Description |
-| ---------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
-| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
-| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
-| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
-| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
-| `get_noun_chunks` 3.0 | A function that yields base noun phrases used for [`Doc.noun_chunks`](/ap/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
+| Name | Description |
+| ---------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `strings` | A table managing the string-to-int mapping. ~~StringStore~~ |
+| `vectors` | A table associating word IDs to word vectors. ~~Vectors~~ |
+| `vectors_length` | Number of dimensions for each word vector. ~~int~~ |
+| `lookups` | The available lookup tables in this vocab. ~~Lookups~~ |
+| `writing_system` | A dict with information about the language's writing system. ~~Dict[str, Any]~~ |
+| `get_noun_chunks` 3.0 | A function that yields base noun phrases used for [`Doc.noun_chunks`](/api/doc#noun_chunks). ~~Optional[Callable[[Union[Doc, Span], Iterator[Tuple[int, int, int]]]]]~~ |
## Serialization fields {#serialization-fields}
From 9cf3fa9711dfff94e88d6e137a52ebabdcceaad8 Mon Sep 17 00:00:00 2001
From: Zhangrp
Date: Thu, 1 Dec 2022 20:30:27 +0800
Subject: [PATCH 02/36] Add docs for biluo_to_iob and iob_to_biluo. (#11901)
* Add docs for biluo_to_iob and iob_to_biluo.
* Fix typos.
* Remove redundant links.
---
website/docs/api/top-level.md | 48 +++++++++++++++++++++++++++++++++++
1 file changed, 48 insertions(+)
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 211affa4a..26a5d42f4 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -1004,6 +1004,54 @@ This method was previously available as `spacy.gold.spans_from_biluo_tags`.
| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags with each tag describing one token. Each tag string will be of the form of either `""`, `"O"` or `"{action}-{label}"`, where action is one of `"B"`, `"I"`, `"L"`, `"U"`. ~~List[str]~~ |
| **RETURNS** | A sequence of `Span` objects with added entity labels. ~~List[Span]~~ |
+### training.biluo_to_iob {#biluo_to_iob tag="function"}
+
+Convert a sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags to
+[IOB](/usage/linguistic-features#accessing-ner) tags. This is useful if you want
+to use the BILUO tags with a model that only supports IOB tags.
+
+> #### Example
+>
+> ```python
+> from spacy.training import biluo_to_iob
+>
+> tags = ["O", "O", "B-LOC", "I-LOC", "L-LOC", "O"]
+> iob_tags = biluo_to_iob(tags)
+> assert iob_tags == ["O", "O", "B-LOC", "I-LOC", "I-LOC", "O"]
+> ```
+
+| Name | Description |
+| ----------- | --------------------------------------------------------------------------------------- |
+| `tags` | A sequence of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
+| **RETURNS** | A list of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
+
+### training.iob_to_biluo {#iob_to_biluo tag="function"}
+
+Convert a sequence of [IOB](/usage/linguistic-features#accessing-ner) tags to
+[BILUO](/usage/linguistic-features#accessing-ner) tags. This is useful if you
+want to use the IOB tags with a model that only supports BILUO tags.
+
+
+
+This method was previously available as `spacy.gold.iob_to_biluo`.
+
+
+
+> #### Example
+>
+> ```python
+> from spacy.training import iob_to_biluo
+>
+> tags = ["O", "O", "B-LOC", "I-LOC", "O"]
+> biluo_tags = iob_to_biluo(tags)
+> assert biluo_tags == ["O", "O", "B-LOC", "L-LOC", "O"]
+> ```
+
+| Name | Description |
+| ----------- | ------------------------------------------------------------------------------------- |
+| `tags` | A sequence of [IOB](/usage/linguistic-features#accessing-ner) tags. ~~Iterable[str]~~ |
+| **RETURNS** | A list of [BILUO](/usage/linguistic-features#accessing-ner) tags. ~~List[str]~~ |
+
## Utility functions {#util source="spacy/util.py"}
spaCy comes with a small collection of utility functions located in
From 445c670a2d537598b3d562fb7f444050164a260b Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Fri, 2 Dec 2022 09:33:52 +0100
Subject: [PATCH 03/36] Fix spancat for zero suggestions (#11860)
* Add test for spancat predict with zero suggestions
* Fix spancat for zero suggestions
* Undo changes to extract_spans
* Use .sum() as in update
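For context, a minimal sketch of the calls this fix unblocks, assuming `nlp` is a pipeline whose spancat component has already been initialized and trained (the updated test below builds such a pipeline with a mixed zero/one suggester):

```python
from spacy.language import Language

def exercise_zero_suggestions(nlp: Language) -> None:
    # Before this fix, SpanCategorizer.predict() could crash whenever the
    # suggester produced no candidate spans; now these calls simply yield
    # docs without predicted spans.
    nlp("")                                  # empty doc
    nlp("one")                               # doc with zero suggestions
    list(nlp.pipe(["", "one", "two two"]))   # batch mixing zero and some
```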
---
spacy/pipeline/spancat.py | 5 +++-
spacy/tests/pipeline/test_spancat.py | 43 ++++++++++++++++++++++------
2 files changed, 38 insertions(+), 10 deletions(-)
diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py
index 0a84c72fd..a3388e81a 100644
--- a/spacy/pipeline/spancat.py
+++ b/spacy/pipeline/spancat.py
@@ -272,7 +272,10 @@ class SpanCategorizer(TrainablePipe):
DOCS: https://spacy.io/api/spancategorizer#predict
"""
indices = self.suggester(docs, ops=self.model.ops)
- scores = self.model.predict((docs, indices)) # type: ignore
+ if indices.lengths.sum() == 0:
+ scores = self.model.ops.alloc2f(0, 0)
+ else:
+ scores = self.model.predict((docs, indices)) # type: ignore
return indices, scores
def set_candidates(
diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py
index 15256a763..e9db983d3 100644
--- a/spacy/tests/pipeline/test_spancat.py
+++ b/spacy/tests/pipeline/test_spancat.py
@@ -372,24 +372,39 @@ def test_overfitting_IO_overlapping():
def test_zero_suggestions():
- # Test with a suggester that returns 0 suggestions
+ # Test with a suggester that can return 0 suggestions
- @registry.misc("test_zero_suggester")
- def make_zero_suggester():
- def zero_suggester(docs, *, ops=None):
+ @registry.misc("test_mixed_zero_suggester")
+ def make_mixed_zero_suggester():
+ def mixed_zero_suggester(docs, *, ops=None):
if ops is None:
ops = get_current_ops()
- return Ragged(
- ops.xp.zeros((0, 0), dtype="i"), ops.xp.zeros((len(docs),), dtype="i")
- )
+ spans = []
+ lengths = []
+ for doc in docs:
+ if len(doc) > 0 and len(doc) % 2 == 0:
+ spans.append((0, 1))
+ lengths.append(1)
+ else:
+ lengths.append(0)
+ spans = ops.asarray2i(spans)
+ lengths_array = ops.asarray1i(lengths)
+ if len(spans) > 0:
+ output = Ragged(ops.xp.vstack(spans), lengths_array)
+ else:
+ output = Ragged(ops.xp.zeros((0, 0), dtype="i"), lengths_array)
+ return output
- return zero_suggester
+ return mixed_zero_suggester
fix_random_seed(0)
nlp = English()
spancat = nlp.add_pipe(
"spancat",
- config={"suggester": {"@misc": "test_zero_suggester"}, "spans_key": SPAN_KEY},
+ config={
+ "suggester": {"@misc": "test_mixed_zero_suggester"},
+ "spans_key": SPAN_KEY,
+ },
)
train_examples = make_examples(nlp)
optimizer = nlp.initialize(get_examples=lambda: train_examples)
@@ -397,6 +412,16 @@ def test_zero_suggestions():
assert set(spancat.labels) == {"LOC", "PERSON"}
nlp.update(train_examples, sgd=optimizer)
+ # empty doc
+ nlp("")
+ # single doc with zero suggestions
+ nlp("one")
+ # single doc with one suggestion
+ nlp("two two")
+ # batch with mixed zero/one suggestions
+ list(nlp.pipe(["one", "two two", "three three three", "", "four four four four"]))
+ # batch with no suggestions
+ list(nlp.pipe(["", "one", "three three three"]))
def test_set_candidates():
From f9d17a644b3d037924f715c03672ada6d12e4d86 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Fri, 2 Dec 2022 18:17:11 +0900
Subject: [PATCH 04/36] Config generation fails for GPU without transformers
(#11899)
If you don't have spacy-transformers installed, but try to use `init
config` with the GPU flag, you'll get an error. The issue is that the
`use_transformers` flag in the config is conflated with the GPU flag,
and then there's an attempt to access transformers config info that may
not exist.
There may be a better way to do this, but this stops the error.
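The decision the template now makes, restated as a Python sketch (the function name is illustrative; `hardware`, `transformer_data` and `optimize` mirror the template's own variables):

```python
def choose_transformer(hardware: str, transformer_data: dict, optimize: str) -> dict:
    # Only use a transformer when we're not on CPU AND transformer metadata
    # is available (i.e. spacy-transformers is installed). Previously the
    # second condition was missing, so transformer_data[optimize] could fail.
    use_transformer = hardware != "cpu" and bool(transformer_data)
    return transformer_data[optimize] if use_transformer else {}

# Without spacy-transformers installed there is no transformer metadata:
assert choose_transformer("gpu", {}, "efficiency") == {}
```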
---
spacy/cli/templates/quickstart_training.jinja | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index 58864883a..b961ac892 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -1,7 +1,7 @@
{# This is a template for training configs used for the quickstart widget in
the docs and the init config command. It encodes various best practices and
can help generate the best possible configuration, given a user's requirements. #}
-{%- set use_transformer = hardware != "cpu" -%}
+{%- set use_transformer = hardware != "cpu" and transformer_data -%}
{%- set transformer = transformer_data[optimize] if use_transformer else {} -%}
{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%}
[paths]
From df0cb4b77be6e20a62143f5f65c3e165a4a45bcc Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 2 Dec 2022 14:49:12 +0100
Subject: [PATCH 05/36] Auto-format code with black (#11913)
Co-authored-by: explosion-bot
---
spacy/util.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/spacy/util.py b/spacy/util.py
index cba403361..8d211a9a5 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1643,7 +1643,9 @@ def _pipe(
docs: Iterable["Doc"],
proc: "PipeCallable",
name: str,
- default_error_handler: Callable[[str, "PipeCallable", List["Doc"], Exception], NoReturn],
+ default_error_handler: Callable[
+ [str, "PipeCallable", List["Doc"], Exception], NoReturn
+ ],
kwargs: Mapping[str, Any],
) -> Iterator["Doc"]:
if hasattr(proc, "pipe"):
From 4b2097a2713b548cba1c841fa5cb8f6f42e3e30f Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 5 Dec 2022 08:29:13 +0100
Subject: [PATCH 06/36] fix links (#11927)
---
website/docs/usage/v3-4.md | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/website/docs/usage/v3-4.md b/website/docs/usage/v3-4.md
index 597fc3cc8..e10110b71 100644
--- a/website/docs/usage/v3-4.md
+++ b/website/docs/usage/v3-4.md
@@ -66,8 +66,8 @@ The English CNN pipelines have new word vectors:
| Package | Model Version | TAG | Parser LAS | NER F |
| ----------------------------------------------- | ------------- | ---: | ---------: | ----: |
| [`en_core_web_md`](/models/en#en_core_web_md) | v3.3.0 | 97.3 | 90.1 | 84.6 |
-| [`en_core_web_md`](/models/en#en_core_web_lg) | v3.4.0 | 97.2 | 90.3 | 85.5 |
-| [`en_core_web_lg`](/models/en#en_core_web_md) | v3.3.0 | 97.4 | 90.1 | 85.3 |
+| [`en_core_web_md`](/models/en#en_core_web_md) | v3.4.0 | 97.2 | 90.3 | 85.5 |
+| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.3.0 | 97.4 | 90.1 | 85.3 |
| [`en_core_web_lg`](/models/en#en_core_web_lg) | v3.4.0 | 97.3 | 90.2 | 85.6 |
## Notes about upgrading from v3.3 {#upgrading}
From 5848656b5e3287d77674ce678e321eadea52f68e Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Mon, 5 Dec 2022 17:43:23 +0900
Subject: [PATCH 07/36] Switch ubuntu-latest to ubuntu-20.04 in main tests
(#11928)
* Switch ubuntu-latest to ubuntu-20.04 in main tests
* Only use 20.04 for 3.6
---
azure-pipelines.yml | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 9c3b92f06..0f7ea91f9 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -41,7 +41,7 @@ jobs:
matrix:
# We're only running one platform per Python version to speed up builds
Python36Linux:
- imageName: "ubuntu-latest"
+ imageName: "ubuntu-20.04"
python.version: "3.6"
# Python36Windows:
# imageName: "windows-latest"
@@ -50,7 +50,7 @@ jobs:
# imageName: "macos-latest"
# python.version: "3.6"
# Python37Linux:
- # imageName: "ubuntu-latest"
+ # imageName: "ubuntu-20.04"
# python.version: "3.7"
Python37Windows:
imageName: "windows-latest"
From 6f342bdd72f300cdc431d0e0f2a168c62fd2a861 Mon Sep 17 00:00:00 2001
From: Darigov Research <30328618+darigovresearch@users.noreply.github.com>
Date: Mon, 5 Dec 2022 08:49:04 +0000
Subject: [PATCH 08/36] docs: Adds link to license in readme (#11924)
Would resolve https://github.com/explosion/spaCy/issues/11923 if merged
---
README.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/README.md b/README.md
index abfc3da67..7595460fb 100644
--- a/README.md
+++ b/README.md
@@ -14,7 +14,7 @@ parsing, **named entity recognition**, **text classification** and more,
multi-task learning with pretrained **transformers** like BERT, as well as a
production-ready [**training system**](https://spacy.io/usage/training) and easy
model packaging, deployment and workflow management. spaCy is commercial
-open-source software, released under the MIT license.
+open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE).
💫 **Version 3.4 out now!**
[Check out the release notes here.](https://github.com/explosion/spaCy/releases)
From 8afa8b5a7b8ee51eb42b83dabd0f3c1276369e73 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 5 Dec 2022 10:00:00 +0100
Subject: [PATCH 09/36] Refactor kwargs in CLI msg for future wasabi
compatibility (#11918)
Necessary for mypy with wasabi v1+.
---
spacy/cli/project/run.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py
index a109c4a5a..6dd174902 100644
--- a/spacy/cli/project/run.py
+++ b/spacy/cli/project/run.py
@@ -101,8 +101,8 @@ def project_run(
if not (project_dir / dep).exists():
err = f"Missing dependency specified by command '{subcommand}': {dep}"
err_help = "Maybe you forgot to run the 'project assets' command or a previous step?"
- err_kwargs = {"exits": 1} if not dry else {}
- msg.fail(err, err_help, **err_kwargs)
+ err_exits = 1 if not dry else None
+ msg.fail(err, err_help, exits=err_exits)
check_spacy_commit = check_bool_env_var(ENV_VARS.PROJECT_USE_GIT_VERSION)
with working_dir(project_dir) as current_dir:
msg.divider(subcommand)
From 1aadcfcb37ba166558688782fabbcbe3e32ea020 Mon Sep 17 00:00:00 2001
From: Ryn Daniels <397565+ryndaniels@users.noreply.github.com>
Date: Mon, 5 Dec 2022 11:17:10 +0200
Subject: [PATCH 10/36] update lock-threads to v4 (#11930)
---
.github/workflows/lock.yml | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/.github/workflows/lock.yml b/.github/workflows/lock.yml
index c9833cdba..794adee85 100644
--- a/.github/workflows/lock.yml
+++ b/.github/workflows/lock.yml
@@ -15,11 +15,11 @@ jobs:
action:
runs-on: ubuntu-latest
steps:
- - uses: dessant/lock-threads@v3
+ - uses: dessant/lock-threads@v4
with:
process-only: 'issues'
issue-inactive-days: '30'
- issue-comment: >
- This thread has been automatically locked since there
- has not been any recent activity after it was closed.
+ issue-comment: >
+ This thread has been automatically locked since there
+ has not been any recent activity after it was closed.
Please open a new issue for related bugs.
From 23085ffef4bba62aff0de5993ff405cb3ff3528c Mon Sep 17 00:00:00 2001
From: Zhangrp
Date: Tue, 6 Dec 2022 16:42:12 +0800
Subject: [PATCH 11/36] Fix interpolation in directory names, see #11235.
(#11914)
---
spacy/cli/_util.py | 8 ++++----
spacy/tests/test_cli.py | 19 +++++++++++++++++++
2 files changed, 23 insertions(+), 4 deletions(-)
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 7ce006108..9b97a9f19 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -158,15 +158,15 @@ def load_project_config(
sys.exit(1)
validate_project_version(config)
validate_project_commands(config)
+ if interpolate:
+ err = f"{PROJECT_FILE} validation error"
+ with show_validation_error(title=err, hint_fill=False):
+ config = substitute_project_variables(config, overrides)
# Make sure directories defined in config exist
for subdir in config.get("directories", []):
dir_path = path / subdir
if not dir_path.exists():
dir_path.mkdir(parents=True)
- if interpolate:
- err = f"{PROJECT_FILE} validation error"
- with show_validation_error(title=err, hint_fill=False):
- config = substitute_project_variables(config, overrides)
return config
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 2e706458f..3104b49ff 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -123,6 +123,25 @@ def test_issue7055():
assert "model" in filled_cfg["components"]["ner"]
+@pytest.mark.issue(11235)
+def test_issue11235():
+ """
+ Test that the cli handles interpolation in the directory names correctly when loading project config.
+ """
+ lang_var = "en"
+ variables = {"lang": lang_var}
+ commands = [{"name": "x", "script": ["hello ${vars.lang}"]}]
+ directories = ["cfg", "${vars.lang}_model"]
+ project = {"commands": commands, "vars": variables, "directories": directories}
+ with make_tempdir() as d:
+ srsly.write_yaml(d / "project.yml", project)
+ cfg = load_project_config(d)
+ # Check that the directories are interpolated and created correctly
+ assert os.path.exists(d / "cfg")
+ assert os.path.exists(d / f"{lang_var}_model")
+ assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
+
+
def test_cli_info():
nlp = Dutch()
nlp.add_pipe("textcat")
From 27fac7df2e67a0dbfefd68834c14fb1f9505da49 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dani=C3=ABl=20de=20Kok?=
Date: Wed, 7 Dec 2022 05:53:41 +0100
Subject: [PATCH 12/36] EditTreeLemmatizer: correctly add strings when
initializing from labels (#11934)
Strings in replacement nodes where not added to the `StringStore`
when `EditTreeLemmatizer` was initialized from a set of labels. The
corresponding test did not capture this because it added the strings
through the examples that were passed to the initialization.
This change fixes both the bug in the initialization and the 'shadowing'
of the bug in the test.
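The distinction at the heart of the fix, as a minimal sketch: indexing a `StringStore` with a string only computes its hash, while `add` also interns the string so the hash can be looked up again.

```python
from spacy.strings import StringStore

store = StringStore()
hashed = store["replacement"]     # hashes the string but does not store it;
                                  # store[hashed] would raise KeyError here
added = store.add("replacement")  # interns the string as well
assert hashed == added
assert store[added] == "replacement"
```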
---
spacy/pipeline/edit_tree_lemmatizer.py | 4 +-
.../pipeline/test_edit_tree_lemmatizer.py | 37 ++++++++++++++++++-
2 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/spacy/pipeline/edit_tree_lemmatizer.py b/spacy/pipeline/edit_tree_lemmatizer.py
index 12f9b73a3..a56c9975e 100644
--- a/spacy/pipeline/edit_tree_lemmatizer.py
+++ b/spacy/pipeline/edit_tree_lemmatizer.py
@@ -328,9 +328,9 @@ class EditTreeLemmatizer(TrainablePipe):
tree = dict(tree)
if "orig" in tree:
- tree["orig"] = self.vocab.strings[tree["orig"]]
+ tree["orig"] = self.vocab.strings.add(tree["orig"])
if "orig" in tree:
- tree["subst"] = self.vocab.strings[tree["subst"]]
+ tree["subst"] = self.vocab.strings.add(tree["subst"])
trees.append(tree)
diff --git a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
index cf541e301..b12ca5dd4 100644
--- a/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
+++ b/spacy/tests/pipeline/test_edit_tree_lemmatizer.py
@@ -60,10 +60,45 @@ def test_initialize_from_labels():
nlp2 = Language()
lemmatizer2 = nlp2.add_pipe("trainable_lemmatizer")
lemmatizer2.initialize(
- get_examples=lambda: train_examples,
+ # We want to check that the strings in replacement nodes are
+ # added to the string store. Avoid that they get added through
+ # the examples.
+ get_examples=lambda: train_examples[:1],
labels=lemmatizer.label_data,
)
assert lemmatizer2.tree2label == {1: 0, 3: 1, 4: 2, 6: 3}
+ assert lemmatizer2.label_data == {
+ "trees": [
+ {"orig": "S", "subst": "s"},
+ {
+ "prefix_len": 1,
+ "suffix_len": 0,
+ "prefix_tree": 0,
+ "suffix_tree": 4294967295,
+ },
+ {"orig": "s", "subst": ""},
+ {
+ "prefix_len": 0,
+ "suffix_len": 1,
+ "prefix_tree": 4294967295,
+ "suffix_tree": 2,
+ },
+ {
+ "prefix_len": 0,
+ "suffix_len": 0,
+ "prefix_tree": 4294967295,
+ "suffix_tree": 4294967295,
+ },
+ {"orig": "E", "subst": "e"},
+ {
+ "prefix_len": 1,
+ "suffix_len": 0,
+ "prefix_tree": 5,
+ "suffix_tree": 4294967295,
+ },
+ ],
+ "labels": (1, 3, 4, 6),
+ }
def test_no_data():
From 916191848ab7bf90e88f23401451695f61903112 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 7 Dec 2022 18:09:04 +0900
Subject: [PATCH 13/36] Update scattertext example code (#11937)
* Update scattertext example code
* Remove PMI Filter Threshold
---
website/meta/universe.json | 25 +++++++++++++++++++------
1 file changed, 19 insertions(+), 6 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 97b53e9c5..8ca657561 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1468,13 +1468,26 @@
"image": "https://jasonkessler.github.io/2012conventions0.0.2.2.png",
"code_example": [
"import spacy",
- "import scattertext as st",
"",
- "nlp = spacy.load('en')",
- "corpus = st.CorpusFromPandas(convention_df,",
- " category_col='party',",
- " text_col='text',",
- " nlp=nlp).build()"
+ "from scattertext import SampleCorpora, produce_scattertext_explorer",
+ "from scattertext import produce_scattertext_html",
+ "from scattertext.CorpusFromPandas import CorpusFromPandas",
+ "",
+ "nlp = spacy.load('en_core_web_sm')",
+ "convention_df = SampleCorpora.ConventionData2012.get_data()",
+ "corpus = CorpusFromPandas(convention_df,",
+ " category_col='party',",
+ " text_col='text',",
+ " nlp=nlp).build()",
+ "",
+ "html = produce_scattertext_html(corpus,",
+ " category='democrat',",
+ " category_name='Democratic',",
+ " not_category_name='Republican',",
+ " minimum_term_frequency=5,",
+ " width_in_pixels=1000)",
+ "open('./simple.html', 'wb').write(html.encode('utf-8'))",
+ "print('Open ./simple.html in Chrome or Firefox.')"
],
"author": "Jason Kessler",
"author_links": {
From 5c3a60e8f4273aff7bd47bce01d62c8224967045 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 7 Dec 2022 23:52:35 +0900
Subject: [PATCH 14/36] Add in errors used in the beam code that were removed
at some point (#11935)
I don't think there's any way to use the beam code at the moment, but as
long as it's around, the errors it refers to should also be present.
---
spacy/errors.py | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/spacy/errors.py b/spacy/errors.py
index e34614b0f..0e5ef91ed 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -345,6 +345,11 @@ class Errors(metaclass=ErrorsWithCodes):
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
+ E079 = ("Error computing states in beam: number of predicted beams "
+ "({pbeams}) does not equal number of gold beams ({gbeams}).")
+ E080 = ("Duplicate state found in beam: {key}.")
+ E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
+ "does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
From 73919336fb1b003425373a07d41e5541dc5c3c46 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 7 Dec 2022 23:56:03 +0900
Subject: [PATCH 15/36] Remove spacy-sentence-segmenter from Universe (#11932)
---
website/meta/universe.json | 19 -------------------
1 file changed, 19 deletions(-)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index 8ca657561..db533c3b2 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -1023,25 +1023,6 @@
},
"category": ["pipeline"]
},
- {
- "id": "spacy-sentence-segmenter",
- "title": "Sentence Segmenter",
- "slogan": "Custom sentence segmentation for spaCy",
- "code_example": [
- "from seg.newline.segmenter import NewLineSegmenter",
- "import spacy",
- "",
- "nlseg = NewLineSegmenter()",
- "nlp = spacy.load('en')",
- "nlp.add_pipe(nlseg.set_sent_starts, name='sentence_segmenter', before='parser')",
- "doc = nlp(my_doc_text)"
- ],
- "author": "tc64",
- "author_links": {
- "github": "tc64"
- },
- "category": ["pipeline"]
- },
{
"id": "spacy_cld",
"title": "spaCy-CLD",
From 6d2ca1ab3a545491acbe058035677a263135e52a Mon Sep 17 00:00:00 2001
From: vincent d warmerdam
Date: Wed, 7 Dec 2022 16:02:09 +0100
Subject: [PATCH 16/36] Update custom solutions links (#11903)
* Update custom solutions
Will now point to https://explosion.ai/custom-solutions
* added-sidebar
* added-analysis-to-readme
* update-landing-page
---
README.md | 2 ++
website/meta/sidebars.json | 2 +-
website/meta/site.json | 2 +-
website/src/widgets/landing.js | 4 ++--
4 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/README.md b/README.md
index 7595460fb..195424551 100644
--- a/README.md
+++ b/README.md
@@ -46,6 +46,7 @@ open-source software, released under the [MIT license](https://github.com/explos
| 🛠 **[Changelog]** | Changes and version history. |
| 💝 **[Contribute]** | How to contribute to the spaCy project and code base. |
|
| Get a custom spaCy pipeline, tailor-made for your NLP problem by spaCy's core developers. Streamlined, production-ready, predictable and maintainable. Start by completing our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-pipelines)** |
+|
| Bespoke advice for problem solving, strategy and analysis for applied NLP projects. Services include data strategy, code reviews, pipeline design and annotation coaching. Curious? Fill in our 5-minute questionnaire to tell us what you need and we'll be in touch! **[Learn more →](https://explosion.ai/spacy-tailored-analysis)** |
[spacy 101]: https://spacy.io/usage/spacy-101
[new in v3.0]: https://spacy.io/usage/v3
@@ -59,6 +60,7 @@ open-source software, released under the [MIT license](https://github.com/explos
[changelog]: https://spacy.io/usage#changelog
[contribute]: https://github.com/explosion/spaCy/blob/master/CONTRIBUTING.md
+
## 💬 Where to ask questions
The spaCy project is maintained by the [spaCy team](https://explosion.ai/about).
diff --git a/website/meta/sidebars.json b/website/meta/sidebars.json
index 2d8745d77..339e4085b 100644
--- a/website/meta/sidebars.json
+++ b/website/meta/sidebars.json
@@ -45,7 +45,7 @@
{ "text": "v2.x Documentation", "url": "https://v2.spacy.io" },
{
"text": "Custom Solutions",
- "url": "https://explosion.ai/spacy-tailored-pipelines"
+ "url": "https://explosion.ai/custom-solutions"
}
]
}
diff --git a/website/meta/site.json b/website/meta/site.json
index 360a72178..fa79d3c69 100644
--- a/website/meta/site.json
+++ b/website/meta/site.json
@@ -51,7 +51,7 @@
{ "text": "Online Course", "url": "https://course.spacy.io" },
{
"text": "Custom Solutions",
- "url": "https://explosion.ai/spacy-tailored-pipelines"
+ "url": "https://explosion.ai/custom-solutions"
}
]
},
diff --git a/website/src/widgets/landing.js b/website/src/widgets/landing.js
index b7ae35f6e..c3aaa8a22 100644
--- a/website/src/widgets/landing.js
+++ b/website/src/widgets/landing.js
@@ -105,13 +105,13 @@ const Landing = ({ data }) => {
-
+
From f22fc7a1138545a2a75975909b5af554e8e1d616 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 9 Dec 2022 10:15:52 +0100
Subject: [PATCH 17/36] Auto-format code with black (#11955)
Co-authored-by: explosion-bot
---
spacy/tests/test_cli.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 3104b49ff..42af08749 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -140,7 +140,7 @@ def test_issue11235():
assert os.path.exists(d / "cfg")
assert os.path.exists(d / f"{lang_var}_model")
assert cfg["commands"][0]["script"][0] == f"hello {lang_var}"
-
+
def test_cli_info():
nlp = Dutch()
From 8c291ace0c0978e70257906438d3585022090e9f Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 12 Dec 2022 08:38:36 +0100
Subject: [PATCH 18/36] Extend to wasabi v1.1 (#11945)
* Extend to wasabi v1.1
* Temporarily run mypy and tests with newest wasabi
* Temporarily skip check requirements test
* Revert "Temporarily skip check requirements test"
This reverts commit 44f4ce20a8e8c92e8bfc8042cc68333589a96253.
* Revert "Temporarily run mypy and tests with newest wasabi"
This reverts commit e677a2257ced55e696cafc3a8e89eb2f7ddfc369.
---
requirements.txt | 2 +-
setup.cfg | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 778c05e21..0440835f2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,7 +6,7 @@ preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
ml_datasets>=0.2.0,<0.3.0
murmurhash>=0.28.0,<1.1.0
-wasabi>=0.9.1,<1.1.0
+wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
typer>=0.3.0,<0.8.0
diff --git a/setup.cfg b/setup.cfg
index 5768c9d3e..cf6e6f84b 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,7 +47,7 @@ install_requires =
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
thinc>=8.1.0,<8.2.0
- wasabi>=0.9.1,<1.1.0
+ wasabi>=0.9.1,<1.2.0
srsly>=2.4.3,<3.0.0
catalogue>=2.0.6,<2.1.0
# Third-party dependencies
From 0591e67265d7378769c0fc0df4020817f2d514ec Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 12 Dec 2022 08:45:35 +0100
Subject: [PATCH 19/36] Cast to uint64 for all array-based doc representations
(#11933)
* Convert all individual values explicitly to uint64 for array-based doc representations
* Temporarily test with latest numpy v1.24.0rc
* Remove unnecessary conversion from attr_t
* Reduce number of individual casts
* Convert specifically from int32 to uint64
* Revert "Temporarily test with latest numpy v1.24.0rc"
This reverts commit eb0e3c5006515b9a7ff52bae59484c909b8a3f65.
* Also use int32 in tests
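A minimal sketch of the cast being standardized here: `HEAD` values can be negative offsets, the backing arrays are `uint64`, and newer numpy (the 1.24 release candidate was used for testing above) rejects implicit negative-to-unsigned assignment, so the conversion is made explicit:

```python
import numpy

arr = numpy.zeros(3, dtype=numpy.uint64)
# arr[0] = -1  # would raise OverflowError on numpy >= 1.24
arr[0] = numpy.int32(-1).astype(numpy.uint64)  # explicit two's-complement wraparound
assert arr[0] == numpy.uint64(2**64 - 1)
```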
---
spacy/tests/doc/test_array.py | 4 ++--
spacy/tokens/doc.pyx | 2 ++
spacy/tokens/span.pyx | 4 ++--
spacy/training/example.pyx | 15 ++++++++-------
4 files changed, 14 insertions(+), 11 deletions(-)
diff --git a/spacy/tests/doc/test_array.py b/spacy/tests/doc/test_array.py
index c334cc6eb..1f2d7d999 100644
--- a/spacy/tests/doc/test_array.py
+++ b/spacy/tests/doc/test_array.py
@@ -123,14 +123,14 @@ def test_doc_from_array_heads_in_bounds(en_vocab):
# head before start
arr = doc.to_array(["HEAD"])
- arr[0] = -1
+ arr[0] = numpy.int32(-1).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
# head after end
arr = doc.to_array(["HEAD"])
- arr[0] = 5
+ arr[0] = numpy.int32(5).astype(numpy.uint64)
doc_from_array = Doc(en_vocab, words=words)
with pytest.raises(ValueError):
doc_from_array.from_array(["HEAD"], arr)
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index f2621292c..075bc4d15 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -359,6 +359,7 @@ cdef class Doc:
for annot in annotations:
if annot:
if annot is heads or annot is sent_starts or annot is ent_iobs:
+ annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
if attrs.ndim == 1:
attrs[i] = annot[i]
@@ -1558,6 +1559,7 @@ cdef class Doc:
for j, (attr, annot) in enumerate(token_annotations.items()):
if attr is HEAD:
+ annot = numpy.array(annot, dtype=numpy.int32).astype(numpy.uint64)
for i in range(len(words)):
array[i, j] = annot[i]
elif attr is MORPH:
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index c3495f497..99a5f43bd 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -299,7 +299,7 @@ cdef class Span:
for ancestor in ancestors:
ancestor_i = ancestor.i - self.c.start
if ancestor_i in range(length):
- array[i, head_col] = ancestor_i - i
+ array[i, head_col] = numpy.int32(ancestor_i - i).astype(numpy.uint64)
# if there is no appropriate ancestor, define a new artificial root
value = array[i, head_col]
@@ -307,7 +307,7 @@ cdef class Span:
new_root = old_to_new_root.get(ancestor_i, None)
if new_root is not None:
# take the same artificial root as a previous token from the same sentence
- array[i, head_col] = new_root - i
+ array[i, head_col] = numpy.int32(new_root - i).astype(numpy.uint64)
else:
# set this token as the new artificial root
array[i, head_col] = 0
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index dfd337b9e..95b0f0de9 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -443,26 +443,27 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key not in IDS:
raise ValueError(Errors.E974.format(obj="token", key=key))
elif key in ["ORTH", "SPACY"]:
- pass
+ continue
elif key == "HEAD":
attrs.append(key)
- values.append([h-i if h is not None else 0 for i, h in enumerate(value)])
+ row = [h-i if h is not None else 0 for i, h in enumerate(value)]
elif key == "DEP":
attrs.append(key)
- values.append([vocab.strings.add(h) if h is not None else MISSING_DEP for h in value])
+ row = [vocab.strings.add(h) if h is not None else MISSING_DEP for h in value]
elif key == "SENT_START":
attrs.append(key)
- values.append([to_ternary_int(v) for v in value])
+ row = [to_ternary_int(v) for v in value]
elif key == "MORPH":
attrs.append(key)
- values.append([vocab.morphology.add(v) for v in value])
+ row = [vocab.morphology.add(v) for v in value]
else:
attrs.append(key)
if not all(isinstance(v, str) for v in value):
types = set([type(v) for v in value])
raise TypeError(Errors.E969.format(field=key, types=types)) from None
- values.append([vocab.strings.add(v) for v in value])
- array = numpy.asarray(values, dtype="uint64")
+ row = [vocab.strings.add(v) for v in value]
+ values.append([numpy.array(v, dtype=numpy.int32).astype(numpy.uint64) if v < 0 else v for v in row])
+ array = numpy.array(values, dtype=numpy.uint64)
return attrs, array.T
From e5c7f3b0776d49c4f6aab7e02b503cdb84fb2134 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Mon, 12 Dec 2022 10:13:10 +0100
Subject: [PATCH 20/36] CI: Install thinc-apple-ops through extra (#11963)
---
.github/azure-steps.yml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml
index 2f77706b8..d0db75f9a 100644
--- a/.github/azure-steps.yml
+++ b/.github/azure-steps.yml
@@ -107,7 +107,7 @@ steps:
displayName: "Run CPU tests"
- script: |
- python -m pip install --pre thinc-apple-ops
+ python -m pip install 'spacy[apple]'
python -m pytest --pyargs spacy
displayName: "Run CPU tests with thinc-apple-ops"
condition: and(startsWith(variables['imageName'], 'macos'), eq(variables['python.version'], '3.11'))
From c9d9d6847f9685c21eeec01f4b8cd053cadf8bf5 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 15 Dec 2022 10:55:01 +0100
Subject: [PATCH 21/36] Update build constraints for python 3.11 (#11981)
---
build-constraints.txt | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/build-constraints.txt b/build-constraints.txt
index 956973abf..c1e82f1b0 100644
--- a/build-constraints.txt
+++ b/build-constraints.txt
@@ -5,4 +5,5 @@ numpy==1.17.3; python_version=='3.8' and platform_machine!='aarch64'
numpy==1.19.2; python_version=='3.8' and platform_machine=='aarch64'
numpy==1.19.3; python_version=='3.9'
numpy==1.21.3; python_version=='3.10'
-numpy; python_version>='3.11'
+numpy==1.23.2; python_version=='3.11'
+numpy; python_version>='3.12'
From 3a2b655a29203d1c181a2c14d230b3f9cf8dd54a Mon Sep 17 00:00:00 2001
From: cfuerbachersparks <119413757+cfuerbachersparks@users.noreply.github.com>
Date: Mon, 19 Dec 2022 10:33:38 +0100
Subject: [PATCH 22/36] Update lexeme.md (#11994)
Correct the `suffix_` description: the substring is taken from the end of the word, not the start.
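A quick illustration of the corrected description (a sketch using a blank English pipeline):

```python
import spacy

lex = spacy.blank("en").vocab["spaCy"]
assert lex.prefix_ == "s"    # length-1 substring from the start of the word
assert lex.suffix_ == "aCy"  # length-3 substring from the end, as the fix states
```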
---
website/docs/api/lexeme.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index eb76afa90..557d04cce 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -138,7 +138,7 @@ The L2 norm of the lexeme's vector representation.
| `prefix` | Length-N substring from the start of the word. Defaults to `N=1`. ~~int~~ |
| `prefix_` | Length-N substring from the start of the word. Defaults to `N=1`. ~~str~~ |
| `suffix` | Length-N substring from the end of the word. Defaults to `N=3`. ~~int~~ |
-| `suffix_` | Length-N substring from the start of the word. Defaults to `N=3`. ~~str~~ |
+| `suffix_` | Length-N substring from the end of the word. Defaults to `N=3`. ~~str~~ |
| `is_alpha` | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. ~~bool~~ |
| `is_ascii` | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`. ~~bool~~ |
| `is_digit` | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. ~~bool~~ |
From 18ffe5bbd6a554920107ff48d1387df34c3f872a Mon Sep 17 00:00:00 2001
From: Jos Polfliet
Date: Mon, 19 Dec 2022 16:17:49 +0100
Subject: [PATCH 23/36] Update stop_words.py (#11997)
fix typo in "aangaande"
---
spacy/lang/nl/stop_words.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/lang/nl/stop_words.py b/spacy/lang/nl/stop_words.py
index a2c6198e7..cd4fdefdf 100644
--- a/spacy/lang/nl/stop_words.py
+++ b/spacy/lang/nl/stop_words.py
@@ -15,7 +15,7 @@
STOP_WORDS = set(
"""
-aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaangde aangezien achter achterna
+aan af al alle alles allebei alleen allen als altijd ander anders andere anderen aangaande aangezien achter achterna
afgelopen aldus alhoewel anderzijds
ben bij bijna bijvoorbeeld behalve beide beiden beneden bent bepaald beter betere betreffende binnen binnenin boven
From c223cd7a86f460f3dabb9e7369eef136a653218e Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Tue, 20 Dec 2022 17:11:33 +0100
Subject: [PATCH 24/36] Add apply CLI (#11376)
* annotate cli first try
* add batch-size and n_process
* rename to apply
* typing fix
* handle file suffixes
* walk directories
* support jsonl
* typing fix
* remove debug
* make suffix optional for walk
* revert unrelated
* don't warn but raise
* better error message
* minor touch up
* Update spacy/tests/test_cli.py
Co-authored-by: Adriane Boyd
* Update spacy/cli/apply.py
Co-authored-by: Sofie Van Landeghem
* Update spacy/cli/apply.py
Co-authored-by: Sofie Van Landeghem
* update tests and bugfix
* add force_overwrite
* typo
* fix adding .spacy suffix
* Update spacy/cli/apply.py
Co-authored-by: Sofie Van Landeghem
* Update spacy/cli/apply.py
Co-authored-by: Sofie Van Landeghem
* Update spacy/cli/apply.py
Co-authored-by: Sofie Van Landeghem
* store user data and rename cmd arg
* include test for user attr
* rename cmd arg
* better help message
* documentation
* prettier
* black
* link fix
* Update spacy/cli/apply.py
Co-authored-by: Paul O'Leary McCann
* Update website/docs/api/cli.md
Co-authored-by: Paul O'Leary McCann
* Update website/docs/api/cli.md
Co-authored-by: Paul O'Leary McCann
* Update website/docs/api/cli.md
Co-authored-by: Paul O'Leary McCann
* addressing reviews
* dont quit but warn
* prettier
Co-authored-by: Adriane Boyd
Co-authored-by: Sofie Van Landeghem
Co-authored-by: Paul O'Leary McCann
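A minimal usage sketch of the new command's Python entry point (paths are illustrative; the positional arguments follow the `apply(data_path, output_file, model, json_field, batch_size, n_process)` signature added in this patch):

```python
from pathlib import Path
from spacy.cli.apply import apply

# Annotate every .spacy/.jsonl/plain-text file found under ./texts with a
# blank English pipeline and store the predicted docs in one DocBin file.
apply(Path("texts"), Path("annotated.spacy"), "blank:en", "text", 1, 1)
```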
---
spacy/cli/__init__.py | 1 +
spacy/cli/_util.py | 23 +++++++
spacy/cli/apply.py | 143 ++++++++++++++++++++++++++++++++++++++++
spacy/cli/convert.py | 31 +--------
spacy/tests/test_cli.py | 78 ++++++++++++++++++++++
website/docs/api/cli.md | 35 +++++++++-
6 files changed, 280 insertions(+), 31 deletions(-)
create mode 100644 spacy/cli/apply.py
diff --git a/spacy/cli/__init__.py b/spacy/cli/__init__.py
index aab2c8d12..aabd1cfef 100644
--- a/spacy/cli/__init__.py
+++ b/spacy/cli/__init__.py
@@ -16,6 +16,7 @@ from .debug_config import debug_config # noqa: F401
from .debug_model import debug_model # noqa: F401
from .debug_diff import debug_diff # noqa: F401
from .evaluate import evaluate # noqa: F401
+from .apply import apply # noqa: F401
from .convert import convert # noqa: F401
from .init_pipeline import init_pipeline_cli # noqa: F401
from .init_config import init_config, fill_config # noqa: F401
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index 9b97a9f19..c46abffe5 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -582,6 +582,29 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
local_msg.info("To switch to GPU 0, use the option: --gpu-id 0")
+def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
+ if not path.is_dir():
+ return [path]
+ paths = [path]
+ locs = []
+ seen = set()
+ for path in paths:
+ if str(path) in seen:
+ continue
+ seen.add(str(path))
+ if path.parts[-1].startswith("."):
+ continue
+ elif path.is_dir():
+ paths.extend(path.iterdir())
+ elif suffix is not None and not path.parts[-1].endswith(suffix):
+ continue
+ else:
+ locs.append(path)
+ # It's good to sort these, in case the ordering messes up cache.
+ locs.sort()
+ return locs
+
+
def _format_number(number: Union[int, float], ndigits: int = 2) -> str:
"""Formats a number (float or int) rounding to `ndigits`, without truncating trailing 0s,
as happens with `round(number, ndigits)`"""
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
new file mode 100644
index 000000000..9d170bc95
--- /dev/null
+++ b/spacy/cli/apply.py
@@ -0,0 +1,143 @@
+import tqdm
+import srsly
+
+from itertools import chain
+from pathlib import Path
+from typing import Optional, List, Iterable, cast, Union
+
+from wasabi import msg
+
+from ._util import app, Arg, Opt, setup_gpu, import_code, walk_directory
+
+from ..tokens import Doc, DocBin
+from ..vocab import Vocab
+from ..util import ensure_path, load_model
+
+
+path_help = """Location of the documents to predict on.
+Can be a single file in .spacy format or a .jsonl file.
+Files with other extensions are treated as single plain text documents.
+If a directory is provided it is traversed recursively to grab
+all files to be processed.
+The files can be a mixture of .spacy, .jsonl and text files.
+If a .jsonl file is provided, the specified field is
+read ("text" by default)."""
+
+out_help = "Path to save the resulting .spacy file"
+code_help = (
+ "Path to Python file with additional " "code (registered functions) to be imported"
+)
+gold_help = "Use gold preprocessing provided in the .spacy files"
+force_msg = (
+ "The provided output file already exists. "
+ "To force overwriting the output file, set the --force or -F flag."
+)
+
+
+DocOrStrStream = Union[Iterable[str], Iterable[Doc]]
+
+
+def _stream_docbin(path: Path, vocab: Vocab) -> Iterable[Doc]:
+ """
+ Stream Doc objects from DocBin.
+ """
+ docbin = DocBin().from_disk(path)
+ for doc in docbin.get_docs(vocab):
+ yield doc
+
+
+def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
+ """
+ Stream "text" field from JSONL. If the field "text" is
+ not found it raises error.
+ """
+ for entry in srsly.read_jsonl(path):
+ if field not in entry:
+ msg.fail(
+ f"{path} does not contain the required '{field}' field.", exits=1
+ )
+ else:
+ yield entry[field]
+
+
+def _stream_texts(paths: Iterable[Path]) -> Iterable[str]:
+ """
+ Yields strings from text files in paths.
+ """
+ for path in paths:
+ with open(path, "r") as fin:
+ text = fin.read()
+ yield text
+
+
+@app.command("apply")
+def apply_cli(
+ # fmt: off
+ model: str = Arg(..., help="Model name or path"),
+ data_path: Path = Arg(..., help=path_help, exists=True),
+ output_file: Path = Arg(..., help=out_help, dir_okay=False),
+ code_path: Optional[Path] = Opt(None, "--code", "-c", help=code_help),
+ text_key: str = Opt("text", "--text-key", "-tk", help="Key containing text string for JSONL"),
+ force_overwrite: bool = Opt(False, "--force", "-F", help="Force overwriting the output file"),
+ use_gpu: int = Opt(-1, "--gpu-id", "-g", help="GPU ID or -1 for CPU."),
+ batch_size: int = Opt(1, "--batch-size", "-b", help="Batch size."),
+    n_process: int = Opt(1, "--n-process", "-n", help="Number of processes to use.")
+):
+ """
+ Apply a trained pipeline to documents to get predictions.
+ Expects a loadable spaCy pipeline and path to the data, which
+ can be a directory or a file.
+ The data files can be provided in multiple formats:
+ 1. .spacy files
+ 2. .jsonl files with a specified "field" to read the text from.
+    3. Files with any other extension are assumed to contain
+       a single plain text document.
+ DOCS: https://spacy.io/api/cli#apply
+ """
+ data_path = ensure_path(data_path)
+ output_file = ensure_path(output_file)
+ code_path = ensure_path(code_path)
+ if output_file.exists() and not force_overwrite:
+ msg.fail(force_msg, exits=1)
+ if not data_path.exists():
+ msg.fail(f"Couldn't find data path: {data_path}", exits=1)
+ import_code(code_path)
+ setup_gpu(use_gpu)
+ apply(data_path, output_file, model, text_key, batch_size, n_process)
+
+
+def apply(
+ data_path: Path,
+ output_file: Path,
+ model: str,
+ json_field: str,
+ batch_size: int,
+ n_process: int,
+):
+ docbin = DocBin(store_user_data=True)
+ paths = walk_directory(data_path)
+ if len(paths) == 0:
+ docbin.to_disk(output_file)
+ msg.warn("Did not find data to process,"
+ f" {data_path} seems to be an empty directory.")
+ return
+ nlp = load_model(model)
+ msg.good(f"Loaded model {model}")
+ vocab = nlp.vocab
+ streams: List[DocOrStrStream] = []
+ text_files = []
+ for path in paths:
+ if path.suffix == ".spacy":
+ streams.append(_stream_docbin(path, vocab))
+ elif path.suffix == ".jsonl":
+ streams.append(_stream_jsonl(path, json_field))
+ else:
+ text_files.append(path)
+ if len(text_files) > 0:
+ streams.append(_stream_texts(text_files))
+ datagen = cast(DocOrStrStream, chain(*streams))
+ for doc in tqdm.tqdm(nlp.pipe(datagen, batch_size=batch_size, n_process=n_process)):
+ docbin.add(doc)
+ if output_file.suffix == "":
+ output_file = output_file.with_suffix(".spacy")
+ docbin.to_disk(output_file)
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 04eb7078f..7f365ae2c 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -1,4 +1,4 @@
-from typing import Callable, Iterable, Mapping, Optional, Any, List, Union
+from typing import Callable, Iterable, Mapping, Optional, Any, Union
from enum import Enum
from pathlib import Path
from wasabi import Printer
@@ -7,7 +7,7 @@ import re
import sys
import itertools
-from ._util import app, Arg, Opt
+from ._util import app, Arg, Opt, walk_directory
from ..training import docs_to_json
from ..tokens import Doc, DocBin
from ..training.converters import iob_to_docs, conll_ner_to_docs, json_to_docs
@@ -189,33 +189,6 @@ def autodetect_ner_format(input_data: str) -> Optional[str]:
return None
-def walk_directory(path: Path, converter: str) -> List[Path]:
- if not path.is_dir():
- return [path]
- paths = [path]
- locs = []
- seen = set()
- for path in paths:
- if str(path) in seen:
- continue
- seen.add(str(path))
- if path.parts[-1].startswith("."):
- continue
- elif path.is_dir():
- paths.extend(path.iterdir())
- elif converter == "json" and not path.parts[-1].endswith("json"):
- continue
- elif converter == "conll" and not path.parts[-1].endswith("conll"):
- continue
- elif converter == "iob" and not path.parts[-1].endswith("iob"):
- continue
- else:
- locs.append(path)
- # It's good to sort these, in case the ordering messes up cache.
- locs.sort()
- return locs
-
-
def verify_cli_args(
msg: Printer,
input_path: Path,
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index 42af08749..c6768a3fd 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -5,6 +5,7 @@ from typing import Tuple, List, Dict, Any
import pkg_resources
import time
+import spacy
import numpy
import pytest
import srsly
@@ -32,6 +33,7 @@ from spacy.cli.package import _is_permitted_package_name
from spacy.cli.project.remote_storage import RemoteStorage
from spacy.cli.project.run import _check_requirements
from spacy.cli.validate import get_model_pkgs
+from spacy.cli.apply import apply
from spacy.cli.find_threshold import find_threshold
from spacy.lang.en import English
from spacy.lang.nl import Dutch
@@ -885,6 +887,82 @@ def test_span_length_freq_dist_output_must_be_correct():
assert list(span_freqs.keys()) == [3, 1, 4, 5, 2]
+def test_applycli_empty_dir():
+ with make_tempdir() as data_path:
+ output = data_path / "test.spacy"
+ apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_docbin():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ nlp = spacy.blank("en")
+ doc = nlp("testing apply cli.")
+ # test empty DocBin case
+ docbin = DocBin()
+ docbin.to_disk(data_path / "testin.spacy")
+ apply(data_path, output, "blank:en", "text", 1, 1)
+ docbin.add(doc)
+ docbin.to_disk(data_path / "testin.spacy")
+ apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_jsonl():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ data = [{"field": "Testing apply cli.", "key": 234}]
+ data2 = [{"field": "234"}]
+ srsly.write_jsonl(data_path / "test.jsonl", data)
+ apply(data_path, output, "blank:en", "field", 1, 1)
+ srsly.write_jsonl(data_path / "test2.jsonl", data2)
+ apply(data_path, output, "blank:en", "field", 1, 1)
+
+
+def test_applycli_txt():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ with open(data_path / "test.foo", "w") as ftest:
+ ftest.write("Testing apply cli.")
+ apply(data_path, output, "blank:en", "text", 1, 1)
+
+
+def test_applycli_mixed():
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ text = "Testing apply cli"
+ nlp = spacy.blank("en")
+ doc = nlp(text)
+ jsonl_data = [{"text": text}]
+ srsly.write_jsonl(data_path / "test.jsonl", jsonl_data)
+ docbin = DocBin()
+ docbin.add(doc)
+ docbin.to_disk(data_path / "testin.spacy")
+ with open(data_path / "test.txt", "w") as ftest:
+ ftest.write(text)
+ apply(data_path, output, "blank:en", "text", 1, 1)
+ # Check whether it worked
+ result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+ assert len(result) == 3
+ for doc in result:
+ assert doc.text == text
+
+
+def test_applycli_user_data():
+ Doc.set_extension("ext", default=0)
+ val = ("ext", 0)
+ with make_tempdir() as data_path:
+ output = data_path / "testout.spacy"
+ nlp = spacy.blank("en")
+ doc = nlp("testing apply cli.")
+ doc._.ext = val
+ docbin = DocBin(store_user_data=True)
+ docbin.add(doc)
+ docbin.to_disk(data_path / "testin.spacy")
+ apply(data_path, output, "blank:en", "", 1, 1)
+ result = list(DocBin().from_disk(output).get_docs(nlp.vocab))
+ assert result[0]._.ext == val
+
+
def test_local_remote_storage():
with make_tempdir() as d:
filename = "a.txt"
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 8823a3bd8..275e37ee0 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -12,6 +12,7 @@ menu:
- ['train', 'train']
- ['pretrain', 'pretrain']
- ['evaluate', 'evaluate']
+ - ['apply', 'apply']
- ['find-threshold', 'find-threshold']
- ['assemble', 'assemble']
- ['package', 'package']
@@ -474,7 +475,7 @@ report span characteristics such as the average span length and the span (or
span boundary) distinctiveness. The distinctiveness measure shows how different
the tokens are with respect to the rest of the corpus using the KL-divergence of
the token distributions. To learn more, you can check out Papay et al.'s work on
-[*Dissecting Span Identification Tasks with Performance Prediction* (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
+[_Dissecting Span Identification Tasks with Performance Prediction_ (EMNLP 2020)](https://aclanthology.org/2020.emnlp-main.396/).
@@ -1162,6 +1163,37 @@ $ python -m spacy evaluate [model] [data_path] [--output] [--code] [--gold-prepr
| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
| **CREATES** | Training results and optional metrics and visualizations. |
+## apply {#apply new="3.5" tag="command"}
+
+Applies a trained pipeline to data and stores the resulting annotated documents
+in a `DocBin`. The input can be a single file or a directory. The recognized
+input formats are:
+
+1. `.spacy`
+2. `.jsonl` containing a user-specified `text_key`
+3. Files with any other extension are assumed to be plain text files containing
+ a single document.
+
+When a directory is provided it is traversed recursively to collect all files.
+
+```cli
+$ python -m spacy apply [model] [data-path] [output-file] [--code] [--text-key] [--force-overwrite] [--gpu-id] [--batch-size] [--n-process]
+```
+
+| Name | Description |
+| ----------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| `model` | Pipeline to apply to the data. Can be a package or a path to a data directory. ~~str (positional)~~ |
+| `data_path` | Location of data to be evaluated in spaCy's [binary format](/api/data-formats#training), jsonl, or plain text. ~~Path (positional)~~ |
+| `output-file`, `-o` | Output `DocBin` path. ~~str (positional)~~ |
+| `--code`, `-c` 3 | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-functions) for new architectures. ~~Optional[Path] \(option)~~ |
+| `--text-key`, `-tk` | The key for `.jsonl` files to use to grab the texts from. Defaults to `text`. ~~Optional[str] \(option)~~ |
+| `--force-overwrite`, `-F` | If the provided `output-file` already exists, then force `apply` to overwrite it. If this is `False` (default) then quits with a warning instead. ~~bool (flag)~~ |
+| `--gpu-id`, `-g` | GPU to use, if any. Defaults to `-1` for CPU. ~~int (option)~~ |
+| `--batch-size`, `-b` | Batch size to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--n-process`, `-n` | Number of processes to use for prediction. Defaults to `1`. ~~int (option)~~ |
+| `--help`, `-h` | Show help message and available arguments. ~~bool (flag)~~ |
+| **CREATES** | A `DocBin` with the annotations from the `model` for all the files found in `data-path`. |
+
## find-threshold {#find-threshold new="3.5" tag="command"}
Runs prediction trials for a trained model with varying thresholds to maximize
@@ -1187,7 +1219,6 @@ be provided.
> $ python -m spacy find-threshold my_nlp data.spacy spancat threshold spans_sc_f
> ```
-
| Name | Description |
| ----------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `model` | Pipeline to evaluate. Can be a package or a path to a data directory. ~~str (positional)~~ |
From eef3d950b4266ab9545143de8070456ce7967950 Mon Sep 17 00:00:00 2001
From: Raphael Mitsch
Date: Wed, 21 Dec 2022 18:54:27 +0100
Subject: [PATCH 25/36] Fix `SpanGroup` and `Span` typing (#12009)
* Correct Span.label, Span.kb_id types. Fix SpanGroup.__iter__().
* Extend test.
* Rename test. Fix typo.
* Add comment.
* Fix types for Span.label, Span.kb_id, Span.char_span().
* Update spacy/tests/doc/test_span_group.py
Co-authored-by: Adriane Boyd
* Update docs.
* Fix typo.
* Update spacy/tokens/span_group.pyx
Co-authored-by: Adriane Boyd
Co-authored-by: Adriane Boyd
---
spacy/tests/doc/test_span_group.py | 15 ++++++++++++++-
spacy/tokens/span.pyi | 4 ++--
spacy/tokens/span_group.pyi | 1 +
spacy/tokens/span_group.pyx | 10 ++++++++++
website/docs/api/spangroup.md | 17 +++++++++++++++++
5 files changed, 44 insertions(+), 3 deletions(-)
diff --git a/spacy/tests/doc/test_span_group.py b/spacy/tests/doc/test_span_group.py
index 8c70a83e1..818569c64 100644
--- a/spacy/tests/doc/test_span_group.py
+++ b/spacy/tests/doc/test_span_group.py
@@ -1,7 +1,10 @@
+from typing import List
+
import pytest
from random import Random
from spacy.matcher import Matcher
-from spacy.tokens import Span, SpanGroup
+from spacy.tokens import Span, SpanGroup, Doc
+from spacy.util import filter_spans
@pytest.fixture
@@ -240,3 +243,13 @@ def test_span_group_extend(doc):
def test_span_group_dealloc(span_group):
with pytest.raises(AttributeError):
print(span_group.doc)
+
+
+@pytest.mark.issue(11975)
+def test_span_group_typing(doc: Doc):
+ """Tests whether typing of `SpanGroup` as `Iterable[Span]`-like object is accepted by mypy."""
+ span_group: SpanGroup = doc.spans["SPANS"]
+ spans: List[Span] = list(span_group)
+ for i, span in enumerate(span_group):
+ assert span == span_group[i] == spans[i]
+ filter_spans(span_group)
diff --git a/spacy/tokens/span.pyi b/spacy/tokens/span.pyi
index 0a6f306a6..9986a90e6 100644
--- a/spacy/tokens/span.pyi
+++ b/spacy/tokens/span.pyi
@@ -95,8 +95,8 @@ class Span:
self,
start_idx: int,
end_idx: int,
- label: int = ...,
- kb_id: int = ...,
+ label: Union[int, str] = ...,
+ kb_id: Union[int, str] = ...,
vector: Optional[Floats1d] = ...,
) -> Span: ...
@property
diff --git a/spacy/tokens/span_group.pyi b/spacy/tokens/span_group.pyi
index 21cd124ab..0b4aa83aa 100644
--- a/spacy/tokens/span_group.pyi
+++ b/spacy/tokens/span_group.pyi
@@ -18,6 +18,7 @@ class SpanGroup:
def doc(self) -> Doc: ...
@property
def has_overlap(self) -> bool: ...
+ def __iter__(self): ...
def __len__(self) -> int: ...
def append(self, span: Span) -> None: ...
def extend(self, spans: Iterable[Span]) -> None: ...
diff --git a/spacy/tokens/span_group.pyx b/spacy/tokens/span_group.pyx
index 1aa3c0bc8..608dda283 100644
--- a/spacy/tokens/span_group.pyx
+++ b/spacy/tokens/span_group.pyx
@@ -158,6 +158,16 @@ cdef class SpanGroup:
return self._concat(other)
return NotImplemented
+ def __iter__(self):
+ """
+ Iterate over the spans in this SpanGroup.
+ YIELDS (Span): A span in this SpanGroup.
+
+ DOCS: https://spacy.io/api/spangroup#iter
+ """
+ for i in range(self.c.size()):
+ yield self[i]
+
def append(self, Span span):
"""Add a span to the group. The span must refer to the same Doc
object as the span group.
diff --git a/website/docs/api/spangroup.md b/website/docs/api/spangroup.md
index 2d1cf73c4..bd9659acb 100644
--- a/website/docs/api/spangroup.md
+++ b/website/docs/api/spangroup.md
@@ -202,6 +202,23 @@ already present in the current span group.
| `other` | The span group or spans to append. ~~Union[SpanGroup, Iterable[Span]]~~ |
| **RETURNS** | The span group. ~~SpanGroup~~ |
+## SpanGroup.\_\_iter\_\_ {#iter tag="method" new="3.5"}
+
+Iterate over the spans in this span group.
+
+> #### Example
+>
+> ```python
+> doc = nlp("Their goi ng home")
+> doc.spans["errors"] = [doc[0:1], doc[1:3]]
+> for error_span in doc.spans["errors"]:
+> print(error_span)
+> ```
+
+| Name | Description |
+| ---------- | ----------------------------------- |
+| **YIELDS** | A span in this span group. ~~Span~~ |
+
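The regression test above doubles as a usage sketch: a `SpanGroup` can now be consumed anywhere an `Iterable[Span]` is expected. A minimal example, where the text and span contents are illustrative:

```python
from typing import List

import spacy
from spacy.tokens import Span

nlp = spacy.blank("en")
doc = nlp("Their goi ng home")
doc.spans["errors"] = [doc[0:1], doc[1:3]]
# SpanGroup now satisfies Iterable[Span], so this assignment type-checks:
spans: List[Span] = list(doc.spans["errors"])
```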
## SpanGroup.append {#append tag="method"}
Add a [`Span`](/api/span) object to the group. The span must refer to the same
From 64d2d27c5dbf8e5657187975d2c9627f30e108a2 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 22 Dec 2022 10:53:16 +0100
Subject: [PATCH 26/36] Add classifier for python 3.11 (#12013)
---
setup.cfg | 1 +
1 file changed, 1 insertion(+)
diff --git a/setup.cfg b/setup.cfg
index cf6e6f84b..d290d706c 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -22,6 +22,7 @@ classifiers =
Programming Language :: Python :: 3.8
Programming Language :: Python :: 3.9
Programming Language :: Python :: 3.10
+ Programming Language :: Python :: 3.11
Topic :: Scientific/Engineering
project_urls =
Release notes = https://github.com/explosion/spaCy/releases
From 90896504a5dba1babac04a2b88662179409ae006 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Fri, 23 Dec 2022 12:44:07 +0100
Subject: [PATCH 27/36] Auto-format code with black (#12019)
Co-authored-by: explosion-bot
---
spacy/cli/apply.py | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/spacy/cli/apply.py b/spacy/cli/apply.py
index 9d170bc95..f0df4e757 100644
--- a/spacy/cli/apply.py
+++ b/spacy/cli/apply.py
@@ -53,9 +53,7 @@ def _stream_jsonl(path: Path, field: str) -> Iterable[str]:
"""
for entry in srsly.read_jsonl(path):
if field not in entry:
- msg.fail(
- f"{path} does not contain the required '{field}' field.", exits=1
- )
+ msg.fail(f"{path} does not contain the required '{field}' field.", exits=1)
else:
yield entry[field]
@@ -118,8 +116,10 @@ def apply(
paths = walk_directory(data_path)
if len(paths) == 0:
docbin.to_disk(output_file)
- msg.warn("Did not find data to process,"
- f" {data_path} seems to be an empty directory.")
+ msg.warn(
+ "Did not find data to process,"
+ f" {data_path} seems to be an empty directory."
+ )
return
nlp = load_model(model)
msg.good(f"Loaded model {model}")
From aa2b471a6e289d1c1bb51558df779ae028671225 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Fri, 23 Dec 2022 15:21:44 +0100
Subject: [PATCH 28/36] New console logger with expanded progress tracking
(#11972)
* Add `ConsoleLogger.v3`
This addition expands the progress bar feature to count the training/distillation steps up to either the next evaluation pass or the maximum number of steps.
* Rename progress bar types
* Add defaults to docs
Minor fixes
* Move comment
* Minor punctuation fixes
* Explicitly check for `None` when validating progress bar type
Co-authored-by: Paul O'Leary McCann
---
spacy/errors.py | 1 +
spacy/training/loggers.py | 48 ++++++++++++++++++++++++++++++++---
website/docs/api/top-level.md | 34 ++++++++++++++++++++-----
3 files changed, 74 insertions(+), 9 deletions(-)
diff --git a/spacy/errors.py b/spacy/errors.py
index 0e5ef91ed..cd9281e91 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -962,6 +962,7 @@ class Errors(metaclass=ErrorsWithCodes):
E1046 = ("{cls_name} is an abstract class and cannot be instantiated. If you are looking for spaCy's default "
"knowledge base, use `InMemoryLookupKB`.")
E1047 = ("`find_threshold()` only supports components with a `scorer` attribute.")
+ E1048 = ("Got '{unexpected}' as console progress bar type, but expected one of the following: {expected}")
# Deprecated model shortcuts, only used in errors and warnings
diff --git a/spacy/training/loggers.py b/spacy/training/loggers.py
index 408ea7140..7de31822e 100644
--- a/spacy/training/loggers.py
+++ b/spacy/training/loggers.py
@@ -26,6 +26,8 @@ def setup_table(
return final_cols, final_widths, ["r" for _ in final_widths]
+# We cannot rename this method as it's directly imported
+# and used by external packages such as spacy-loggers.
@registry.loggers("spacy.ConsoleLogger.v2")
def console_logger(
progress_bar: bool = False,
@@ -33,7 +35,27 @@ def console_logger(
output_file: Optional[Union[str, Path]] = None,
):
"""The ConsoleLogger.v2 prints out training logs in the console and/or saves them to a jsonl file.
- progress_bar (bool): Whether the logger should print the progress bar.
+ progress_bar (bool): Whether the logger should print a progress bar tracking the steps till the next evaluation pass.
+ console_output (bool): Whether the logger should print the logs on the console.
+ output_file (Optional[Union[str, Path]]): The file to save the training logs to.
+ """
+ return console_logger_v3(
+ progress_bar=None if progress_bar is False else "eval",
+ console_output=console_output,
+ output_file=output_file,
+ )
+
+
+@registry.loggers("spacy.ConsoleLogger.v3")
+def console_logger_v3(
+ progress_bar: Optional[str] = None,
+ console_output: bool = True,
+ output_file: Optional[Union[str, Path]] = None,
+):
+ """The ConsoleLogger.v3 prints out training logs in the console and/or saves them to a jsonl file.
+ progress_bar (Optional[str]): Type of progress bar to show in the console. Allowed values:
+ train - Tracks the number of steps from the beginning of training until the full training run is complete (training.max_steps is reached).
+ eval - Tracks the number of steps between the previous and next evaluation (training.eval_frequency is reached).
console_output (bool): Whether the logger should print the logs on the console.
output_file (Optional[Union[str, Path]]): The file to save the training logs to.
"""
@@ -70,6 +92,7 @@ def console_logger(
for name, proc in nlp.pipeline
if hasattr(proc, "is_trainable") and proc.is_trainable
]
+ max_steps = nlp.config["training"]["max_steps"]
eval_frequency = nlp.config["training"]["eval_frequency"]
score_weights = nlp.config["training"]["score_weights"]
score_cols = [col for col, value in score_weights.items() if value is not None]
@@ -84,6 +107,13 @@ def console_logger(
write(msg.row(table_header, widths=table_widths, spacing=spacing))
write(msg.row(["-" * width for width in table_widths], spacing=spacing))
progress = None
+ expected_progress_types = ("train", "eval")
+ if progress_bar is not None and progress_bar not in expected_progress_types:
+ raise ValueError(
+ Errors.E1048.format(
+ unexpected=progress_bar, expected=expected_progress_types
+ )
+ )
def log_step(info: Optional[Dict[str, Any]]) -> None:
nonlocal progress
@@ -141,11 +171,23 @@ def console_logger(
)
)
if progress_bar:
+ if progress_bar == "train":
+ total = max_steps
+ desc = f"Last Eval Epoch: {info['epoch']}"
+ initial = info["step"]
+ else:
+ total = eval_frequency
+ desc = f"Epoch {info['epoch']+1}"
+ initial = 0
# Set disable=None, so that it disables on non-TTY
progress = tqdm.tqdm(
- total=eval_frequency, disable=None, leave=False, file=stderr
+ total=total,
+ disable=None,
+ leave=False,
+ file=stderr,
+ initial=initial,
)
- progress.set_description(f"Epoch {info['epoch']+1}")
+ progress.set_description(desc)
def finalize() -> None:
if output_stream:
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 26a5d42f4..883c5e3b9 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -513,7 +513,7 @@ a [Weights & Biases](https://www.wandb.com/) dashboard.
Instead of using one of the built-in loggers, you can
[implement your own](/usage/training#custom-logging).
-#### spacy.ConsoleLogger.v2 {#ConsoleLogger tag="registered function"}
+#### spacy.ConsoleLogger.v2 {tag="registered function"}
> #### Example config
>
@@ -564,11 +564,33 @@ start decreasing across epochs.
-| Name | Description |
-| ---------------- | --------------------------------------------------------------------- |
-| `progress_bar` | Whether the logger should print the progress bar ~~bool~~ |
-| `console_output` | Whether the logger should print the logs on the console. ~~bool~~ |
-| `output_file` | The file to save the training logs to. ~~Optional[Union[str, Path]]~~ |
+| Name | Description |
+| ---------------- | ---------------------------------------------------------------------------------------------------------------------------- |
+| `progress_bar` | Whether the logger should print a progress bar tracking the steps till the next evaluation pass (default: `False`). ~~bool~~ |
+| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
+| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
+
+#### spacy.ConsoleLogger.v3 {#ConsoleLogger tag="registered function"}
+
+> #### Example config
+>
+> ```ini
+> [training.logger]
+> @loggers = "spacy.ConsoleLogger.v3"
+> progress_bar = "train"
+> console_output = true
+> output_file = "training_log.jsonl"
+> ```
+
+Writes the results of a training step to the console in a tabular format and
+optionally saves them to a `jsonl` file.
+
+| Name | Description |
+| ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `progress_bar`   | Type of progress bar to show in the console: `"train"`, `"eval"` or `None`. The bar tracks the number of steps until `training.max_steps` or `training.eval_frequency` is reached, respectively (default: `None`). ~~Optional[str]~~ |
+| `console_output` | Whether the logger should print the logs in the console (default: `True`). ~~bool~~ |
+| `output_file` | The file to save the training logs to (default: `None`). ~~Optional[Union[str, Path]]~~ |
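A minimal sketch of building the new logger outside of a config file, assuming default training settings; the argument values are illustrative:

```python
from spacy import registry

# Resolve the factory registered as "spacy.ConsoleLogger.v3" and build a
# logger whose progress bar counts steps across the full training run.
make_logger = registry.loggers.get("spacy.ConsoleLogger.v3")
logger = make_logger(progress_bar="train", console_output=True, output_file=None)
```

Passing anything other than `"train"`, `"eval"` or `None` raises the new `E1048` error when the logger is set up.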
## Readers {#readers}
From 933b54ac798a7d64f9cde4d85b55556e84e44bd6 Mon Sep 17 00:00:00 2001
From: kadarakos
Date: Mon, 26 Dec 2022 13:26:35 +0100
Subject: [PATCH 29/36] typo fix (#11995)
---
spacy/pipeline/span_ruler.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/spacy/pipeline/span_ruler.py b/spacy/pipeline/span_ruler.py
index 807a4ffe5..0e7e9ebf7 100644
--- a/spacy/pipeline/span_ruler.py
+++ b/spacy/pipeline/span_ruler.py
@@ -170,7 +170,7 @@ def prioritize_existing_ents_filter(
@registry.misc("spacy.prioritize_existing_ents_filter.v1")
-def make_preverse_existing_ents_filter():
+def make_preserve_existing_ents_filter():
return prioritize_existing_ents_filter
From ef9e504eacc806162666c964bd00d152fc15f9e3 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Thu, 29 Dec 2022 14:01:08 +0100
Subject: [PATCH 30/36] Rename modified textcat scorer to v2 (#11971)
As a follow-up to #11696, rename the modified scorer to v2 and move the
v1 scorer to `spacy-legacy`.
---
requirements.txt | 2 +-
setup.cfg | 2 +-
spacy/pipeline/textcat.py | 4 ++--
spacy/tests/pipeline/test_textcat.py | 17 +++++++++++++++++
4 files changed, 21 insertions(+), 4 deletions(-)
diff --git a/requirements.txt b/requirements.txt
index 0440835f2..5bc1c8684 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
# Our libraries
-spacy-legacy>=3.0.10,<3.1.0
+spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
cymem>=2.0.2,<2.1.0
preshed>=3.0.2,<3.1.0
diff --git a/setup.cfg b/setup.cfg
index d290d706c..cee8c0c33 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -42,7 +42,7 @@ setup_requires =
thinc>=8.1.0,<8.2.0
install_requires =
# Our libraries
- spacy-legacy>=3.0.10,<3.1.0
+ spacy-legacy>=3.0.11,<3.1.0
spacy-loggers>=1.0.0,<2.0.0
murmurhash>=0.28.0,<1.1.0
cymem>=2.0.2,<2.1.0
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 65121114d..650a01949 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -74,7 +74,7 @@ subword_features = true
default_config={
"threshold": 0.0,
"model": DEFAULT_SINGLE_TEXTCAT_MODEL,
- "scorer": {"@scorers": "spacy.textcat_scorer.v1"},
+ "scorer": {"@scorers": "spacy.textcat_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
@@ -117,7 +117,7 @@ def textcat_score(examples: Iterable[Example], **kwargs) -> Dict[str, Any]:
)
-@registry.scorers("spacy.textcat_scorer.v1")
+@registry.scorers("spacy.textcat_scorer.v2")
def make_textcat_scorer():
return textcat_score
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 155ce99a2..eafe4c128 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -895,3 +895,20 @@ def test_textcat_multi_threshold():
scores = nlp.evaluate(train_examples, scorer_cfg={"threshold": 0})
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
+
+
+@pytest.mark.parametrize("component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")])
+def test_textcat_legacy_scorers(component_name, scorer):
+ """Check that legacy scorers are registered and produce the expected score
+ keys."""
+ nlp = English()
+ nlp.add_pipe(component_name, config={"scorer": {"@scorers": scorer}})
+
+ train_examples = []
+ for text, annotations in TRAIN_DATA_SINGLE_LABEL:
+ train_examples.append(Example.from_dict(nlp.make_doc(text), annotations))
+ nlp.initialize(get_examples=lambda: train_examples)
+
+ # score the model (it's not actually trained but that doesn't matter)
+ scores = nlp.evaluate(train_examples)
+ assert 0 <= scores["cats_score"] <= 1
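For projects that need the old behaviour, the legacy scorer can still be requested explicitly, exactly as the test above does. A minimal sketch, assuming `spacy-legacy>=3.0.11` is installed:

```python
import spacy

nlp = spacy.blank("en")
# Pin the v1 scorer instead of the new v2 default.
nlp.add_pipe("textcat", config={"scorer": {"@scorers": "spacy.textcat_scorer.v1"}})
```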
From abb0ab109d33d2deaa6155a61fad649a25472f9c Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
<41898282+github-actions[bot]@users.noreply.github.com>
Date: Mon, 2 Jan 2023 11:59:57 +0100
Subject: [PATCH 31/36] Auto-format code with black (#12035)
Co-authored-by: explosion-bot
---
spacy/tests/pipeline/test_textcat.py | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index eafe4c128..048586cec 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -897,7 +897,9 @@ def test_textcat_multi_threshold():
assert scores["cats_f_per_type"]["POSITIVE"]["r"] == 1.0
-@pytest.mark.parametrize("component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")])
+@pytest.mark.parametrize(
+ "component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]
+)
def test_textcat_legacy_scorers(component_name, scorer):
"""Check that legacy scorers are registered and produce the expected score
keys."""
From 31c1beba787446059de58a1478e6aec197fd0bbb Mon Sep 17 00:00:00 2001
From: Wannaphong Phatthiyaphaibun
Date: Tue, 3 Jan 2023 15:03:59 +0700
Subject: [PATCH 32/36] Add spacy-pythainlp (#12038)
* Add spacy-pythainlp
* Move submission to right section
* Minor cleanup
* Remove extra list call
* Update universe.json
Co-authored-by: Paul O'Leary McCann
---
website/meta/universe.json | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/website/meta/universe.json b/website/meta/universe.json
index db533c3b2..99d121507 100644
--- a/website/meta/universe.json
+++ b/website/meta/universe.json
@@ -4062,6 +4062,33 @@
"author_links": {
"github": "yasufumy"
}
+ },
+ {
+ "id": "spacy-pythainlp",
+ "title": "spaCy-PyThaiNLP",
+ "slogan": "PyThaiNLP for spaCy",
+ "description": "This package wraps the PyThaiNLP library to add support for Thai to spaCy.",
+ "github": "PyThaiNLP/spaCy-PyThaiNLP",
+ "code_example": [
+ "import spacy",
+ "import spacy_pythainlp.core",
+ "",
+ "nlp = spacy.blank('th')",
+ "nlp.add_pipe('pythainlp')",
+ "doc = nlp('ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน ผมอยากไปเที่ยว')",
+ "",
+ "print(list(doc.sents))",
+ "# output: [ผมเป็นคนไทย แต่มะลิอยากไปโรงเรียนส่วนผมจะไปไหน , ผมอยากไปเที่ยว]"
+ ],
+ "code_language": "python",
+ "author": "Wannaphong Phatthiyaphaibun",
+ "author_links": {
+ "twitter": "@wannaphong_p",
+ "github": "wannaphong",
+ "website": "https://iam.wannaphong.com/"
+ },
+ "category": ["pipeline", "research"],
+ "tags": ["Thai"]
}
],
From dbd829f0ed2dba3eb6eb5b59b18396ed38e326b9 Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 4 Jan 2023 12:51:40 +0900
Subject: [PATCH 33/36] Fix inconsistency in displaCy docs about page option
(#12047)
* Fix inconsistency in displaCy docs about page option
The `page` option, which wraps the output SVG in HTML, is true by
default for `serve` but not for `render`. The `render` docs were wrong
though, so this updates them.
* Update the same statement in more docs
A few renderers used the same language
---
spacy/displacy/__init__.py | 2 +-
spacy/displacy/render.py | 4 ++--
website/docs/api/top-level.md | 2 +-
3 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index bc32001d7..2f2058b8e 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -36,7 +36,7 @@ def render(
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
- RETURNS (str): Rendered HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 50dc3466c..f74222dc2 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -94,7 +94,7 @@ class SpanRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (str): Rendered HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@@ -510,7 +510,7 @@ class EntityRenderer:
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (str): Rendered HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 883c5e3b9..6a63e07da 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -266,7 +266,7 @@ Render a dependency parse tree or named entity visualization.
| ----------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `docs` | Document(s) or span(s) to visualize. ~~Union[Iterable[Union[Doc, Span, dict]], Doc, Span, dict]~~ |
| `style` | Visualization style, `"dep"`, `"ent"` or `"span"` 3.3. Defaults to `"dep"`. ~~str~~ |
-| `page` | Render markup as full HTML page. Defaults to `True`. ~~bool~~ |
+| `page` | Render markup as full HTML page. Defaults to `False`. ~~bool~~ |
| `minify` | Minify HTML markup. Defaults to `False`. ~~bool~~ |
| `options` | [Visualizer-specific options](#displacy_options), e.g. colors. ~~Dict[str, Any]~~ |
| `manual` | Don't parse `Doc` and instead expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. Defaults to `False`. ~~bool~~ |
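The distinction is easiest to see with manual input. A small sketch, where the token dicts are illustrative:

```python
from spacy import displacy

# Manual mode renders pre-built dicts instead of parsing a Doc.
parsed = {
    "words": [{"text": "Hello", "tag": "INTJ"}, {"text": "world", "tag": "NOUN"}],
    "arcs": [],
}
svg = displacy.render(parsed, style="dep", manual=True)              # bare SVG markup
page = displacy.render(parsed, style="dep", manual=True, page=True)  # full HTML page
```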
From 7f6c638c3acd732c0b52a45a2b3ad0388cd1ae66 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Thu, 5 Jan 2023 10:21:00 +0100
Subject: [PATCH 34/36] fix processing of "auto" in convert (#12050)
* fix processing of "auto" in walk_directory
* add check for None
* move AUTO check to convert and fix verification of args
* add specific CLI test with CliRunner
* cleanup
* more cleanup
* update docstring
---
spacy/cli/_util.py | 4 ++++
spacy/cli/convert.py | 26 ++++++++++++++++----------
spacy/tests/test_cli.py | 26 +++++++++++++++++++++++++-
spacy/tests/test_cli_app.py | 33 +++++++++++++++++++++++++++++++++
4 files changed, 78 insertions(+), 11 deletions(-)
create mode 100644 spacy/tests/test_cli_app.py
diff --git a/spacy/cli/_util.py b/spacy/cli/_util.py
index c46abffe5..0f4e9f599 100644
--- a/spacy/cli/_util.py
+++ b/spacy/cli/_util.py
@@ -583,6 +583,10 @@ def setup_gpu(use_gpu: int, silent=None) -> None:
def walk_directory(path: Path, suffix: Optional[str] = None) -> List[Path]:
+ """Given a directory and a suffix, recursively find all files matching the suffix.
+ Directories or files with names beginning with a . are ignored, but hidden flags on
+ filesystems are not checked.
+ If `suffix` is `None`, no suffix-based filtering is applied."""
if not path.is_dir():
return [path]
paths = [path]
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index 7f365ae2c..68d454b3e 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -28,6 +28,8 @@ CONVERTERS: Mapping[str, Callable[..., Iterable[Doc]]] = {
"json": json_to_docs,
}
+AUTO = "auto"
+
# File types that can be written to stdout
FILE_TYPES_STDOUT = ("json",)
@@ -49,7 +51,7 @@ def convert_cli(
model: Optional[str] = Opt(None, "--model", "--base", "-b", help="Trained spaCy pipeline for sentence segmentation to use as base (for --seg-sents)"),
morphology: bool = Opt(False, "--morphology", "-m", help="Enable appending morphology to tags"),
merge_subtokens: bool = Opt(False, "--merge-subtokens", "-T", help="Merge CoNLL-U subtokens"),
- converter: str = Opt("auto", "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
+ converter: str = Opt(AUTO, "--converter", "-c", help=f"Converter: {tuple(CONVERTERS.keys())}"),
ner_map: Optional[Path] = Opt(None, "--ner-map", "-nm", help="NER tag mapping (as JSON-encoded dict of entity types)", exists=True),
lang: Optional[str] = Opt(None, "--lang", "-l", help="Language (if tokenizer required)"),
concatenate: bool = Opt(None, "--concatenate", "-C", help="Concatenate output to a single file"),
@@ -70,8 +72,8 @@ def convert_cli(
output_dir: Union[str, Path] = "-" if output_dir == Path("-") else output_dir
silent = output_dir == "-"
msg = Printer(no_print=silent)
- verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
converter = _get_converter(msg, converter, input_path)
+ verify_cli_args(msg, input_path, output_dir, file_type.value, converter, ner_map)
convert(
input_path,
output_dir,
@@ -100,7 +102,7 @@ def convert(
model: Optional[str] = None,
morphology: bool = False,
merge_subtokens: bool = False,
- converter: str = "auto",
+ converter: str,
ner_map: Optional[Path] = None,
lang: Optional[str] = None,
concatenate: bool = False,
@@ -212,18 +214,22 @@ def verify_cli_args(
input_locs = walk_directory(input_path, converter)
if len(input_locs) == 0:
msg.fail("No input files in directory", input_path, exits=1)
- file_types = list(set([loc.suffix[1:] for loc in input_locs]))
- if converter == "auto" and len(file_types) >= 2:
- file_types_str = ",".join(file_types)
- msg.fail("All input files must be same type", file_types_str, exits=1)
- if converter != "auto" and converter not in CONVERTERS:
+ if converter not in CONVERTERS:
msg.fail(f"Can't find converter for {converter}", exits=1)
def _get_converter(msg, converter, input_path: Path):
if input_path.is_dir():
- input_path = walk_directory(input_path, converter)[0]
- if converter == "auto":
+ if converter == AUTO:
+ input_locs = walk_directory(input_path, suffix=None)
+ file_types = list(set([loc.suffix[1:] for loc in input_locs]))
+ if len(file_types) >= 2:
+ file_types_str = ",".join(file_types)
+ msg.fail("All input files must be same type", file_types_str, exits=1)
+ input_path = input_locs[0]
+ else:
+ input_path = walk_directory(input_path, suffix=converter)[0]
+ if converter == AUTO:
converter = input_path.suffix[1:]
if converter == "ner" or converter == "iob":
with input_path.open(encoding="utf8") as file_:
diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py
index c6768a3fd..c88e20de2 100644
--- a/spacy/tests/test_cli.py
+++ b/spacy/tests/test_cli.py
@@ -4,6 +4,7 @@ from collections import Counter
from typing import Tuple, List, Dict, Any
import pkg_resources
import time
+from pathlib import Path
import spacy
import numpy
@@ -15,7 +16,7 @@ from thinc.api import Config, ConfigValidationError
from spacy import about
from spacy.cli import info
-from spacy.cli._util import is_subpath_of, load_project_config
+from spacy.cli._util import is_subpath_of, load_project_config, walk_directory
from spacy.cli._util import parse_config_overrides, string_to_list
from spacy.cli._util import substitute_project_variables
from spacy.cli._util import validate_project_commands
@@ -1185,3 +1186,26 @@ def test_upload_download_local_file():
download_file(remote_file, local_file)
with local_file.open(mode="r") as file_:
assert file_.read() == content
+
+
+def test_walk_directory():
+ with make_tempdir() as d:
+ files = [
+ "data1.iob",
+ "data2.iob",
+ "data3.json",
+ "data4.conll",
+ "data5.conll",
+ "data6.conll",
+ "data7.txt",
+ ]
+
+ for f in files:
+ Path(d / f).touch()
+
+ assert (len(walk_directory(d))) == 7
+ assert (len(walk_directory(d, suffix=None))) == 7
+ assert (len(walk_directory(d, suffix="json"))) == 1
+ assert (len(walk_directory(d, suffix="iob"))) == 2
+ assert (len(walk_directory(d, suffix="conll"))) == 3
+ assert (len(walk_directory(d, suffix="pdf"))) == 0
diff --git a/spacy/tests/test_cli_app.py b/spacy/tests/test_cli_app.py
new file mode 100644
index 000000000..873a3ff66
--- /dev/null
+++ b/spacy/tests/test_cli_app.py
@@ -0,0 +1,33 @@
+import os
+from pathlib import Path
+from typer.testing import CliRunner
+
+from spacy.cli._util import app
+from .util import make_tempdir
+
+
+def test_convert_auto():
+ with make_tempdir() as d_in, make_tempdir() as d_out:
+ for f in ["data1.iob", "data2.iob", "data3.iob"]:
+ Path(d_in / f).touch()
+
+ # ensure that "automatic" suffix detection works
+ result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
+ assert "Generated output file" in result.stdout
+ out_files = os.listdir(d_out)
+ assert len(out_files) == 3
+ assert "data1.spacy" in out_files
+ assert "data2.spacy" in out_files
+ assert "data3.spacy" in out_files
+
+
+def test_convert_auto_conflict():
+ with make_tempdir() as d_in, make_tempdir() as d_out:
+ for f in ["data1.iob", "data2.iob", "data3.json"]:
+ Path(d_in / f).touch()
+
+ # ensure that "automatic" suffix detection warns when there are different file types
+ result = CliRunner().invoke(app, ["convert", str(d_in), str(d_out)])
+ assert "All input files must be same type" in result.stdout
+ out_files = os.listdir(d_out)
+ assert len(out_files) == 0
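The new `walk_directory` docstring is mirrored by its behaviour in the tests above. A short sketch of the suffix filtering, with an illustrative directory name:

```python
from pathlib import Path

from spacy.cli._util import walk_directory

corpus = Path("corpus")
all_files = walk_directory(corpus)                # suffix=None: no filtering
iob_files = walk_directory(corpus, suffix="iob")  # only *.iob files
```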
From f1dcdefc8abb21345680b79e9d538f06cf62bca0 Mon Sep 17 00:00:00 2001
From: Madeesh Kannan
Date: Thu, 5 Jan 2023 11:46:04 +0100
Subject: [PATCH 35/36] Add version tag to `before_update` config key (#12059)
---
website/docs/api/data-formats.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md
index 768844cf3..420e827a0 100644
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@@ -186,7 +186,7 @@ process that are used when you run [`spacy train`](/api/cli#train).
| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ |
| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ |
| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ |
-| `before_update` | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
+| `before_update` 3.5 | Optional callback that is invoked at the start of each training step with the `nlp` object and a `Dict` containing the following entries: `step`, `epoch`. Can be used to make deferred changes to components. Defaults to `null`. ~~Optional[Callable[[Language, Dict[str, Any]], None]]~~ |
| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ |
| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ |
| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ |
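A minimal sketch of a `before_update` callback with the documented signature; the registry name and the logic inside it are illustrative:

```python
import spacy

@spacy.registry.callbacks("illustrative_before_update")
def create_before_update():
    # The callback receives the nlp object and a dict with "step" and "epoch".
    def before_update(nlp, info):
        if info["step"] % 100 == 0:
            print(f"step {info['step']}, epoch {info['epoch']}")
    return before_update
```

In a config file this would be wired up under `[training.before_update]` with `@callbacks = "illustrative_before_update"`.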
From 6d03b04901e95a71747a7e1ef0b00bc87bb2c807 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Mon, 9 Jan 2023 11:43:48 +0100
Subject: [PATCH 36/36] Improve score_cats for use with multiple textcat
components (#11820)
* add test for running evaluate on an nlp pipeline with two distinct textcat components
* cleanup
* merge dicts instead of overwrite
* don't add more labels to the given set
* Revert "merge dicts instead of overwrite"
This reverts commit 89bee0ed7798389e6de882a0234e6075fbdaf331.
* Switch tests to separate scorer keys rather than merged dicts
* Revert unrelated edits
* Switch textcat scorers to v2
* formatting
Co-authored-by: Adriane Boyd
---
spacy/pipeline/textcat_multilabel.py | 4 +-
spacy/scorer.py | 6 +-
spacy/tests/pipeline/test_textcat.py | 6 +-
spacy/tests/test_language.py | 107 +++++++++++++++++++++++++++
4 files changed, 116 insertions(+), 7 deletions(-)
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index 328cee723..41c0e2f63 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -74,7 +74,7 @@ subword_features = true
default_config={
"threshold": 0.5,
"model": DEFAULT_MULTI_TEXTCAT_MODEL,
- "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v1"},
+ "scorer": {"@scorers": "spacy.textcat_multilabel_scorer.v2"},
},
default_score_weights={
"cats_score": 1.0,
@@ -120,7 +120,7 @@ def textcat_multilabel_score(examples: Iterable[Example], **kwargs) -> Dict[str,
)
-@registry.scorers("spacy.textcat_multilabel_scorer.v1")
+@registry.scorers("spacy.textcat_multilabel_scorer.v2")
def make_textcat_multilabel_scorer():
return textcat_multilabel_score
diff --git a/spacy/scorer.py b/spacy/scorer.py
index 16fc303a0..d8c383ab8 100644
--- a/spacy/scorer.py
+++ b/spacy/scorer.py
@@ -476,14 +476,12 @@ class Scorer:
f_per_type = {label: PRFScore() for label in labels}
auc_per_type = {label: ROCAUCScore() for label in labels}
labels = set(labels)
- if labels:
- for eg in examples:
- labels.update(eg.predicted.cats.keys())
- labels.update(eg.reference.cats.keys())
for example in examples:
# Through this loop, None in the gold_cats indicates missing label.
pred_cats = getter(example.predicted, attr)
+ pred_cats = {k: v for k, v in pred_cats.items() if k in labels}
gold_cats = getter(example.reference, attr)
+ gold_cats = {k: v for k, v in gold_cats.items() if k in labels}
for label in labels:
pred_score = pred_cats.get(label, 0.0)
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index 048586cec..d042f3445 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -898,7 +898,11 @@ def test_textcat_multi_threshold():
@pytest.mark.parametrize(
- "component_name,scorer", [("textcat", "spacy.textcat_scorer.v1")]
+ "component_name,scorer",
+ [
+ ("textcat", "spacy.textcat_scorer.v1"),
+ ("textcat_multilabel", "spacy.textcat_multilabel_scorer.v1"),
+ ],
)
def test_textcat_legacy_scorers(component_name, scorer):
"""Check that legacy scorers are registered and produce the expected score
diff --git a/spacy/tests/test_language.py b/spacy/tests/test_language.py
index 03a98d32f..03790eb86 100644
--- a/spacy/tests/test_language.py
+++ b/spacy/tests/test_language.py
@@ -3,6 +3,7 @@ import logging
from unittest import mock
import pytest
from spacy.language import Language
+from spacy.scorer import Scorer
from spacy.tokens import Doc, Span
from spacy.vocab import Vocab
from spacy.training import Example
@@ -126,6 +127,112 @@ def test_evaluate_no_pipe(nlp):
nlp.evaluate([Example.from_dict(doc, annots)])
+def test_evaluate_textcat_multilabel(en_vocab):
+ """Test that evaluate works with a multilabel textcat pipe."""
+ nlp = Language(en_vocab)
+ textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+ for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+ textcat_multilabel.add_label(label)
+ nlp.initialize()
+
+ annots = {"cats": {"FEATURE": 1.0, "QUESTION": 1.0}}
+ doc = nlp.make_doc("hello world")
+ example = Example.from_dict(doc, annots)
+ scores = nlp.evaluate([example])
+ labels = nlp.get_pipe("textcat_multilabel").labels
+ for label in labels:
+ assert scores["cats_f_per_type"].get(label) is not None
+ for key in example.reference.cats.keys():
+ if key not in labels:
+ assert scores["cats_f_per_type"].get(key) is None
+
+
+def test_evaluate_multiple_textcat_final(en_vocab):
+ """Test that evaluate evaluates the final textcat component in a pipeline
+ with more than one textcat or textcat_multilabel."""
+ nlp = Language(en_vocab)
+ textcat = nlp.add_pipe("textcat")
+ for label in ("POSITIVE", "NEGATIVE"):
+ textcat.add_label(label)
+ textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+ for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+ textcat_multilabel.add_label(label)
+ nlp.initialize()
+
+ annots = {
+ "cats": {
+ "POSITIVE": 1.0,
+ "NEGATIVE": 0.0,
+ "FEATURE": 1.0,
+ "QUESTION": 1.0,
+ "POSITIVE": 1.0,
+ "NEGATIVE": 0.0,
+ }
+ }
+ doc = nlp.make_doc("hello world")
+ example = Example.from_dict(doc, annots)
+ scores = nlp.evaluate([example])
+ # get the labels from the final pipe
+ labels = nlp.get_pipe(nlp.pipe_names[-1]).labels
+ for label in labels:
+ assert scores["cats_f_per_type"].get(label) is not None
+ for key in example.reference.cats.keys():
+ if key not in labels:
+ assert scores["cats_f_per_type"].get(key) is None
+
+
+def test_evaluate_multiple_textcat_separate(en_vocab):
+ """Test that evaluate can evaluate multiple textcat components separately
+ with custom scorers."""
+
+ def custom_textcat_score(examples, **kwargs):
+ scores = Scorer.score_cats(
+ examples,
+ "cats",
+ multi_label=False,
+ **kwargs,
+ )
+ return {f"custom_{k}": v for k, v in scores.items()}
+
+ @spacy.registry.scorers("test_custom_textcat_scorer")
+ def make_custom_textcat_scorer():
+ return custom_textcat_score
+
+ nlp = Language(en_vocab)
+ textcat = nlp.add_pipe(
+ "textcat",
+ config={"scorer": {"@scorers": "test_custom_textcat_scorer"}},
+ )
+ for label in ("POSITIVE", "NEGATIVE"):
+ textcat.add_label(label)
+ textcat_multilabel = nlp.add_pipe("textcat_multilabel")
+ for label in ("FEATURE", "REQUEST", "BUG", "QUESTION"):
+ textcat_multilabel.add_label(label)
+ nlp.initialize()
+
+ annots = {
+ "cats": {
+ "POSITIVE": 1.0,
+ "NEGATIVE": 0.0,
+ "FEATURE": 1.0,
+ "QUESTION": 1.0,
+ "POSITIVE": 1.0,
+ "NEGATIVE": 0.0,
+ }
+ }
+ doc = nlp.make_doc("hello world")
+ example = Example.from_dict(doc, annots)
+ scores = nlp.evaluate([example])
+ # check custom scores for the textcat pipe
+ assert "custom_cats_f_per_type" in scores
+ labels = nlp.get_pipe("textcat").labels
+ assert set(scores["custom_cats_f_per_type"].keys()) == set(labels)
+ # check default scores for the textcat_multilabel pipe
+ assert "cats_f_per_type" in scores
+ labels = nlp.get_pipe("textcat_multilabel").labels
+ assert set(scores["cats_f_per_type"].keys()) == set(labels)
+
+
def vector_modification_pipe(doc):
doc.vector += 1
return doc