From be673462bead2ba416b43d1f84acf720a98e24f4 Mon Sep 17 00:00:00 2001 From: TAN Long <71320000+tanloong@users.noreply.github.com> Date: Tue, 28 Feb 2023 21:36:33 +0800 Subject: [PATCH 01/15] Add new REL_OPs: `>+`, `>-`, `<+`, and `<-` (#12334) * Add immediate left/right child/parent dependency relations * Add tests for new REL_OPs: `>+`, `>-`, `<+`, and `<-`. --------- Co-authored-by: Tan Long --- spacy/matcher/dependencymatcher.pyx | 26 +++++++++++++++++++ .../tests/matcher/test_dependency_matcher.py | 16 ++++++++++++ website/docs/api/dependencymatcher.mdx | 4 +++ website/docs/usage/rule-based-matching.mdx | 8 ++++++ 4 files changed, 54 insertions(+) diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 74c2d002f..adf96702b 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -82,8 +82,12 @@ cdef class DependencyMatcher: "$-": self._imm_left_sib, "$++": self._right_sib, "$--": self._left_sib, + ">+": self._imm_right_child, + ">-": self._imm_left_child, ">++": self._right_child, ">--": self._left_child, + "<+": self._imm_right_parent, + "<-": self._imm_left_parent, "<++": self._right_parent, "<--": self._left_parent, } @@ -427,12 +431,34 @@ cdef class DependencyMatcher: def _left_sib(self, doc, node): return [doc[child.i] for child in doc[node].head.children if child.i < node] + def _imm_right_child(self, doc, node): + for child in doc[node].children: + if child.i == node + 1: + return [doc[child.i]] + return [] + + def _imm_left_child(self, doc, node): + for child in doc[node].children: + if child.i == node - 1: + return [doc[child.i]] + return [] + def _right_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i > node] def _left_child(self, doc, node): return [doc[child.i] for child in doc[node].children if child.i < node] + def _imm_right_parent(self, doc, node): + if doc[node].head.i == node + 1: + return [doc[node].head] + return [] + + def _imm_left_parent(self, doc, node): + if doc[node].head.i == node - 1: + return [doc[node].head] + return [] + def _right_parent(self, doc, node): if doc[node].head.i > node: return [doc[node].head] diff --git a/spacy/tests/matcher/test_dependency_matcher.py b/spacy/tests/matcher/test_dependency_matcher.py index b4e19d69d..200384320 100644 --- a/spacy/tests/matcher/test_dependency_matcher.py +++ b/spacy/tests/matcher/test_dependency_matcher.py @@ -316,16 +316,32 @@ def test_dependency_matcher_precedence_ops(en_vocab, op, num_matches): ("the", "brown", "$--", 0), ("brown", "the", "$--", 1), ("brown", "brown", "$--", 0), + ("over", "jumped", "<+", 0), + ("quick", "fox", "<+", 0), + ("the", "quick", "<+", 0), + ("brown", "fox", "<+", 1), ("quick", "fox", "<++", 1), ("quick", "over", "<++", 0), ("over", "jumped", "<++", 0), ("the", "fox", "<++", 2), + ("brown", "fox", "<-", 0), + ("fox", "over", "<-", 0), + ("the", "over", "<-", 0), + ("over", "jumped", "<-", 1), ("brown", "fox", "<--", 0), ("fox", "jumped", "<--", 0), ("fox", "over", "<--", 1), + ("fox", "brown", ">+", 0), + ("over", "fox", ">+", 0), + ("over", "the", ">+", 0), + ("jumped", "over", ">+", 1), ("jumped", "over", ">++", 1), ("fox", "lazy", ">++", 0), ("over", "the", ">++", 0), + ("jumped", "over", ">-", 0), + ("fox", "quick", ">-", 0), + ("brown", "quick", ">-", 0), + ("fox", "brown", ">-", 1), ("brown", "fox", ">--", 0), ("fox", "brown", ">--", 1), ("jumped", "fox", ">--", 1), diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index 
390034a6c..cad5185f7 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -82,8 +82,12 @@ come directly from | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | | `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | | `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | | `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 628c2953f..6a11ac8bd 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1110,6 +1110,14 @@ come directly from | `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | | `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | | `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From 18f4378a910e84631b51e10d84b571ecf2f4be0c Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Tue, 28 Feb 2023 16:36:03 +0100 Subject: [PATCH 02/15] Fix error message for displacy auto_select_port (#12343) --- spacy/errors.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/errors.py b/spacy/errors.py index d143e341c..ab013f3eb 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -967,7 +967,7 @@ class Errors(metaclass=ErrorsWithCodes): E1049 = ("No available port found for displaCy on host {host}. 
Please specify an available port " "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " - "or use `auto_switch_port=True` to pick an available port automatically.") + "or use `auto_select_port=True` to pick an available port automatically.") # Deprecated model shortcuts, only used in errors and warnings From 5ed1db7ae4032a80a03aecff6cb4c5c170d3beed Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 1 Mar 2023 10:46:13 +0100 Subject: [PATCH 03/15] Add new tags in docs for #12334 (#12348) --- website/docs/api/dependencymatcher.mdx | 44 +++++++++++----------- website/docs/usage/rule-based-matching.mdx | 44 +++++++++++----------- 2 files changed, 44 insertions(+), 44 deletions(-) diff --git a/website/docs/api/dependencymatcher.mdx b/website/docs/api/dependencymatcher.mdx index cad5185f7..14e0916d1 100644 --- a/website/docs/api/dependencymatcher.mdx +++ b/website/docs/api/dependencymatcher.mdx @@ -68,28 +68,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. 
| +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ## DependencyMatcher.\_\_init\_\_ {id="init",tag="method"} diff --git a/website/docs/usage/rule-based-matching.mdx b/website/docs/usage/rule-based-matching.mdx index 6a11ac8bd..55c043015 100644 --- a/website/docs/usage/rule-based-matching.mdx +++ b/website/docs/usage/rule-based-matching.mdx @@ -1096,28 +1096,28 @@ The following operators are supported by the `DependencyMatcher`, most of which come directly from [Semgrex](https://nlp.stanford.edu/nlp/javadoc/javanlp/edu/stanford/nlp/semgraph/semgrex/SemgrexPattern.html): -| Symbol | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------- | -| `A < B` | `A` is the immediate dependent of `B`. | -| `A > B` | `A` is the immediate head of `B`. | -| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | -| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | -| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | -| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. 
| -| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | -| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | -| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | -| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | -| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | -| `A >+ B` | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A >- B` | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | -| `A <+ B` | `B` is a right immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | -| `A <- B` | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | -| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | -| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | +| Symbol | Description | +| --------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | +| `A < B` | `A` is the immediate dependent of `B`. | +| `A > B` | `A` is the immediate head of `B`. | +| `A << B` | `A` is the dependent in a chain to `B` following dep → head paths. | +| `A >> B` | `A` is the head in a chain to `B` following head → dep paths. | +| `A . B` | `A` immediately precedes `B`, i.e. `A.i == B.i - 1`, and both are within the same dependency tree. | +| `A .* B` | `A` precedes `B`, i.e. `A.i < B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ; B` | `A` immediately follows `B`, i.e. `A.i == B.i + 1`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A ;* B` | `A` follows `B`, i.e. `A.i > B.i`, and both are within the same dependency tree _(not in Semgrex)_. | +| `A $+ B` | `B` is a right immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i - 1`. | +| `A $- B` | `B` is a left immediate sibling of `A`, i.e. `A` and `B` have the same parent and `A.i == B.i + 1`. | +| `A $++ B` | `B` is a right sibling of `A`, i.e. `A` and `B` have the same parent and `A.i < B.i`. | +| `A $-- B` | `B` is a left sibling of `A`, i.e. `A` and `B` have the same parent and `A.i > B.i`. | +| `A >+ B` 3.5.1 | `B` is a right immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A >- B` 3.5.1 | `B` is a left immediate child of `A`, i.e. `A` is a parent of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A >++ B` | `B` is a right child of `A`, i.e. `A` is a parent of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A >-- B` | `B` is a left child of `A`, i.e. `A` is a parent of `B` and `A.i > B.i` _(not in Semgrex)_. | +| `A <+ B` 3.5.1 | `B` is a right immediate parent of `A`, i.e. 
`A` is a child of `B` and `A.i == B.i - 1` _(not in Semgrex)_. | +| `A <- B` 3.5.1 | `B` is a left immediate parent of `A`, i.e. `A` is a child of `B` and `A.i == B.i + 1` _(not in Semgrex)_. | +| `A <++ B` | `B` is a right parent of `A`, i.e. `A` is a child of `B` and `A.i < B.i` _(not in Semgrex)_. | +| `A <-- B` | `B` is a left parent of `A`, i.e. `A` is a child of `B` and `A.i > B.i` _(not in Semgrex)_. | ### Designing dependency matcher patterns {id="dependencymatcher-patterns"} From f87919d8f0d10b5319029209ddb0417c0e61103e Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 1 Mar 2023 11:01:35 +0100 Subject: [PATCH 04/15] Update docs w.r.t. spacy.CandidateBatchGenerator.v1. (#12350) --- website/docs/api/architectures.mdx | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 2a1bc4380..966b5830a 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -924,6 +924,15 @@ plausible [`Candidate`](/api/kb/#candidate) objects. The default `CandidateGenerator` uses the text of a mention to find its potential aliases in the `KnowledgeBase`. Note that this function is case-dependent. +### spacy.CandidateBatchGenerator.v1 {id="CandidateBatchGenerator"} + +A function that takes as input a [`KnowledgeBase`](/api/kb) and an `Iterable` of +[`Span`](/api/span) objects denoting named entities, and returns a list of +plausible [`Candidate`](/api/kb/#candidate) objects per specified +[`Span`](/api/span). The default `CandidateBatchGenerator` uses the text of a +mention to find its potential aliases in the `KnowledgeBase`. Note that this +function is case-dependent. + ## Coreference {id="coref-architectures",tag="experimental"} A [`CoreferenceResolver`](/api/coref) component identifies tokens that refer to From b3e73645512637d48626768b13f7a598da04bd85 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 1 Mar 2023 12:06:07 +0100 Subject: [PATCH 05/15] rely on is_empty property instead of __len__ (#12347) --- spacy/errors.py | 3 +-- spacy/kb/kb_in_memory.pyx | 3 +++ spacy/pipeline/entity_linker.py | 2 +- spacy/tests/pipeline/test_entity_linker.py | 3 +++ 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ab013f3eb..2c8b98aad 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -444,8 +444,7 @@ class Errors(metaclass=ErrorsWithCodes): E133 = ("The sum of prior probabilities for alias '{alias}' should not " "exceed 1, but found {sum}.") E134 = ("Entity '{entity}' is not defined in the Knowledge Base.") - E139 = ("Knowledge base for component '{name}' is empty. 
Use the methods " - "`kb.add_entity` and `kb.add_alias` to add entries.") + E139 = ("Knowledge base for component '{name}' is empty.") E140 = ("The list of entities, prior probabilities and entity vectors " "should be of equal length.") E141 = ("Entity vectors should be of length {required} instead of the " diff --git a/spacy/kb/kb_in_memory.pyx b/spacy/kb/kb_in_memory.pyx index edba523cf..2a74d047b 100644 --- a/spacy/kb/kb_in_memory.pyx +++ b/spacy/kb/kb_in_memory.pyx @@ -46,6 +46,9 @@ cdef class InMemoryLookupKB(KnowledgeBase): self._alias_index = PreshMap(nr_aliases + 1) self._aliases_table = alias_vec(nr_aliases + 1) + def is_empty(self): + return len(self) == 0 + def __len__(self): return self.get_size_entities() diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 62845287b..a11964117 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -250,7 +250,7 @@ class EntityLinker(TrainablePipe): # Raise an error if the knowledge base is not initialized. if self.kb is None: raise ValueError(Errors.E1018.format(name=self.name)) - if len(self.kb) == 0: + if hasattr(self.kb, "is_empty") and self.kb.is_empty(): raise ValueError(Errors.E139.format(name=self.name)) def initialize( diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py index 99f164f15..2a6258386 100644 --- a/spacy/tests/pipeline/test_entity_linker.py +++ b/spacy/tests/pipeline/test_entity_linker.py @@ -353,6 +353,9 @@ def test_kb_default(nlp): """Test that the default (empty) KB is loaded upon construction""" entity_linker = nlp.add_pipe("entity_linker", config={}) assert len(entity_linker.kb) == 0 + with pytest.raises(ValueError, match="E139"): + # this raises an error because the KB is empty + entity_linker.validate_kb() assert entity_linker.kb.get_size_entities() == 0 assert entity_linker.kb.get_size_aliases() == 0 # 64 is the default value from pipeline.entity_linker From e325de3ff8f984ebd43dd7714a6811872bfc33a0 Mon Sep 17 00:00:00 2001 From: kadarakos Date: Wed, 1 Mar 2023 15:38:23 +0100 Subject: [PATCH 06/15] Displacy doc fix (#12352) * more details for color setting * more details for color setting * prettier --- website/docs/api/top-level.mdx | 32 +++++++++++++++--------------- website/docs/usage/visualizers.mdx | 12 +++++------ 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/website/docs/api/top-level.mdx b/website/docs/api/top-level.mdx index 9748719d7..d0851a59f 100644 --- a/website/docs/api/top-level.mdx +++ b/website/docs/api/top-level.mdx @@ -354,22 +354,22 @@ If a setting is not present in the options, the default value will be used. > displacy.serve(doc, style="dep", options=options) > ``` -| Name | Description | -| ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | -| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | -| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | -| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | -| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. 
~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | -| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | -| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | -| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | -| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | -| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | -| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | +| Name | Description | +| ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `fine_grained` | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). Defaults to `False`. ~~bool~~ | +| `add_lemma` | Print the lemmas in a separate row below the token texts. Defaults to `False`. ~~bool~~ | +| `collapse_punct` | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. Defaults to `True`. ~~bool~~ | +| `collapse_phrases` | Merge noun phrases into one token. Defaults to `False`. ~~bool~~ | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| `offset_x` | Spacing on left side of the SVG in px. Defaults to `50`. ~~int~~ | +| `arrow_stroke` | Width of arrow path in px. Defaults to `2`. ~~int~~ | +| `arrow_width` | Width of arrow head in px. Defaults to `10` in regular mode and `8` in compact mode. ~~int~~ | +| `arrow_spacing` | Spacing between arrows in px to avoid overlaps. Defaults to `20` in regular mode and `12` in compact mode. ~~int~~ | +| `word_spacing` | Vertical spacing between words and arcs in px. Defaults to `45`. ~~int~~ | +| `distance` | Distance between words in px. Defaults to `175` in regular mode and `150` in compact mode. ~~int~~ | #### Named Entity Visualizer options {id="displacy_options-ent"} diff --git a/website/docs/usage/visualizers.mdx b/website/docs/usage/visualizers.mdx index 1d3682af4..c372744de 100644 --- a/website/docs/usage/visualizers.mdx +++ b/website/docs/usage/visualizers.mdx @@ -58,12 +58,12 @@ arcs. -| Argument | Description | -| --------- | ----------------------------------------------------------------------------------------- | -| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. 
~~bool~~ | -| `color` | Text color (HEX, RGB or color names). Defaults to `"#000000"`. ~~str~~ | -| `bg` | Background color (HEX, RGB or color names). Defaults to `"#ffffff"`. ~~str~~ | -| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | +| Argument | Description | +| --------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `compact` | "Compact mode" with square arrows that takes up less space. Defaults to `False`. ~~bool~~ | +| `color` | Text color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#000000"`. ~~str~~ | +| `bg` | Background color. Can be provided in any CSS legal format as a string e.g.: `"#00ff00"`, `"rgb(0, 255, 0)"`, `"hsl(120, 100%, 50%)"` and `"green"` all correspond to the color green (without transparency). Defaults to `"#ffffff"`. ~~str~~ | +| `font` | Font name or font family for all text. Defaults to `"Arial"`. ~~str~~ | For a list of all available options, see the [`displacy` API documentation](/api/top-level#displacy_options). From 6f1632b3e98d3d2b9a24961394223eff908b2ee7 Mon Sep 17 00:00:00 2001 From: Raphael Mitsch Date: Wed, 1 Mar 2023 16:02:55 +0100 Subject: [PATCH 07/15] Make generation of empty `KnowledgeBase` instances configurable in `EntityLinker` (#12320) * Make empty_kb() configurable. * Format. * Update docs. * Be more specific in KB serialization test. * Update KB serialization tests. Update docs. * Remove doc update for batched candidate generation. * Fix serialization of subclassed KB in tests. * Format. * Update docstring. * Update docstring. * Switch from pickle to json for custom field serialization. 
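For context on the new hook before the diff below: a user-supplied empty-KB factory is just a `@registry.misc` function returning a `(vocab, entity_vector_length)` callable, mirroring the `spacy.EmptyKB.v2` implementation this patch adds. A minimal sketch follows; the registry name `demo.CustomEmptyKB.v1` is hypothetical, and only the factory signature comes from the patch:

```python
from typing import Callable

from spacy.kb import InMemoryLookupKB, KnowledgeBase
from spacy.util import registry
from spacy.vocab import Vocab


@registry.misc("demo.CustomEmptyKB.v1")  # hypothetical registry name
def create_custom_empty_kb() -> Callable[[Vocab, int], KnowledgeBase]:
    def empty_kb_factory(vocab: Vocab, entity_vector_length: int) -> KnowledgeBase:
        # Any KnowledgeBase subclass could be returned here; after this patch
        # the entity linker calls this factory instead of hard-coding
        # InMemoryLookupKB.
        return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length)

    return empty_kb_factory
```

In a training config this would be wired up under `[components.entity_linker.generate_empty_kb]` with `@misc = "demo.CustomEmptyKB.v1"`, as the serialization test below does with its `kb_test.CustomEmptyKB.v1`.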
--- spacy/ml/models/entity_linker.py | 8 +++ spacy/pipeline/entity_linker.py | 11 +++- spacy/tests/serialize/test_serialize_kb.py | 71 +++++++++++++++++++--- website/docs/api/architectures.mdx | 10 ++- website/docs/api/entitylinker.mdx | 28 +++++---- 5 files changed, 101 insertions(+), 27 deletions(-) diff --git a/spacy/ml/models/entity_linker.py b/spacy/ml/models/entity_linker.py index 299b6bb52..7332ca199 100644 --- a/spacy/ml/models/entity_linker.py +++ b/spacy/ml/models/entity_linker.py @@ -89,6 +89,14 @@ def load_kb( return kb_from_file +@registry.misc("spacy.EmptyKB.v2") +def empty_kb_for_config() -> Callable[[Vocab, int], KnowledgeBase]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return InMemoryLookupKB(vocab=vocab, entity_vector_length=entity_vector_length) + + return empty_kb_factory + + @registry.misc("spacy.EmptyKB.v1") def empty_kb( entity_vector_length: int, diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index a11964117..f2dae0529 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -54,6 +54,7 @@ DEFAULT_NEL_MODEL = Config().from_str(default_model_config)["model"] "entity_vector_length": 64, "get_candidates": {"@misc": "spacy.CandidateGenerator.v1"}, "get_candidates_batch": {"@misc": "spacy.CandidateBatchGenerator.v1"}, + "generate_empty_kb": {"@misc": "spacy.EmptyKB.v2"}, "overwrite": True, "scorer": {"@scorers": "spacy.entity_linker_scorer.v1"}, "use_gold_ents": True, @@ -80,6 +81,7 @@ def make_entity_linker( get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool, scorer: Optional[Callable], use_gold_ents: bool, @@ -101,6 +103,7 @@ def make_entity_linker( get_candidates_batch ( Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. @@ -135,6 +138,7 @@ def make_entity_linker( entity_vector_length=entity_vector_length, get_candidates=get_candidates, get_candidates_batch=get_candidates_batch, + generate_empty_kb=generate_empty_kb, overwrite=overwrite, scorer=scorer, use_gold_ents=use_gold_ents, @@ -175,6 +179,7 @@ class EntityLinker(TrainablePipe): get_candidates_batch: Callable[ [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]] ], + generate_empty_kb: Callable[[Vocab, int], KnowledgeBase], overwrite: bool = BACKWARD_OVERWRITE, scorer: Optional[Callable] = entity_linker_score, use_gold_ents: bool, @@ -198,6 +203,7 @@ class EntityLinker(TrainablePipe): Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]], Iterable[Candidate]] ): Function that produces a list of candidates, given a certain knowledge base and several textual mentions. + generate_empty_kb (Callable[[Vocab, int], KnowledgeBase]): Callable returning empty KnowledgeBase. scorer (Optional[Callable]): The scoring method. Defaults to Scorer.score_links. use_gold_ents (bool): Whether to copy entities from gold docs or not. If false, another component must provide entity annotations. 
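A side note on the `get_candidates_batch` signature documented in the hunk above: here is a minimal sketch of a custom batched candidate generator, assuming only what that docstring states. The registry name `demo.CandidateBatchGenerator.v1` is hypothetical, and the body simply delegates to the default per-mention lookup rather than doing real batching:

```python
from typing import Callable, Iterable

from spacy.kb import Candidate, KnowledgeBase, get_candidates
from spacy.tokens import Span
from spacy.util import registry


@registry.misc("demo.CandidateBatchGenerator.v1")  # hypothetical registry name
def create_candidate_batch_generator() -> Callable[
    [KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]
]:
    def get_candidates_batch(
        kb: KnowledgeBase, mentions: Iterable[Span]
    ) -> Iterable[Iterable[Candidate]]:
        # A real implementation could batch the KB lookups; this sketch just
        # applies the default generator mention by mention.
        return [get_candidates(kb, mention) for mention in mentions]

    return get_candidates_batch
```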
@@ -220,6 +226,7 @@ class EntityLinker(TrainablePipe): self.model = model self.name = name self.labels_discard = list(labels_discard) + # how many neighbour sentences to take into account self.n_sents = n_sents self.incl_prior = incl_prior self.incl_context = incl_context @@ -227,9 +234,7 @@ class EntityLinker(TrainablePipe): self.get_candidates_batch = get_candidates_batch self.cfg: Dict[str, Any] = {"overwrite": overwrite} self.distance = CosineDistance(normalize=False) - # how many neighbour sentences to take into account - # create an empty KB by default - self.kb = empty_kb(entity_vector_length)(self.vocab) + self.kb = generate_empty_kb(self.vocab, entity_vector_length) self.scorer = scorer self.use_gold_ents = use_gold_ents self.candidates_batch_size = candidates_batch_size diff --git a/spacy/tests/serialize/test_serialize_kb.py b/spacy/tests/serialize/test_serialize_kb.py index 8d3653ab1..f9d2e226b 100644 --- a/spacy/tests/serialize/test_serialize_kb.py +++ b/spacy/tests/serialize/test_serialize_kb.py @@ -1,7 +1,10 @@ -from typing import Callable +from pathlib import Path +from typing import Callable, Iterable, Any, Dict -from spacy import util -from spacy.util import ensure_path, registry, load_model_from_config +import srsly + +from spacy import util, Errors +from spacy.util import ensure_path, registry, load_model_from_config, SimpleFrozenList from spacy.kb.kb_in_memory import InMemoryLookupKB from spacy.vocab import Vocab from thinc.api import Config @@ -91,7 +94,10 @@ def test_serialize_subclassed_kb(): [components.entity_linker] factory = "entity_linker" - + + [components.entity_linker.generate_empty_kb] + @misc = "kb_test.CustomEmptyKB.v1" + [initialize] [initialize.components] @@ -99,7 +105,7 @@ def test_serialize_subclassed_kb(): [initialize.components.entity_linker] [initialize.components.entity_linker.kb_loader] - @misc = "spacy.CustomKB.v1" + @misc = "kb_test.CustomKB.v1" entity_vector_length = 342 custom_field = 666 """ @@ -109,10 +115,57 @@ def test_serialize_subclassed_kb(): super().__init__(vocab, entity_vector_length) self.custom_field = custom_field - @registry.misc("spacy.CustomKB.v1") + def to_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.to_disk() to ensure that self.custom_field is stored as well.""" + path = ensure_path(path) + if not path.exists(): + path.mkdir(parents=True) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def serialize_custom_fields(file_path: Path) -> None: + srsly.write_json(file_path, {"custom_field": self.custom_field}) + + serialize = { + "contents": lambda p: self.write_contents(p), + "strings.json": lambda p: self.vocab.strings.to_disk(p), + "custom_fields": lambda p: serialize_custom_fields(p), + } + util.to_disk(path, serialize, exclude) + + def from_disk(self, path, exclude: Iterable[str] = SimpleFrozenList()): + """We overwrite InMemoryLookupKB.from_disk() to ensure that self.custom_field is loaded as well.""" + path = ensure_path(path) + if not path.exists(): + raise ValueError(Errors.E929.format(loc=path)) + if not path.is_dir(): + raise ValueError(Errors.E928.format(loc=path)) + + def deserialize_custom_fields(file_path: Path) -> None: + self.custom_field = srsly.read_json(file_path)["custom_field"] + + deserialize: Dict[str, Callable[[Any], Any]] = { + "contents": lambda p: self.read_contents(p), + "strings.json": lambda p: self.vocab.strings.from_disk(p), + "custom_fields": lambda p: deserialize_custom_fields(p), + } + util.from_disk(path, 
deserialize, exclude) + + @registry.misc("kb_test.CustomEmptyKB.v1") + def empty_custom_kb() -> Callable[[Vocab, int], SubInMemoryLookupKB]: + def empty_kb_factory(vocab: Vocab, entity_vector_length: int): + return SubInMemoryLookupKB( + vocab=vocab, + entity_vector_length=entity_vector_length, + custom_field=0, + ) + + return empty_kb_factory + + @registry.misc("kb_test.CustomKB.v1") def custom_kb( entity_vector_length: int, custom_field: int - ) -> Callable[[Vocab], InMemoryLookupKB]: + ) -> Callable[[Vocab], SubInMemoryLookupKB]: def custom_kb_factory(vocab): kb = SubInMemoryLookupKB( vocab=vocab, @@ -139,6 +192,6 @@ def test_serialize_subclassed_kb(): nlp2 = util.load_model_from_path(tmp_dir) entity_linker2 = nlp2.get_pipe("entity_linker") # After IO, the KB is the standard one - assert type(entity_linker2.kb) == InMemoryLookupKB + assert type(entity_linker2.kb) == SubInMemoryLookupKB assert entity_linker2.kb.entity_vector_length == 342 - assert not hasattr(entity_linker2.kb, "custom_field") + assert entity_linker2.kb.custom_field == 666 diff --git a/website/docs/api/architectures.mdx b/website/docs/api/architectures.mdx index 966b5830a..268c04a07 100644 --- a/website/docs/api/architectures.mdx +++ b/website/docs/api/architectures.mdx @@ -899,15 +899,21 @@ The `EntityLinker` model architecture is a Thinc `Model` with a | `nO` | Output dimension, determined by the length of the vectors encoding each entity in the KB. If the `nO` dimension is not set, the entity linking component will set it when `initialize` is called. ~~Optional[int]~~ | | **CREATES** | The model using the architecture. ~~Model[List[Doc], Floats2d]~~ | -### spacy.EmptyKB.v1 {id="EmptyKB"} +### spacy.EmptyKB.v1 {id="EmptyKB.v1"} A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) -instance. This is the default when a new entity linker component is created. +instance. | Name | Description | | ---------------------- | ----------------------------------------------------------------------------------- | | `entity_vector_length` | The length of the vectors encoding each entity in the KB. Defaults to `64`. ~~int~~ | +### spacy.EmptyKB.v2 {id="EmptyKB"} + +A function that creates an empty `KnowledgeBase` from a [`Vocab`](/api/vocab) +instance. This is the default when a new entity linker component is created. It +returns a `Callable[[Vocab, int], InMemoryLookupKB]`. + ### spacy.KBFromFile.v1 {id="KBFromFile"} A function that reads an existing `KnowledgeBase` from file. diff --git a/website/docs/api/entitylinker.mdx b/website/docs/api/entitylinker.mdx index bafb2f2da..d84dd3ca9 100644 --- a/website/docs/api/entitylinker.mdx +++ b/website/docs/api/entitylinker.mdx @@ -53,19 +53,21 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` -| Setting | Description | -| ---------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | -| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | -| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. 
~~bool~~ | -| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | -| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | -| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | -| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | -| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | -| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | -| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | -| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | +| Setting | Description | +| --------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `labels_discard` | NER labels that will automatically get a "NIL" prediction. Defaults to `[]`. ~~Iterable[str]~~ | +| `n_sents` | The number of neighbouring sentences to take into account. Defaults to 0. ~~int~~ | +| `incl_prior` | Whether or not to include prior probabilities from the KB in the model. Defaults to `True`. ~~bool~~ | +| `incl_context` | Whether or not to include the local context in the model. Defaults to `True`. ~~bool~~ | +| `model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. Defaults to [EntityLinker](/api/architectures#EntityLinker). ~~Model~~ | +| `entity_vector_length` | Size of encoding vectors in the KB. Defaults to `64`. ~~int~~ | +| `use_gold_ents` | Whether to copy entities from the gold docs or not. Defaults to `True`. If `False`, entities must be set in the training data or by an annotating component in the pipeline. ~~int~~ | +| `get_candidates` | Function that generates plausible candidates for a given `Span` object. Defaults to [CandidateGenerator](/api/architectures#CandidateGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Span], Iterable[Candidate]]~~ | +| `get_candidates_batch` 3.5 | Function that generates plausible candidates for a given batch of `Span` objects. Defaults to [CandidateBatchGenerator](/api/architectures#CandidateBatchGenerator), a function looking up exact, case-dependent aliases in the KB. ~~Callable[[KnowledgeBase, Iterable[Span]], Iterable[Iterable[Candidate]]]~~ | +| `generate_empty_kb` 3.6 | Function that generates an empty `KnowledgeBase` object. Defaults to [`spacy.EmptyKB.v2`](/api/architectures#EmptyKB), which generates an empty [`InMemoryLookupKB`](/api/inmemorylookupkb). 
~~Callable[[Vocab, int], KnowledgeBase]~~ | +| `overwrite` 3.2 | Whether existing annotation is overwritten. Defaults to `True`. ~~bool~~ | +| `scorer` 3.2 | The scoring method. Defaults to [`Scorer.score_links`](/api/scorer#score_links). ~~Optional[Callable]~~ | +| `threshold` 3.4 | Confidence threshold for entity predictions. The default of `None` implies that all predictions are accepted, otherwise those with a score beneath the treshold are discarded. If there are no predictions with scores above the threshold, the linked entity is `NIL`. ~~Optional[float]~~ | ```python %%GITHUB_SPACY/spacy/pipeline/entity_linker.py From e381efd936883c294a1c8d642bc7d05a12c05ae7 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Mar 2023 14:48:57 +0100 Subject: [PATCH 08/15] Partially work around pending deprecation of pkg_resources (#12368) * Handle deprecation of pkg_resources * Replace `pkg_resources` with `importlib_metadata` for `spacy info --url` * Remove requirements check from `spacy project` given the lack of alternatives * Fix installed model URL method and CI test * Fix types/handling, simplify catch-all return * Move imports instead of disabling requirements check * Format * Reenable test with ignored deprecation warning * Fix except * Fix return --- .github/azure-steps.yml | 5 +++++ spacy/cli/info.py | 17 ++++++++--------- spacy/cli/project/run.py | 2 +- spacy/tests/test_cli.py | 4 +++- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index ed69f611b..b2ccf3d81 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -59,6 +59,11 @@ steps: displayName: 'Test download CLI' condition: eq(variables['python_version'], '3.8') + - script: | + python -W error -m spacy info ca_core_news_sm | grep -q download_url + displayName: 'Test download_url in info CLI' + condition: eq(variables['python_version'], '3.8') + - script: | python -W error -c "import ca_core_news_sm; nlp = ca_core_news_sm.load(); doc=nlp('test')" displayName: 'Test no warnings on load (#11713)' diff --git a/spacy/cli/info.py b/spacy/cli/info.py index 974bc0f4e..d82bf3fbc 100644 --- a/spacy/cli/info.py +++ b/spacy/cli/info.py @@ -1,6 +1,5 @@ from typing import Optional, Dict, Any, Union, List import platform -import pkg_resources import json from pathlib import Path from wasabi import Printer, MarkdownRenderer @@ -10,6 +9,7 @@ from ._util import app, Arg, Opt, string_to_list from .download import get_model_filename, get_latest_version from .. import util from .. import about +from ..compat import importlib_metadata @app.command("info") @@ -137,15 +137,14 @@ def info_installed_model_url(model: str) -> Optional[str]: dist-info available. 
""" try: - dist = pkg_resources.get_distribution(model) - data = json.loads(dist.get_metadata("direct_url.json")) - return data["url"] - except pkg_resources.DistributionNotFound: - # no such package - return None + dist = importlib_metadata.distribution(model) + text = dist.read_text("direct_url.json") + if isinstance(text, str): + data = json.loads(text) + return data["url"] except Exception: - # something else, like no file or invalid JSON - return None + pass + return None def info_model_url(model: str) -> Dict[str, Any]: diff --git a/spacy/cli/project/run.py b/spacy/cli/project/run.py index 6dd174902..0f4858a99 100644 --- a/spacy/cli/project/run.py +++ b/spacy/cli/project/run.py @@ -2,7 +2,6 @@ from typing import Optional, List, Dict, Sequence, Any, Iterable, Tuple import os.path from pathlib import Path -import pkg_resources from wasabi import msg from wasabi.util import locale_escape import sys @@ -331,6 +330,7 @@ def _check_requirements(requirements: List[str]) -> Tuple[bool, bool]: RETURNS (Tuple[bool, bool]): Whether (1) any packages couldn't be imported, (2) any packages with version conflicts exist. """ + import pkg_resources failed_pkgs_msgs: List[str] = [] conflicting_pkgs_msgs: List[str] = [] diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index dc7ce46fe..752750d33 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -2,7 +2,6 @@ import os import math from collections import Counter from typing import Tuple, List, Dict, Any -import pkg_resources import time from pathlib import Path @@ -1126,6 +1125,7 @@ def test_cli_find_threshold(capsys): ) +@pytest.mark.filterwarnings("ignore::DeprecationWarning") @pytest.mark.parametrize( "reqs,output", [ @@ -1158,6 +1158,8 @@ def test_cli_find_threshold(capsys): ], ) def test_project_check_requirements(reqs, output): + import pkg_resources + # excessive guard against unlikely package name try: pkg_resources.require("spacyunknowndoesnotexist12345") From a86ec1b2b11dd7feb95f50b10e8a3adffa597bf2 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Mar 2023 17:30:17 +0100 Subject: [PATCH 09/15] Update to use absolute imports in tests (#12372) --- spacy/tests/parser/test_ner.py | 4 ++-- spacy/tests/parser/test_parse.py | 6 +++--- spacy/tests/test_cli.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index 00889efdc..030182a63 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -9,6 +9,8 @@ from spacy.lang.en import English from spacy.lang.it import Italian from spacy.language import Language from spacy.lookups import Lookups +from spacy.pipeline import EntityRecognizer +from spacy.pipeline.ner import DEFAULT_NER_MODEL from spacy.pipeline._parser_internals.ner import BiluoPushDown from spacy.training import Example, iob_to_biluo, split_bilu_label from spacy.tokens import Doc, Span @@ -16,8 +18,6 @@ from spacy.vocab import Vocab import logging from ..util import make_tempdir -from ...pipeline import EntityRecognizer -from ...pipeline.ner import DEFAULT_NER_MODEL TRAIN_DATA = [ ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}), diff --git a/spacy/tests/parser/test_parse.py b/spacy/tests/parser/test_parse.py index aaf31ed56..4b05c6721 100644 --- a/spacy/tests/parser/test_parse.py +++ b/spacy/tests/parser/test_parse.py @@ -8,11 +8,11 @@ from spacy.lang.en import English from spacy.tokens import Doc from spacy.training import Example from spacy.vocab import Vocab +from spacy.pipeline 
import DependencyParser +from spacy.pipeline.dep_parser import DEFAULT_PARSER_MODEL +from spacy.pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL -from ...pipeline import DependencyParser -from ...pipeline.dep_parser import DEFAULT_PARSER_MODEL from ..util import apply_transition_sequence, make_tempdir -from ...pipeline.tok2vec import DEFAULT_TOK2VEC_MODEL TRAIN_DATA = [ ( diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index 752750d33..f5bcdfd23 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -28,6 +28,7 @@ from spacy.cli.debug_data import _print_span_characteristics from spacy.cli.debug_data import _get_spans_length_freq_dist from spacy.cli.download import get_compatibility, get_version from spacy.cli.init_config import RECOMMENDATIONS, init_config, fill_config +from spacy.cli.init_pipeline import _init_labels from spacy.cli.package import get_third_party_dependencies from spacy.cli.package import _is_permitted_package_name from spacy.cli.project.remote_storage import RemoteStorage @@ -46,7 +47,6 @@ from spacy.training.converters import conll_ner_to_docs, conllu_to_docs from spacy.training.converters import iob_to_docs from spacy.util import ENV_VARS, get_minor_version, load_model_from_config, load_config -from ..cli.init_pipeline import _init_labels from .util import make_tempdir From 6177c875392b722fed0724845604727a303bbab3 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 6 Mar 2023 18:06:31 +0100 Subject: [PATCH 10/15] Raise error for non-default vectors with PretrainVectors (#12366) --- spacy/errors.py | 2 + spacy/ml/models/multi_task.py | 3 ++ spacy/tests/training/test_pretraining.py | 47 +++++++++++++++++++----- 3 files changed, 43 insertions(+), 9 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index 2c8b98aad..1047ed21a 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -549,6 +549,8 @@ class Errors(metaclass=ErrorsWithCodes): "during training, make sure to include it in 'annotating components'") # New errors added in v3.x + E850 = ("The PretrainVectors objective currently only supports default " + "vectors, not {mode} vectors.") E851 = ("The 'textcat' component labels should only have values of 0 or 1, " "but found value of '{val}'.") E852 = ("The tar file pulled from the remote attempted an unsafe path " diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py index a7d67c6dd..826fddd4f 100644 --- a/spacy/ml/models/multi_task.py +++ b/spacy/ml/models/multi_task.py @@ -8,6 +8,7 @@ from thinc.loss import Loss from ...util import registry, OOV_RANK from ...errors import Errors from ...attrs import ID +from ...vectors import Mode as VectorsMode import numpy from functools import partial @@ -23,6 +24,8 @@ def create_pretrain_vectors( maxout_pieces: int, hidden_size: int, loss: str ) -> Callable[["Vocab", Model], Model]: def create_vectors_objective(vocab: "Vocab", tok2vec: Model) -> Model: + if vocab.vectors.mode != VectorsMode.default: + raise ValueError(Errors.E850.format(mode=vocab.vectors.mode)) if vocab.vectors.shape[1] == 0: raise ValueError(Errors.E875) model = build_cloze_multi_task_model( diff --git a/spacy/tests/training/test_pretraining.py b/spacy/tests/training/test_pretraining.py index 9359c8485..c0d64f1e7 100644 --- a/spacy/tests/training/test_pretraining.py +++ b/spacy/tests/training/test_pretraining.py @@ -2,17 +2,19 @@ from pathlib import Path import numpy as np import pytest import srsly -from spacy.vocab import Vocab -from thinc.api import Config +from thinc.api import Config, get_current_ops 
+from spacy import util +from spacy.lang.en import English +from spacy.training.initialize import init_nlp +from spacy.training.loop import train +from spacy.training.pretrain import pretrain +from spacy.tokens import Doc, DocBin +from spacy.language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH +from spacy.ml.models.multi_task import create_pretrain_vectors +from spacy.vectors import Vectors +from spacy.vocab import Vocab from ..util import make_tempdir -from ... import util -from ...lang.en import English -from ...training.initialize import init_nlp -from ...training.loop import train -from ...training.pretrain import pretrain -from ...tokens import Doc, DocBin -from ...language import DEFAULT_CONFIG_PRETRAIN_PATH, DEFAULT_CONFIG_PATH pretrain_string_listener = """ [nlp] @@ -346,3 +348,30 @@ def write_vectors_model(tmp_dir): nlp = English(vocab) nlp.to_disk(nlp_path) return str(nlp_path) + + +def test_pretrain_default_vectors(): + nlp = English() + nlp.add_pipe("tok2vec") + nlp.initialize() + + # default vectors are supported + nlp.vocab.vectors = Vectors(shape=(10, 10)) + create_pretrain_vectors(1, 1, "cosine")(nlp.vocab, nlp.get_pipe("tok2vec").model) + + # error for no vectors + with pytest.raises(ValueError, match="E875"): + nlp.vocab.vectors = Vectors() + create_pretrain_vectors(1, 1, "cosine")( + nlp.vocab, nlp.get_pipe("tok2vec").model + ) + + # error for floret vectors + with pytest.raises(ValueError, match="E850"): + ops = get_current_ops() + nlp.vocab.vectors = Vectors( + data=ops.xp.zeros((10, 10)), mode="floret", hash_count=1 + ) + create_pretrain_vectors(1, 1, "cosine")( + nlp.vocab, nlp.get_pipe("tok2vec").model + ) From a1fc4ed9629fa046f52531fa9a396b417772100a Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Tue, 7 Mar 2023 13:29:08 +0100 Subject: [PATCH 11/15] fix types (#12365) --- spacy/lexeme.pyi | 3 ++- spacy/lexeme.pyx | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/spacy/lexeme.pyi b/spacy/lexeme.pyi index 4fcaa82cf..9b7a6156a 100644 --- a/spacy/lexeme.pyi +++ b/spacy/lexeme.pyi @@ -25,7 +25,8 @@ class Lexeme: def orth_(self) -> str: ... @property def text(self) -> str: ... - lower: str + orth: int + lower: int norm: int shape: int prefix: int diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx index 6c66effde..e70feaf9a 100644 --- a/spacy/lexeme.pyx +++ b/spacy/lexeme.pyx @@ -199,7 +199,7 @@ cdef class Lexeme: return self.orth_ property lower: - """RETURNS (str): Lowercase form of the lexeme.""" + """RETURNS (uint64): Lowercase form of the lexeme.""" def __get__(self): return self.c.lower From cbd85c96082dc7fb0758382163d14d56f958cacd Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Wed, 8 Mar 2023 01:47:45 +0900 Subject: [PATCH 12/15] Change GPU efficient textcat to use CNN, not BOW in generated configs (#11900) * Change GPU efficient textcat to use CNN, not BOW If you generate a config with a textcat component using GPU (transformers), the defaut option (efficiency) uses a BOW architecture, which does not use tok2vec features. While that can make sense as part of a larger pipeline, in the case of just a transformer and a textcat, that means the transformer is doing a lot of work for no purpose. This changes it so that the CNN architecture is used instead. It could also be changed to be the same as the accuracy config, which uses the ensemble architecture. 
* Add the transformer when using a textcat with GPU * Switch ubuntu-latest to ubuntu-20.04 in main tests (#11928) * Switch ubuntu-latest to ubuntu-20.04 in main tests * Only use 20.04 for 3.6 * Require thinc v8.1.7 * Require thinc v8.1.8 * Break up longer expression --------- Co-authored-by: Adriane Boyd --- pyproject.toml | 2 +- requirements.txt | 2 +- setup.cfg | 4 +-- spacy/cli/templates/quickstart_training.jinja | 31 ++++++++++++++----- 4 files changed, 27 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 7abd7a96f..9cd96ac2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ requires = [ "cymem>=2.0.2,<2.1.0", "preshed>=3.0.2,<3.1.0", "murmurhash>=0.28.0,<1.1.0", - "thinc>=8.1.0,<8.2.0", + "thinc>=8.1.8,<8.2.0", "numpy>=1.15.0", ] build-backend = "setuptools.build_meta" diff --git a/requirements.txt b/requirements.txt index bc9fc183c..63e03d558 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ spacy-legacy>=3.0.11,<3.1.0 spacy-loggers>=1.0.0,<2.0.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 -thinc>=8.1.0,<8.2.0 +thinc>=8.1.8,<8.2.0 ml_datasets>=0.2.0,<0.3.0 murmurhash>=0.28.0,<1.1.0 wasabi>=0.9.1,<1.2.0 diff --git a/setup.cfg b/setup.cfg index cddc5148c..27499805b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -39,7 +39,7 @@ setup_requires = cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 murmurhash>=0.28.0,<1.1.0 - thinc>=8.1.0,<8.2.0 + thinc>=8.1.8,<8.2.0 install_requires = # Our libraries spacy-legacy>=3.0.11,<3.1.0 @@ -47,7 +47,7 @@ install_requires = murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 - thinc>=8.1.0,<8.2.0 + thinc>=8.1.8,<8.2.0 wasabi>=0.9.1,<1.2.0 srsly>=2.4.3,<3.0.0 catalogue>=2.0.6,<2.1.0 diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index b961ac892..441189341 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -24,8 +24,11 @@ gpu_allocator = null lang = "{{ lang }}" {%- set has_textcat = ("textcat" in components or "textcat_multilabel" in components) -%} {%- set with_accuracy = optimize == "accuracy" -%} -{%- set has_accurate_textcat = has_textcat and with_accuracy -%} -{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or has_accurate_textcat) -%} +{# The BOW textcat doesn't need a source of features, so it can omit the +tok2vec/transformer. 
#} +{%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%} +{%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%} +{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- else -%} {%- set full_pipeline = components -%} @@ -221,10 +224,16 @@ no_output_layer = false {% else -%} [components.textcat.model] -@architectures = "spacy.TextCatBOW.v2" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = true -ngram_size = 1 -no_output_layer = false +nO = null + +[components.textcat.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {%- endif %} {%- endif %} @@ -252,10 +261,16 @@ no_output_layer = false {% else -%} [components.textcat_multilabel.model] -@architectures = "spacy.TextCatBOW.v2" +@architectures = "spacy.TextCatCNN.v2" exclusive_classes = false -ngram_size = 1 -no_output_layer = false +nO = null + +[components.textcat_multilabel.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.textcat_multilabel.model.tok2vec.pooling] +@layers = "reduce_mean.v1" {%- endif %} {%- endif %} From 69ca6eb0414f40ac4000c9d9a269fcd1b26c2117 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marcus=20Bl=C3=A4ttermann?= Date: Wed, 8 Mar 2023 11:59:10 +0100 Subject: [PATCH 13/15] Make sure to run Python setup before NPM dev mode (#12384) --- website/package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/website/package.json b/website/package.json index eeefe32df..5f8bae47e 100644 --- a/website/package.json +++ b/website/package.json @@ -6,6 +6,7 @@ "dev": "next dev", "build": "next build && npm run sitemap && next export", "prebuild": "pip install -r setup/requirements.txt && sh setup/setup.sh", + "predev": "npm run prebuild", "sitemap": "next-sitemap --config next-sitemap.config.mjs", "start": "next start", "lint": "next lint", From 5398e9f2767673621ab22197971b3e876933f4ba Mon Sep 17 00:00:00 2001 From: Victoria <80417010+victorialslocum@users.noreply.github.com> Date: Thu, 9 Mar 2023 10:01:18 +0100 Subject: [PATCH 14/15] Add links in website and readme for survey (#12385) --- README.md | 3 +++ website/src/templates/index.js | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/README.md b/README.md index 49aa6796e..36a015caf 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,9 @@ production-ready [**training system**](https://spacy.io/usage/training) and easy model packaging, deployment and workflow management. spaCy is commercial open-source software, released under the [MIT license](https://github.com/explosion/spaCy/blob/master/LICENSE). 
+💥 **We'd love to hear more about your experience with spaCy!** +[Fill out our survey here.](https://form.typeform.com/to/aMel9q9f) + 💫 **Version 3.5 out now!** [Check out the release notes here.](https://github.com/explosion/spaCy/releases) diff --git a/website/src/templates/index.js b/website/src/templates/index.js index 227b25be8..2ee29a9e9 100644 --- a/website/src/templates/index.js +++ b/website/src/templates/index.js @@ -25,6 +25,11 @@ const AlertSpace = ({ nightly, legacy }) => { const isOnline = useOnlineStatus() return ( <> + {isOnline && ( + + Take our survey here. + + )} {nightly && ( Date: Thu, 9 Mar 2023 17:30:59 +0800 Subject: [PATCH 15/15] Add spancat_singlelabel pipeline for multiclass and non-overlapping span labelling tasks (#11365) * [wip] Update * [wip] Update * Add initial port * [wip] Update * Fix all imports * Add spancat_exclusive to pipeline * [WIP] Update * [ci skip] Add breakpoint for debugging * Use spacy.SpanCategorizer.v1 as default archi * Update spacy/pipeline/spancat_exclusive.py Co-authored-by: kadarakos * [ci skip] Small updates * Use Softmax v2 directly from thinc * Cache the label map * Fix mypy errors However, I ignored line 370 because it opened up a bunch of type errors that might be trickier to solve and might lead to a more complicated codebase. * avoid multiplication with 1.0 Co-authored-by: kadarakos * Update spacy/pipeline/spancat_exclusive.py Co-authored-by: Sofie Van Landeghem * Update component versions to v2 * Add scorer to docstring * Add _n_labels property to SpanCategorizer Instead of using len(self.labels) in initialize() I am using a private property self._n_labels. This achieves implementation parity and allows me to delete the whole initialize() method for spancat_exclusive (since it's now the same with spancat). * Inherit from SpanCat instead of TrainablePipe This commit changes the inheritance structure of Exclusive_Spancat, now it's inheriting from SpanCategorizer than TrainablePipe. This allows me to remove duplicate methods that are already present in the parent function. * Revert documentation link to spancat * Fix init call for exclusive spancat * Update spacy/pipeline/spancat_exclusive.py Co-authored-by: Adriane Boyd * Import Suggester from spancat * Include zero_init.v1 for spancat * Implement _allow_extra_label to use _n_labels To ensure that spancat / spancat_exclusive cannot be resized after initialization, I inherited the _allow_extra_label() method from spacy/pipeline/trainable_pipe.pyx and used self._n_labels instead of len(self.labels) for checking. I think that changing it locally is a better solution rather than forcing each class that inherits TrainablePipe to use the self._n_labels attribute. Also note that I turned-off black formatting in this block of code because it reads better without the overhang. * Extend existing tests to spancat_exclusive In this commit, I extended the existing tests for spancat to include spancat_exclusive. I parametrized the test functions with 'name' (similar var name with textcat and textcat_multilabel) for each applicable test. TODO: Add overfitting tests for spancat_exclusive * Update documentation for spancat * Turn on formatting for allow_extra_label * Remove initializers in default config * Use DEFAULT_EXCL_SPANCAT_MODEL I also renamed spancat_exclusive_default_config into spancat_excl_default_config because black does some not pretty formatting changes. 
* Update documentation Update grammar and usage Co-authored-by: Adriane Boyd * Clarify docstring for Exclusive_SpanCategorizer * Remove mypy ignore and typecast labels to list * Fix documentation API * Use a single variable for tests * Update defaults for number of rows Co-authored-by: Adriane Boyd * Put back initializers in spancat config Whenever I remove model.scorer.init_w and model.scorer.init_b, I encounter an error in the test: SystemError: returned a result with an error set. My Thinc version is 8.1.5, but I can't seem to check what's causing the error. * Update spancat_exclusive docstring * Remove init_W and init_B parameters This commit is expected to fail until the new Thinc release. * Require thinc>=8.1.6 for serializable Softmax defaults * Handle zero suggestions to make tests pass I'm not sure if this is the most elegant solution. But what should happen is that the _make_span_group function MUST return an empty SpanGroup if there are no suggestions. The error happens when the 'scores' variable is empty. We cannot get the 'predicted' and other downstream vars. * Better approach for handling zero suggestions * Update website/docs/api/spancategorizer.md Co-authored-by: Adriane Boyd * Update spancategorizer headers * Apply suggestions from code review Co-authored-by: Sofie Van Landeghem * Add default value in negative_weight in docs * Add default value in allow_overlap in docs * Update how spancat_exclusive is constructed In this commit, I added the following: - Put the default values of negative_weight and allow_overlap in the default_config dictionary. - Rename make_spancat -> make_exclusive_spancat * Run prettier on spancategorizer.mdx * Change exactly one -> at most one * Add suggester documentation in Exclusive_SpanCategorizer * Add suggester to spancat docstrings * merge multilabel and singlelabel spancat * rename spancat_exclusive to singlelable * wire up different make_spangroups for single and multilabel * black * black * add docstrings * more docstring and fix negative_label * don't rely on default arguments * black * remove spancat exclusive * replace single_label with add_negative_label and adjust inference * mypy * logical bug in configuration check * add spans.attrs[scores] * single label make_spangroup test * bugfix * black * tests for make_span_group with negative labels * refactor make_span_group * black * Update spacy/tests/pipeline/test_spancat.py Co-authored-by: Adriane Boyd * remove duplicate declaration * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * raise error instead of just print * make label mapper private * update docs * run prettier * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * don't keep recomputing self._label_map for each span * typo in docs * Intervals to private and document 'name' param * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * Update spacy/pipeline/spancat.py Co-authored-by: Adriane Boyd * add Tag to new features * replace tags * revert * revert * revert * revert * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * Update website/docs/api/spancategorizer.mdx Co-authored-by: Adriane Boyd * prettier * Fix merge * Update 
website/docs/api/spancategorizer.mdx * remove references to 'single_label' * remove old paragraph * Add spancat_singlelabel to config template * Format * Extend init config tests --------- Co-authored-by: kadarakos Co-authored-by: Sofie Van Landeghem Co-authored-by: Adriane Boyd --- spacy/cli/templates/quickstart_training.jinja | 61 +++- spacy/errors.py | 1 + spacy/pipeline/spancat.py | 323 ++++++++++++++++-- spacy/tests/pipeline/test_spancat.py | 157 ++++++++- spacy/tests/test_cli.py | 9 +- website/docs/api/spancategorizer.mdx | 68 ++-- 6 files changed, 552 insertions(+), 67 deletions(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index 441189341..c5e8c6c43 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -3,7 +3,7 @@ the docs and the init config command. It encodes various best practices and can help generate the best possible configuration, given a user's requirements. #} {%- set use_transformer = hardware != "cpu" and transformer_data -%} {%- set transformer = transformer_data[optimize] if use_transformer else {} -%} -{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "trainable_lemmatizer"] -%} +{%- set listener_components = ["tagger", "morphologizer", "parser", "ner", "textcat", "textcat_multilabel", "entity_linker", "spancat", "spancat_singlelabel", "trainable_lemmatizer"] -%} [paths] train = null dev = null @@ -28,7 +28,7 @@ lang = "{{ lang }}" tok2vec/transformer. #} {%- set with_accuracy_or_transformer = (use_transformer or with_accuracy) -%} {%- set textcat_needs_features = has_textcat and with_accuracy_or_transformer -%} -{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} +{%- if ("tagger" in components or "morphologizer" in components or "parser" in components or "ner" in components or "spancat" in components or "spancat_singlelabel" in components or "trainable_lemmatizer" in components or "entity_linker" in components or textcat_needs_features) -%} {%- set full_pipeline = ["transformer" if use_transformer else "tok2vec"] + components -%} {%- else -%} {%- set full_pipeline = components -%} @@ -159,6 +159,36 @@ grad_factor = 1.0 sizes = [1,2,3] {% endif -%} +{% if "spancat_singlelabel" in components %} +[components.spancat_singlelabel] +factory = "spancat_singlelabel" +negative_weight = 1.0 +allow_overlap = true +scorer = {"@scorers":"spacy.spancat_scorer.v1"} +spans_key = "sc" + +[components.spancat_singlelabel.model] +@architectures = "spacy.SpanCategorizer.v1" + +[components.spancat_singlelabel.model.reducer] +@layers = "spacy.mean_max_reducer.v1" +hidden_size = 128 + +[components.spancat_singlelabel.model.scorer] +@layers = "Softmax.v2" + +[components.spancat_singlelabel.model.tok2vec] +@architectures = "spacy-transformers.TransformerListener.v1" +grad_factor = 1.0 + +[components.spancat_singlelabel.model.tok2vec.pooling] +@layers = "reduce_mean.v1" + +[components.spancat_singlelabel.suggester] +@misc = "spacy.ngram_suggester.v1" +sizes = [1,2,3] +{% endif %} + {% if "trainable_lemmatizer" in components -%} [components.trainable_lemmatizer] factory = "trainable_lemmatizer" @@ -389,6 +419,33 @@ width = ${components.tok2vec.model.encode.width} sizes = [1,2,3] {% endif %} +{% if 
"spancat_singlelabel" in components %} +[components.spancat_singlelabel] +factory = "spancat_singlelabel" +negative_weight = 1.0 +allow_overlap = true +scorer = {"@scorers":"spacy.spancat_scorer.v1"} +spans_key = "sc" + +[components.spancat_singlelabel.model] +@architectures = "spacy.SpanCategorizer.v1" + +[components.spancat_singlelabel.model.reducer] +@layers = "spacy.mean_max_reducer.v1" +hidden_size = 128 + +[components.spancat_singlelabel.model.scorer] +@layers = "Softmax.v2" + +[components.spancat_singlelabel.model.tok2vec] +@architectures = "spacy.Tok2VecListener.v1" +width = ${components.tok2vec.model.encode.width} + +[components.spancat_singlelabel.suggester] +@misc = "spacy.ngram_suggester.v1" +sizes = [1,2,3] +{% endif %} + {% if "trainable_lemmatizer" in components -%} [components.trainable_lemmatizer] factory = "trainable_lemmatizer" diff --git a/spacy/errors.py b/spacy/errors.py index 1047ed21a..c897c29ff 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -969,6 +969,7 @@ class Errors(metaclass=ErrorsWithCodes): "with `displacy.serve(doc, port=port)`") E1050 = ("Port {port} is already in use. Please specify an available port with `displacy.serve(doc, port=port)` " "or use `auto_select_port=True` to pick an available port automatically.") + E1051 = ("'allow_overlap' can only be False when max_positive is 1, but found 'max_positive': {max_positive}.") # Deprecated model shortcuts, only used in errors and warnings diff --git a/spacy/pipeline/spancat.py b/spacy/pipeline/spancat.py index a3388e81a..983e1fba9 100644 --- a/spacy/pipeline/spancat.py +++ b/spacy/pipeline/spancat.py @@ -1,4 +1,5 @@ -from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any +from typing import List, Dict, Callable, Tuple, Optional, Iterable, Any, cast, Union +from dataclasses import dataclass from thinc.api import Config, Model, get_current_ops, set_dropout_rate, Ops from thinc.api import Optimizer from thinc.types import Ragged, Ints2d, Floats2d @@ -43,7 +44,36 @@ maxout_pieces = 3 depth = 4 """ +spancat_singlelabel_default_config = """ +[model] +@architectures = "spacy.SpanCategorizer.v1" +scorer = {"@layers": "Softmax.v2"} + +[model.reducer] +@layers = spacy.mean_max_reducer.v1 +hidden_size = 128 + +[model.tok2vec] +@architectures = "spacy.Tok2Vec.v2" +[model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = 96 +rows = [5000, 1000, 2500, 1000] +attrs = ["NORM", "PREFIX", "SUFFIX", "SHAPE"] +include_static_vectors = false + +[model.tok2vec.encode] +@architectures = "spacy.MaxoutWindowEncoder.v2" +width = ${model.tok2vec.embed.width} +window_size = 1 +maxout_pieces = 3 +depth = 4 +""" + DEFAULT_SPANCAT_MODEL = Config().from_str(spancat_default_config)["model"] +DEFAULT_SPANCAT_SINGLELABEL_MODEL = Config().from_str( + spancat_singlelabel_default_config +)["model"] @runtime_checkable @@ -119,10 +149,14 @@ def make_spancat( threshold: float, max_positive: Optional[int], ) -> "SpanCategorizer": - """Create a SpanCategorizer component. The span categorizer consists of two + """Create a SpanCategorizer component and configure it for multi-label + classification to be able to assign multiple labels for each span. + The span categorizer consists of two parts: a suggester function that proposes candidate spans, and a labeller model that predicts one or more labels for each span. + name (str): The component instance name, used to add entries to the + losses during training. suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. 
Spans are returned as a ragged array with two integer columns, for the start and end positions. @@ -144,12 +178,80 @@ def make_spancat( """ return SpanCategorizer( nlp.vocab, - suggester=suggester, model=model, - spans_key=spans_key, - threshold=threshold, - max_positive=max_positive, + suggester=suggester, name=name, + spans_key=spans_key, + negative_weight=None, + allow_overlap=True, + max_positive=max_positive, + threshold=threshold, + scorer=scorer, + add_negative_label=False, + ) + + +@Language.factory( + "spancat_singlelabel", + assigns=["doc.spans"], + default_config={ + "spans_key": "sc", + "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL, + "negative_weight": 1.0, + "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]}, + "scorer": {"@scorers": "spacy.spancat_scorer.v1"}, + "allow_overlap": True, + }, + default_score_weights={"spans_sc_f": 1.0, "spans_sc_p": 0.0, "spans_sc_r": 0.0}, +) +def make_spancat_singlelabel( + nlp: Language, + name: str, + suggester: Suggester, + model: Model[Tuple[List[Doc], Ragged], Floats2d], + spans_key: str, + negative_weight: float, + allow_overlap: bool, + scorer: Optional[Callable], +) -> "SpanCategorizer": + """Create a SpanCategorizer component and configure it for multi-class + classification. With this configuration each span can get at most one + label. The span categorizer consists of two + parts: a suggester function that proposes candidate spans, and a labeller + model that predicts one or more labels for each span. + + name (str): The component instance name, used to add entries to the + losses during training. + suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans. + Spans are returned as a ragged array with two integer columns, for the + start and end positions. + model (Model[Tuple[List[Doc], Ragged], Floats2d]): A model instance that + is given a list of documents and (start, end) indices representing + candidate span offsets. The model predicts a probability for each category + for each span. + spans_key (str): Key of the doc.spans dict to save the spans under. During + initialization and training, the component will look for spans on the + reference document under the same key. + scorer (Optional[Callable]): The scoring method. Defaults to + Scorer.score_spans for the Doc.spans[spans_key] with overlapping + spans allowed. + negative_weight (float): Multiplier for the loss terms. + Can be used to downweight the negative samples if there are too many. + allow_overlap (bool): If True the data is assumed to contain overlapping spans. + Otherwise it produces non-overlapping spans greedily prioritizing + higher assigned label scores. + """ + return SpanCategorizer( + nlp.vocab, + model=model, + suggester=suggester, + name=name, + spans_key=spans_key, + negative_weight=negative_weight, + allow_overlap=allow_overlap, + max_positive=1, + add_negative_label=True, + threshold=None, scorer=scorer, ) @@ -172,6 +274,27 @@ def make_spancat_scorer(): return spancat_score +@dataclass +class _Intervals: + """ + Helper class to avoid storing overlapping spans. + """ + + def __init__(self): + self.ranges = set() + + def add(self, i, j): + for e in range(i, j): + self.ranges.add(e) + + def __contains__(self, rang): + i, j = rang + for e in range(i, j): + if e in self.ranges: + return True + return False + + class SpanCategorizer(TrainablePipe): """Pipeline component to label spans of text. 
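As an aside for reviewers: the `_Intervals` helper above is what drives the
greedy non-overlap filtering. A self-contained sketch (illustrative only,
mirroring the class as added here, with made-up candidate offsets):

```python
# Greedy overlap filtering: walk candidates in descending score order and
# keep a span only if none of its token positions was claimed earlier.
class _Intervals:
    def __init__(self):
        self.ranges = set()

    def add(self, i, j):
        # Claim every token position in [i, j).
        for e in range(i, j):
            self.ranges.add(e)

    def __contains__(self, rang):
        # A candidate (i, j) overlaps if any of its positions is claimed.
        i, j = rang
        return any(e in self.ranges for e in range(i, j))


candidates = [(0, 2), (1, 3), (2, 4)]  # already sorted by score, descending
seen = _Intervals()
kept = []
for start, end in candidates:
    if (start, end) not in seen:
        seen.add(start, end)
        kept.append((start, end))
print(kept)  # [(0, 2), (2, 4)]: the overlapping (1, 3) is dropped
```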
@@ -185,25 +308,43 @@ class SpanCategorizer(TrainablePipe):
         suggester: Suggester,
         name: str = "spancat",
         *,
+        add_negative_label: bool = False,
         spans_key: str = "spans",
-        threshold: float = 0.5,
+        negative_weight: Optional[float] = 1.0,
+        allow_overlap: Optional[bool] = True,
         max_positive: Optional[int] = None,
+        threshold: Optional[float] = 0.5,
         scorer: Optional[Callable] = spancat_score,
     ) -> None:
-        """Initialize the span categorizer.
+        """Initialize the multi-label or multi-class span categorizer.
+
         vocab (Vocab): The shared vocabulary.
         model (thinc.api.Model): The Thinc Model powering the pipeline component.
+            For multi-class classification (single label per span) we recommend
+            using a Softmax classifier as the final layer, while for multi-label
+            classification (multiple possible labels per span) we recommend Logistic.
+        suggester (Callable[[Iterable[Doc], Optional[Ops]], Ragged]): A function that suggests spans.
+            Spans are returned as a ragged array with two integer columns, for the
+            start and end positions.
         name (str): The component instance name, used to add entries to the
             losses during training.
         spans_key (str): Key of the Doc.spans dict to save the spans under.
             During initialization and training, the component will look for
             spans on the reference document under the same key. Defaults to
             `"spans"`.
-        threshold (float): Minimum probability to consider a prediction
-            positive. Spans with a positive prediction will be saved on the Doc.
-            Defaults to 0.5.
+        add_negative_label (bool): Learn to predict a special 'negative_label'
+            when a Span is not annotated.
+        threshold (Optional[float]): Minimum probability to consider a prediction
+            positive. Defaults to 0.5. Spans with a positive prediction will be saved
+            on the Doc.
         max_positive (Optional[int]): Maximum number of labels to consider
             positive per span. Defaults to None, indicating no limit.
+        negative_weight (float): Multiplier for the loss terms.
+            Can be used to downweight the negative samples if there are too many
+            when add_negative_label is True. Otherwise it is unused.
+        allow_overlap (bool): If True the data is assumed to contain overlapping spans.
+            Otherwise it produces non-overlapping spans greedily prioritizing
+            higher assigned label scores. Only used when max_positive is 1.
         scorer (Optional[Callable]): The scoring method. Defaults to
             Scorer.score_spans for the Doc.spans[spans_key] with overlapping
             spans allowed.
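For reference, a minimal construction sketch (not part of the patch; assumes
spaCy >= 3.5.1 with this change applied, and hypothetical labels and text):

```python
# Build and initialize the single-label component via its factory defaults.
import spacy

nlp = spacy.blank("en")
spancat = nlp.add_pipe(
    "spancat_singlelabel",
    config={"spans_key": "sc", "negative_weight": 1.0, "allow_overlap": True},
)
spancat.add_label("PERSON")
spancat.add_label("LOC")
nlp.initialize()

doc = nlp("Berlin is a city in Germany.")
# At most one label per span; the untrained model's predictions are arbitrary.
print(doc.spans["sc"])
```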
@@ -215,12 +356,17 @@ class SpanCategorizer(TrainablePipe): "spans_key": spans_key, "threshold": threshold, "max_positive": max_positive, + "negative_weight": negative_weight, + "allow_overlap": allow_overlap, } self.vocab = vocab self.suggester = suggester self.model = model self.name = name self.scorer = scorer + self.add_negative_label = add_negative_label + if not allow_overlap and max_positive is not None and max_positive > 1: + raise ValueError(Errors.E1051.format(max_positive=max_positive)) @property def key(self) -> str: @@ -230,6 +376,21 @@ class SpanCategorizer(TrainablePipe): """ return str(self.cfg["spans_key"]) + def _allow_extra_label(self) -> None: + """Raise an error if the component can not add any more labels.""" + nO = None + if self.model.has_dim("nO"): + nO = self.model.get_dim("nO") + elif self.model.has_ref("output_layer") and self.model.get_ref( + "output_layer" + ).has_dim("nO"): + nO = self.model.get_ref("output_layer").get_dim("nO") + if nO is not None and nO == self._n_labels: + if not self.is_resizable: + raise ValueError( + Errors.E922.format(name=self.name, nO=self.model.get_dim("nO")) + ) + def add_label(self, label: str) -> int: """Add a new label to the pipe. @@ -263,6 +424,27 @@ class SpanCategorizer(TrainablePipe): """ return list(self.labels) + @property + def _label_map(self) -> Dict[str, int]: + """RETURNS (Dict[str, int]): The label map.""" + return {label: i for i, label in enumerate(self.labels)} + + @property + def _n_labels(self) -> int: + """RETURNS (int): Number of labels.""" + if self.add_negative_label: + return len(self.labels) + 1 + else: + return len(self.labels) + + @property + def _negative_label_i(self) -> Union[int, None]: + """RETURNS (Union[int, None]): Index of the negative label.""" + if self.add_negative_label: + return len(self.label_data) + else: + return None + def predict(self, docs: Iterable[Doc]): """Apply the pipeline's model to a batch of docs, without modifying them. @@ -304,14 +486,24 @@ class SpanCategorizer(TrainablePipe): DOCS: https://spacy.io/api/spancategorizer#set_annotations """ - labels = self.labels indices, scores = indices_scores offset = 0 for i, doc in enumerate(docs): indices_i = indices[i].dataXd - doc.spans[self.key] = self._make_span_group( - doc, indices_i, scores[offset : offset + indices.lengths[i]], labels # type: ignore[arg-type] - ) + allow_overlap = cast(bool, self.cfg["allow_overlap"]) + if self.cfg["max_positive"] == 1: + doc.spans[self.key] = self._make_span_group_singlelabel( + doc, + indices_i, + scores[offset : offset + indices.lengths[i]], + allow_overlap, + ) + else: + doc.spans[self.key] = self._make_span_group_multilabel( + doc, + indices_i, + scores[offset : offset + indices.lengths[i]], + ) offset += indices.lengths[i] def update( @@ -371,9 +563,11 @@ class SpanCategorizer(TrainablePipe): spans = Ragged( self.model.ops.to_numpy(spans.data), self.model.ops.to_numpy(spans.lengths) ) - label_map = {label: i for i, label in enumerate(self.labels)} target = numpy.zeros(scores.shape, dtype=scores.dtype) + if self.add_negative_label: + negative_spans = numpy.ones((scores.shape[0])) offset = 0 + label_map = self._label_map for i, eg in enumerate(examples): # Map (start, end) offset of spans to the row in the d_scores array, # so that we can adjust the gradient for predictions that were @@ -390,10 +584,16 @@ class SpanCategorizer(TrainablePipe): row = spans_index[key] k = label_map[gold_span.label_] target[row, k] = 1.0 + if self.add_negative_label: + # delete negative label target. 
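+                    # (a gold span exists for this row, so it must not count as a negative example)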
+ negative_spans[row] = 0.0 # The target is a flat array for all docs. Track the position # we're at within the flat array. offset += spans.lengths[i] target = self.model.ops.asarray(target, dtype="f") # type: ignore + if self.add_negative_label: + negative_samples = numpy.nonzero(negative_spans)[0] + target[negative_samples, self._negative_label_i] = 1.0 # type: ignore # The target will have the values 0 (for untrue predictions) or 1 # (for true predictions). # The scores should be in the range [0, 1]. @@ -402,6 +602,10 @@ class SpanCategorizer(TrainablePipe): # If the prediction is 0.9 and it's false, the gradient will be # 0.9 (0.9 - 0.0) d_scores = scores - target + if self.add_negative_label: + neg_weight = cast(float, self.cfg["negative_weight"]) + if neg_weight != 1.0: + d_scores[negative_samples] *= neg_weight loss = float((d_scores**2).sum()) return loss, d_scores @@ -438,7 +642,7 @@ class SpanCategorizer(TrainablePipe): if subbatch: docs = [eg.x for eg in subbatch] spans = build_ngram_suggester(sizes=[1])(docs) - Y = self.model.ops.alloc2f(spans.dataXd.shape[0], len(self.labels)) + Y = self.model.ops.alloc2f(spans.dataXd.shape[0], self._n_labels) self.model.initialize(X=(docs, spans), Y=Y) else: self.model.initialize() @@ -452,31 +656,96 @@ class SpanCategorizer(TrainablePipe): eg.reference.spans.get(self.key, []), allow_overlap=True ) - def _make_span_group( - self, doc: Doc, indices: Ints2d, scores: Floats2d, labels: List[str] + def _make_span_group_multilabel( + self, + doc: Doc, + indices: Ints2d, + scores: Floats2d, ) -> SpanGroup: + """Find the top-k labels for each span (k=max_positive).""" spans = SpanGroup(doc, name=self.key) - max_positive = self.cfg["max_positive"] + if scores.size == 0: + return spans + scores = self.model.ops.to_numpy(scores) + indices = self.model.ops.to_numpy(indices) threshold = self.cfg["threshold"] + max_positive = self.cfg["max_positive"] keeps = scores >= threshold - ranked = (scores * -1).argsort() # type: ignore if max_positive is not None: assert isinstance(max_positive, int) + if self.add_negative_label: + negative_scores = numpy.copy(scores[:, self._negative_label_i]) + scores[:, self._negative_label_i] = -numpy.inf + ranked = (scores * -1).argsort() # type: ignore + scores[:, self._negative_label_i] = negative_scores + else: + ranked = (scores * -1).argsort() # type: ignore span_filter = ranked[:, max_positive:] for i, row in enumerate(span_filter): keeps[i, row] = False - spans.attrs["scores"] = scores[keeps].flatten() - - indices = self.model.ops.to_numpy(indices) - keeps = self.model.ops.to_numpy(keeps) + attrs_scores = [] for i in range(indices.shape[0]): start = indices[i, 0] end = indices[i, 1] - for j, keep in enumerate(keeps[i]): if keep: - spans.append(Span(doc, start, end, label=labels[j])) + if j != self._negative_label_i: + spans.append(Span(doc, start, end, label=self.labels[j])) + attrs_scores.append(scores[i, j]) + spans.attrs["scores"] = numpy.array(attrs_scores) + return spans + + def _make_span_group_singlelabel( + self, + doc: Doc, + indices: Ints2d, + scores: Floats2d, + allow_overlap: bool = True, + ) -> SpanGroup: + """Find the argmax label for each span.""" + # Handle cases when there are zero suggestions + if scores.size == 0: + return SpanGroup(doc, name=self.key) + scores = self.model.ops.to_numpy(scores) + indices = self.model.ops.to_numpy(indices) + predicted = scores.argmax(axis=1) + argmax_scores = numpy.take_along_axis( + scores, numpy.expand_dims(predicted, 1), axis=1 + ) + keeps = 
numpy.ones(predicted.shape, dtype=bool) + # Remove samples where the negative label is the argmax. + if self.add_negative_label: + keeps = numpy.logical_and(keeps, predicted != self._negative_label_i) + # Filter samples according to threshold. + threshold = self.cfg["threshold"] + if threshold is not None: + keeps = numpy.logical_and(keeps, (argmax_scores >= threshold).squeeze()) + # Sort spans according to argmax probability + if not allow_overlap: + # Get the probabilities + sort_idx = (argmax_scores.squeeze() * -1).argsort() + predicted = predicted[sort_idx] + indices = indices[sort_idx] + keeps = keeps[sort_idx] + seen = _Intervals() + spans = SpanGroup(doc, name=self.key) + attrs_scores = [] + for i in range(indices.shape[0]): + if not keeps[i]: + continue + + label = predicted[i] + start = indices[i, 0] + end = indices[i, 1] + + if not allow_overlap: + if (start, end) in seen: + continue + else: + seen.add(start, end) + attrs_scores.append(argmax_scores[i]) + spans.append(Span(doc, start, end, label=self.labels[label])) return spans diff --git a/spacy/tests/pipeline/test_spancat.py b/spacy/tests/pipeline/test_spancat.py index e9db983d3..cf6304042 100644 --- a/spacy/tests/pipeline/test_spancat.py +++ b/spacy/tests/pipeline/test_spancat.py @@ -15,6 +15,8 @@ OPS = get_current_ops() SPAN_KEY = "labeled_spans" +SPANCAT_COMPONENTS = ["spancat", "spancat_singlelabel"] + TRAIN_DATA = [ ("Who is Shaka Khan?", {"spans": {SPAN_KEY: [(7, 17, "PERSON")]}}), ( @@ -41,38 +43,42 @@ def make_examples(nlp, data=TRAIN_DATA): return train_examples -def test_no_label(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_no_label(name): nlp = Language() - nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) with pytest.raises(ValueError): nlp.initialize() -def test_no_resize(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_no_resize(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) spancat.add_label("Thing") spancat.add_label("Phrase") assert spancat.labels == ("Thing", "Phrase") nlp.initialize() - assert spancat.model.get_dim("nO") == 2 + assert spancat.model.get_dim("nO") == spancat._n_labels # this throws an error because the spancat can't be resized after initialization with pytest.raises(ValueError): spancat.add_label("Stuff") -def test_implicit_labels(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_implicit_labels(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) assert len(spancat.labels) == 0 train_examples = make_examples(nlp) nlp.initialize(get_examples=lambda: train_examples) assert spancat.labels == ("PERSON", "LOC") -def test_explicit_labels(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_explicit_labels(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) assert len(spancat.labels) == 0 spancat.add_label("PERSON") spancat.add_label("LOC") @@ -102,13 +108,13 @@ def test_doc_gc(): # XXX This fails with length 0 sometimes assert len(spangroup) > 0 with pytest.raises(RuntimeError): - span = spangroup[0] + spangroup[0] @pytest.mark.parametrize( "max_positive,nr_results", [(None, 4), (1, 2), (2, 3), (3, 4), (4, 4)] ) -def test_make_spangroup(max_positive, nr_results): 
+def test_make_spangroup_multilabel(max_positive, nr_results): fix_random_seed(0) nlp = Language() spancat = nlp.add_pipe( @@ -120,10 +126,12 @@ def test_make_spangroup(max_positive, nr_results): indices = ngram_suggester([doc])[0].dataXd assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) labels = ["Thing", "City", "Person", "GreatCity"] + for label in labels: + spancat.add_label(label) scores = numpy.asarray( [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" ) - spangroup = spancat._make_span_group(doc, indices, scores, labels) + spangroup = spancat._make_span_group_multilabel(doc, indices, scores) assert len(spangroup) == nr_results # first span is always the second token "London" @@ -154,6 +162,118 @@ def test_make_spangroup(max_positive, nr_results): assert_almost_equal(0.9, spangroup.attrs["scores"][-1], 5) +@pytest.mark.parametrize( + "threshold,allow_overlap,nr_results", + [(0.05, True, 3), (0.05, False, 1), (0.5, True, 2), (0.5, False, 1)], +) +def test_make_spangroup_singlelabel(threshold, allow_overlap, nr_results): + fix_random_seed(0) + nlp = Language() + spancat = nlp.add_pipe( + "spancat", + config={ + "spans_key": SPAN_KEY, + "threshold": threshold, + "max_positive": 1, + }, + ) + doc = nlp.make_doc("Greater London") + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) + indices = ngram_suggester([doc])[0].dataXd + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) + labels = ["Thing", "City", "Person", "GreatCity"] + for label in labels: + spancat.add_label(label) + scores = numpy.asarray( + [[0.2, 0.4, 0.3, 0.1], [0.1, 0.6, 0.2, 0.4], [0.8, 0.7, 0.3, 0.9]], dtype="f" + ) + spangroup = spancat._make_span_group_singlelabel( + doc, indices, scores, allow_overlap + ) + assert len(spangroup) == nr_results + if threshold > 0.4: + if allow_overlap: + assert spangroup[0].text == "London" + assert spangroup[0].label_ == "City" + assert spangroup[1].text == "Greater London" + assert spangroup[1].label_ == "GreatCity" + + else: + assert spangroup[0].text == "Greater London" + assert spangroup[0].label_ == "GreatCity" + else: + if allow_overlap: + assert spangroup[0].text == "Greater" + assert spangroup[0].label_ == "City" + assert spangroup[1].text == "London" + assert spangroup[1].label_ == "City" + assert spangroup[2].text == "Greater London" + assert spangroup[2].label_ == "GreatCity" + else: + assert spangroup[0].text == "Greater London" + + +def test_make_spangroup_negative_label(): + fix_random_seed(0) + nlp_single = Language() + nlp_multi = Language() + spancat_single = nlp_single.add_pipe( + "spancat", + config={ + "spans_key": SPAN_KEY, + "threshold": 0.1, + "max_positive": 1, + }, + ) + spancat_multi = nlp_multi.add_pipe( + "spancat", + config={ + "spans_key": SPAN_KEY, + "threshold": 0.1, + "max_positive": 2, + }, + ) + spancat_single.add_negative_label = True + spancat_multi.add_negative_label = True + doc = nlp_single.make_doc("Greater London") + labels = ["Thing", "City", "Person", "GreatCity"] + for label in labels: + spancat_multi.add_label(label) + spancat_single.add_label(label) + ngram_suggester = registry.misc.get("spacy.ngram_suggester.v1")(sizes=[1, 2]) + indices = ngram_suggester([doc])[0].dataXd + assert_array_equal(OPS.to_numpy(indices), numpy.asarray([[0, 1], [1, 2], [0, 2]])) + scores = numpy.asarray( + [ + [0.2, 0.4, 0.3, 0.1, 0.1], + [0.1, 0.6, 0.2, 0.4, 0.9], + [0.8, 0.7, 0.3, 0.9, 0.1], + ], + dtype="f", + ) + spangroup_multi = 
spancat_multi._make_span_group_multilabel(doc, indices, scores) + spangroup_single = spancat_single._make_span_group_singlelabel(doc, indices, scores) + assert len(spangroup_single) == 2 + assert spangroup_single[0].text == "Greater" + assert spangroup_single[0].label_ == "City" + assert spangroup_single[1].text == "Greater London" + assert spangroup_single[1].label_ == "GreatCity" + + assert len(spangroup_multi) == 6 + assert spangroup_multi[0].text == "Greater" + assert spangroup_multi[0].label_ == "City" + assert spangroup_multi[1].text == "Greater" + assert spangroup_multi[1].label_ == "Person" + assert spangroup_multi[2].text == "London" + assert spangroup_multi[2].label_ == "City" + assert spangroup_multi[3].text == "London" + assert spangroup_multi[3].label_ == "GreatCity" + assert spangroup_multi[4].text == "Greater London" + assert spangroup_multi[4].label_ == "Thing" + assert spangroup_multi[5].text == "Greater London" + assert spangroup_multi[5].label_ == "GreatCity" + + def test_ngram_suggester(en_tokenizer): # test different n-gram lengths for size in [1, 2, 3]: @@ -371,9 +491,9 @@ def test_overfitting_IO_overlapping(): assert set([span.label_ for span in spans2]) == {"LOC", "DOUBLE_LOC"} -def test_zero_suggestions(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_zero_suggestions(name): # Test with a suggester that can return 0 suggestions - @registry.misc("test_mixed_zero_suggester") def make_mixed_zero_suggester(): def mixed_zero_suggester(docs, *, ops=None): @@ -400,7 +520,7 @@ def test_zero_suggestions(): fix_random_seed(0) nlp = English() spancat = nlp.add_pipe( - "spancat", + name, config={ "suggester": {"@misc": "test_mixed_zero_suggester"}, "spans_key": SPAN_KEY, @@ -408,7 +528,7 @@ def test_zero_suggestions(): ) train_examples = make_examples(nlp) optimizer = nlp.initialize(get_examples=lambda: train_examples) - assert spancat.model.get_dim("nO") == 2 + assert spancat.model.get_dim("nO") == spancat._n_labels assert set(spancat.labels) == {"LOC", "PERSON"} nlp.update(train_examples, sgd=optimizer) @@ -424,9 +544,10 @@ def test_zero_suggestions(): list(nlp.pipe(["", "one", "three three three"])) -def test_set_candidates(): +@pytest.mark.parametrize("name", SPANCAT_COMPONENTS) +def test_set_candidates(name): nlp = Language() - spancat = nlp.add_pipe("spancat", config={"spans_key": SPAN_KEY}) + spancat = nlp.add_pipe(name, config={"spans_key": SPAN_KEY}) train_examples = make_examples(nlp) nlp.initialize(get_examples=lambda: train_examples) texts = [ diff --git a/spacy/tests/test_cli.py b/spacy/tests/test_cli.py index f5bcdfd23..1fdf059b3 100644 --- a/spacy/tests/test_cli.py +++ b/spacy/tests/test_cli.py @@ -552,7 +552,14 @@ def test_parse_cli_overrides(): @pytest.mark.parametrize("lang", ["en", "nl"]) @pytest.mark.parametrize( - "pipeline", [["tagger", "parser", "ner"], [], ["ner", "textcat", "sentencizer"]] + "pipeline", + [ + ["tagger", "parser", "ner"], + [], + ["ner", "textcat", "sentencizer"], + ["morphologizer", "spancat", "entity_linker"], + ["spancat_singlelabel", "textcat_multilabel"], + ], ) @pytest.mark.parametrize("optimize", ["efficiency", "accuracy"]) @pytest.mark.parametrize("pretraining", [True, False]) diff --git a/website/docs/api/spancategorizer.mdx b/website/docs/api/spancategorizer.mdx index f39c0aff9..c7de2324b 100644 --- a/website/docs/api/spancategorizer.mdx +++ b/website/docs/api/spancategorizer.mdx @@ -13,6 +13,13 @@ A span categorizer consists of two parts: a [suggester function](#suggesters) that proposes candidate spans, 
which may or may not overlap, and a labeler model that predicts zero or more labels for each candidate. +This component comes in two forms: `spancat` and `spancat_singlelabel` (added in +spaCy v3.5.1). When you need to perform multi-label classification on your +spans, use `spancat`. The `spancat` component uses a `Logistic` layer where the +output class probabilities are independent for each class. However, if you need +to predict at most one true class for a span, then use `spancat_singlelabel`. It +uses a `Softmax` layer and treats the task as a multi-class problem. + Predicted spans will be saved in a [`SpanGroup`](/api/spangroup) on the doc. Individual span scores can be found in `spangroup.attrs["scores"]`. @@ -38,7 +45,7 @@ how the component should be configured. You can override its settings via the [model architectures](/api/architectures) documentation for details on the architectures and their arguments and hyperparameters. -> #### Example +> #### Example (spancat) > > ```python > from spacy.pipeline.spancat import DEFAULT_SPANCAT_MODEL @@ -52,14 +59,33 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("spancat", config=config) > ``` -| Setting | Description | -| -------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ | -| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ | -| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ | -| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ | -| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ | -| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. 
~~Optional[Callable]~~ |
+> #### Example (spancat_singlelabel)
+>
+> ```python
+> from spacy.pipeline.spancat import DEFAULT_SPANCAT_SINGLELABEL_MODEL
+> config = {
+>     "threshold": 0.5,
+>     "spans_key": "labeled_spans",
+>     "model": DEFAULT_SPANCAT_SINGLELABEL_MODEL,
+>     "suggester": {"@misc": "spacy.ngram_suggester.v1", "sizes": [1, 2, 3]},
+>     # Additional spancat_singlelabel parameters
+>     "negative_weight": 0.8,
+>     "allow_overlap": True,
+> }
+> nlp.add_pipe("spancat_singlelabel", config=config)
+> ```
+
+| Setting | Description |
+| --------------------------------------------------- | ----------- |
+| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. Defaults to [`ngram_suggester`](#ngram_suggester). ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
+| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. Defaults to [SpanCategorizer](/api/architectures#SpanCategorizer). ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Meant to be used in combination with the multi-label `spancat` component with a `Logistic` scoring layer. Defaults to `0.5`. ~~float~~ |
+| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. Meant to be used together with the `spancat` component and defaults to `1` with `spancat_singlelabel`. ~~Optional[int]~~ |
+| `scorer` | The scoring method. Defaults to [`Scorer.score_spans`](/api/scorer#score_spans) for `Doc.spans[spans_key]` with overlapping spans allowed. ~~Optional[Callable]~~ |
+| `add_negative_label` 3.5.1 | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
+| `negative_weight` 3.5.1 | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |
+| `allow_overlap` 3.5.1 | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |

 ```python
 %%GITHUB_SPACY/spacy/pipeline/spancat.py
 ```
@@ -71,6 +97,7 @@ architectures and their arguments and hyperparameters.
 >
 > ```python
 > # Construction via add_pipe with default model
+> # Replace 'spancat' with 'spancat_singlelabel' for exclusive classes
 > spancat = nlp.add_pipe("spancat")
 >
 > # Construction via add_pipe with custom model
@@ -86,16 +113,19 @@ Create a new pipeline instance.
 In your application, you would normally use a shortcut for this and
 instantiate the component using its string name and
 [`nlp.add_pipe`](/api/language#create_pipe).
 
-| Name | Description |
-| -------------- | ----------- |
-| `vocab` | The shared vocabulary. ~~Vocab~~ |
-| `model` | A model instance that is given a a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
-| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
-| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
-| _keyword-only_ | |
-| `spans_key` | Key of the [`Doc.spans`](/api/doc#sans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
-| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
-| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| Name | Description |
+| --------------------------------------------------- | ----------- |
+| `vocab` | The shared vocabulary. ~~Vocab~~ |
+| `model` | A model instance that is given a list of documents and `(start, end)` indices representing candidate span offsets. The model predicts a probability for each category for each span. ~~Model[Tuple[List[Doc], Ragged], Floats2d]~~ |
+| `suggester` | A function that [suggests spans](#suggesters). Spans are returned as a ragged array with two integer columns, for the start and end positions. ~~Callable[[Iterable[Doc], Optional[Ops]], Ragged]~~ |
+| `name` | String name of the component instance. Used to add entries to the `losses` during training. ~~str~~ |
+| _keyword-only_ | |
+| `spans_key` | Key of the [`Doc.spans`](/api/doc#spans) dict to save the spans under. During initialization and training, the component will look for spans on the reference document under the same key. Defaults to `"sc"`. ~~str~~ |
+| `threshold` | Minimum probability to consider a prediction positive. Spans with a positive prediction will be saved on the Doc. Defaults to `0.5`. ~~float~~ |
+| `max_positive` | Maximum number of labels to consider positive per span. Defaults to `None`, indicating no limit. ~~Optional[int]~~ |
+| `allow_overlap` 3.5.1 | If `True`, the data is assumed to contain overlapping spans. It is only available when `max_positive` is exactly 1. Defaults to `True`. ~~bool~~ |
+| `add_negative_label` 3.5.1 | Whether to learn to predict a special negative label for each unannotated `Span`. This should be `True` when using a `Softmax` classifier layer and so it's `True` by default for `spancat_singlelabel`. Spans with negative labels and their scores are not stored as annotations. ~~bool~~ |
+| `negative_weight` 3.5.1 | Multiplier for the loss terms. It can be used to downweight the negative samples if there are too many. It is only used when `add_negative_label` is `True`. Defaults to `1.0`. ~~float~~ |

## SpanCategorizer.\_\_call\_\_ {id="call",tag="method"}