From 7e684ad691992e759e71026a11c1ddd77c401f39 Mon Sep 17 00:00:00 2001
From: Denis Bezykornov
Date: Tue, 15 Nov 2022 13:37:25 +0300
Subject: [PATCH 4/8] Update Russian tokenizer exceptions (#11753)
* Fix typos, add a couple of new abbreviations, remove nonbreaking spaces
* Remove space from abbreviation
Co-authored-by: Adriane Boyd
---
spacy/lang/ru/tokenizer_exceptions.py | 18 ++++++++++++------
1 file changed, 12 insertions(+), 6 deletions(-)
diff --git a/spacy/lang/ru/tokenizer_exceptions.py b/spacy/lang/ru/tokenizer_exceptions.py
index f3756e26c..e1889f785 100644
--- a/spacy/lang/ru/tokenizer_exceptions.py
+++ b/spacy/lang/ru/tokenizer_exceptions.py
@@ -61,6 +61,11 @@ for abbr in [
{ORTH: "2к23", NORM: "2023"},
{ORTH: "2к24", NORM: "2024"},
{ORTH: "2к25", NORM: "2025"},
+ {ORTH: "2к26", NORM: "2026"},
+ {ORTH: "2к27", NORM: "2027"},
+ {ORTH: "2к28", NORM: "2028"},
+ {ORTH: "2к29", NORM: "2029"},
+ {ORTH: "2к30", NORM: "2030"},
]:
_exc[abbr[ORTH]] = [abbr]
@@ -268,8 +273,8 @@ for abbr in [
{ORTH: "з-ка", NORM: "заимка"},
{ORTH: "п-к", NORM: "починок"},
{ORTH: "киш.", NORM: "кишлак"},
- {ORTH: "п. ст. ", NORM: "поселок станция"},
- {ORTH: "п. ж/д ст. ", NORM: "поселок при железнодорожной станции"},
+ {ORTH: "п. ст.", NORM: "поселок станция"},
+ {ORTH: "п. ж/д ст.", NORM: "поселок при железнодорожной станции"},
{ORTH: "ж/д бл-ст", NORM: "железнодорожный блокпост"},
{ORTH: "ж/д б-ка", NORM: "железнодорожная будка"},
{ORTH: "ж/д в-ка", NORM: "железнодорожная ветка"},
@@ -280,12 +285,12 @@ for abbr in [
{ORTH: "ж/д п.п.", NORM: "железнодорожный путевой пост"},
{ORTH: "ж/д о.п.", NORM: "железнодорожный остановочный пункт"},
{ORTH: "ж/д рзд.", NORM: "железнодорожный разъезд"},
- {ORTH: "ж/д ст. ", NORM: "железнодорожная станция"},
+ {ORTH: "ж/д ст.", NORM: "железнодорожная станция"},
{ORTH: "м-ко", NORM: "местечко"},
{ORTH: "д.", NORM: "деревня"},
{ORTH: "с.", NORM: "село"},
{ORTH: "сл.", NORM: "слобода"},
- {ORTH: "ст. ", NORM: "станция"},
+ {ORTH: "ст.", NORM: "станция"},
{ORTH: "ст-ца", NORM: "станица"},
{ORTH: "у.", NORM: "улус"},
{ORTH: "х.", NORM: "хутор"},
@@ -388,8 +393,9 @@ for abbr in [
{ORTH: "прим.", NORM: "примечание"},
{ORTH: "прим.ред.", NORM: "примечание редакции"},
{ORTH: "см. также", NORM: "смотри также"},
- {ORTH: "кв.м.", NORM: "квадрантный метр"},
- {ORTH: "м2", NORM: "квадрантный метр"},
+ {ORTH: "см.", NORM: "смотри"},
+ {ORTH: "кв.м.", NORM: "квадратный метр"},
+ {ORTH: "м2", NORM: "квадратный метр"},
{ORTH: "б/у", NORM: "бывший в употреблении"},
{ORTH: "сокр.", NORM: "сокращение"},
{ORTH: "чел.", NORM: "человек"},
From caa9efad5991d574cf2bdc69fabfc6d952d5cba9 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem
Date: Tue, 15 Nov 2022 14:15:00 +0100
Subject: [PATCH 5/8] prevent rewriting an already raw URL (#11810)
---
spacy/cli/project/assets.py | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/spacy/cli/project/assets.py b/spacy/cli/project/assets.py
index 61438d1a8..8f35b2d23 100644
--- a/spacy/cli/project/assets.py
+++ b/spacy/cli/project/assets.py
@@ -189,7 +189,11 @@ def convert_asset_url(url: str) -> str:
RETURNS (str): The converted URL.
"""
# If the asset URL is a regular GitHub URL it's likely a mistake
- if re.match(r"(http(s?)):\/\/github.com", url) and "releases/download" not in url:
+ if (
+ re.match(r"(http(s?)):\/\/github.com", url)
+ and "releases/download" not in url
+ and "/raw/" not in url
+ ):
converted = url.replace("github.com", "raw.githubusercontent.com")
converted = re.sub(r"/(tree|blob)/", "/", converted)
msg.warn(
From c0c54e44bc70ca737b421def1f6ce3c30809a54b Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Wed, 16 Nov 2022 17:44:42 +0900
Subject: [PATCH 6/8] Add equality definition for vectors (#11806)
* Add equality definition for vectors
This re-uses the check from sourcing components.
* Use the equality check
* Format
Co-authored-by: Adriane Boyd
---
spacy/language.py | 8 +-------
spacy/tests/vocab_vectors/test_vectors.py | 20 ++++++++++++++++++++
spacy/vectors.pyx | 9 +++++++++
3 files changed, 30 insertions(+), 7 deletions(-)
diff --git a/spacy/language.py b/spacy/language.py
index 967af1e62..836f3abf9 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -706,13 +706,7 @@ class Language:
# Check source type
if not isinstance(source, Language):
raise ValueError(Errors.E945.format(name=source_name, source=type(source)))
- # Check vectors, with faster checks first
- if (
- self.vocab.vectors.shape != source.vocab.vectors.shape
- or self.vocab.vectors.key2row != source.vocab.vectors.key2row
- or self.vocab.vectors.to_bytes(exclude=["strings"])
- != source.vocab.vectors.to_bytes(exclude=["strings"])
- ):
+ if self.vocab.vectors != source.vocab.vectors:
warnings.warn(Warnings.W113.format(name=source_name))
if source_name not in source.component_names:
raise KeyError(
diff --git a/spacy/tests/vocab_vectors/test_vectors.py b/spacy/tests/vocab_vectors/test_vectors.py
index dd2cfc596..70835816d 100644
--- a/spacy/tests/vocab_vectors/test_vectors.py
+++ b/spacy/tests/vocab_vectors/test_vectors.py
@@ -626,3 +626,23 @@ def test_floret_vectors(floret_vectors_vec_str, floret_vectors_hashvec_str):
OPS.to_numpy(vocab_r[word].vector),
decimal=6,
)
+
+
+def test_equality():
+ vectors1 = Vectors(shape=(10, 10))
+ vectors2 = Vectors(shape=(10, 8))
+
+ assert vectors1 != vectors2
+
+ vectors2 = Vectors(shape=(10, 10))
+ assert vectors1 == vectors2
+
+ vectors1.add("hello", row=2)
+ assert vectors1 != vectors2
+
+ vectors2.add("hello", row=2)
+ assert vectors1 == vectors2
+
+ vectors1.resize((5, 9))
+ vectors2.resize((5, 9))
+ assert vectors1 == vectors2
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 8300220c1..be0f6db09 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -243,6 +243,15 @@ cdef class Vectors:
else:
return key in self.key2row
+ def __eq__(self, other):
+ # Check for equality, with faster checks first
+ return (
+ self.shape == other.shape
+ and self.key2row == other.key2row
+ and self.to_bytes(exclude=["strings"])
+ == other.to_bytes(exclude=["strings"])
+ )
+
def resize(self, shape, inplace=False):
"""Resize the underlying vectors array. If inplace=True, the memory
is reallocated. This may cause other references to the data to become
From 317b6ef99c0e3512466d31a8274f9fe6a2894355 Mon Sep 17 00:00:00 2001
From: Adriane Boyd
Date: Wed, 16 Nov 2022 14:09:10 +0100
Subject: [PATCH 7/8] Update to mypy 0.990 (#11801)
---
requirements.txt | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/requirements.txt b/requirements.txt
index d91a3b3d4..23bfa6f14 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -30,7 +30,7 @@ pytest-timeout>=1.3.0,<2.0.0
mock>=2.0.0,<3.0.0
flake8>=3.8.0,<6.0.0
hypothesis>=3.27.0,<7.0.0
-mypy>=0.980,<0.990; platform_machine != "aarch64" and python_version >= "3.7"
+mypy>=0.990,<0.1000; platform_machine != "aarch64" and python_version >= "3.7"
types-dataclasses>=0.1.3; python_version < "3.7"
types-mock>=0.1.1
types-setuptools>=57.0.0
From 75bb7ad541a94c74127b57ffd6d674841767478c Mon Sep 17 00:00:00 2001
From: Paul O'Leary McCann
Date: Thu, 17 Nov 2022 18:25:01 +0900
Subject: [PATCH 8/8] Check textcat values for validity (#11763)
* Check textcat values for validity
* Fix error numbers
* Clean up vals reference
* Check category value validity through training
The _validate_categories method is called in update, which the multilabel
component inherits from the single-label component.
* Formatting
---
spacy/errors.py | 2 ++
spacy/pipeline/textcat.py | 10 +++++++---
spacy/pipeline/textcat_multilabel.py | 8 +++++++-
spacy/tests/pipeline/test_textcat.py | 24 ++++++++++++++++++++++++
4 files changed, 40 insertions(+), 4 deletions(-)
diff --git a/spacy/errors.py b/spacy/errors.py
index 278e5496a..1d29f0e17 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -544,6 +544,8 @@ class Errors(metaclass=ErrorsWithCodes):
"during training, make sure to include it in 'annotating components'")
# New errors added in v3.x
+ E851 = ("The 'textcat' component labels should only have values of 0 or 1, "
+ "but found value of '{val}'.")
E852 = ("The tar file pulled from the remote attempted an unsafe path "
"traversal.")
E853 = ("Unsupported component factory name '{name}'. The character '.' is "
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 4023c4456..a86eb99d2 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -293,7 +293,7 @@ class TextCategorizer(TrainablePipe):
bp_scores(gradient)
if sgd is not None:
self.finish_update(sgd)
- losses[self.name] += (gradient**2).sum()
+ losses[self.name] += (gradient ** 2).sum()
return losses
def _examples_to_truth(
@@ -327,7 +327,7 @@ class TextCategorizer(TrainablePipe):
not_missing = self.model.ops.asarray(not_missing) # type: ignore
d_scores = scores - truths
d_scores *= not_missing
- mean_square_error = (d_scores**2).mean()
+ mean_square_error = (d_scores ** 2).mean()
return float(mean_square_error), d_scores
def add_label(self, label: str) -> int:
@@ -401,5 +401,9 @@ class TextCategorizer(TrainablePipe):
def _validate_categories(self, examples: Iterable[Example]):
"""Check whether the provided examples all have single-label cats annotations."""
for ex in examples:
- if list(ex.reference.cats.values()).count(1.0) > 1:
+ vals = list(ex.reference.cats.values())
+ if vals.count(1.0) > 1:
raise ValueError(Errors.E895.format(value=ex.reference.cats))
+ for val in vals:
+ if not (val == 1.0 or val == 0.0):
+ raise ValueError(Errors.E851.format(val=val))
diff --git a/spacy/pipeline/textcat_multilabel.py b/spacy/pipeline/textcat_multilabel.py
index eb83d9cb7..ef9bd6557 100644
--- a/spacy/pipeline/textcat_multilabel.py
+++ b/spacy/pipeline/textcat_multilabel.py
@@ -192,6 +192,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
for label in labels:
self.add_label(label)
subbatch = list(islice(get_examples(), 10))
+ self._validate_categories(subbatch)
+
doc_sample = [eg.reference for eg in subbatch]
label_sample, _ = self._examples_to_truth(subbatch)
self._require_labels()
@@ -202,4 +204,8 @@ class MultiLabel_TextCategorizer(TextCategorizer):
def _validate_categories(self, examples: Iterable[Example]):
"""This component allows any type of single- or multi-label annotations.
This method overwrites the more strict one from 'textcat'."""
- pass
+ # check that annotation values are valid
+ for ex in examples:
+ for val in ex.reference.cats.values():
+ if not (val == 1.0 or val == 0.0):
+ raise ValueError(Errors.E851.format(val=val))
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index d359b77db..2eda9deaf 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -360,6 +360,30 @@ def test_label_types(name):
nlp.initialize()
+@pytest.mark.parametrize(
+ "name,get_examples",
+ [
+ ("textcat", make_get_examples_single_label),
+ ("textcat_multilabel", make_get_examples_multi_label),
+ ],
+)
+def test_invalid_label_value(name, get_examples):
+ nlp = Language()
+ textcat = nlp.add_pipe(name)
+ example_getter = get_examples(nlp)
+
+ def invalid_examples():
+ # make one example with an invalid score
+ examples = example_getter()
+ ref = examples[0].reference
+ key = list(ref.cats.keys())[0]
+ ref.cats[key] = 2.0
+ return examples
+
+ with pytest.raises(ValueError):
+ nlp.initialize(get_examples=invalid_examples)
+
+
@pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
def test_no_label(name):
nlp = Language()