From 02e18926c37d9c2e8965356c595fedd692507902 Mon Sep 17 00:00:00 2001
From: Adriane Boyd <adrianeboyd@gmail.com>
Date: Wed, 21 Jul 2021 15:32:37 +0200
Subject: [PATCH] Revert "Backport bugfixes from v3.1.0 to v3.0 (#8739)"
 (#8786)

This reverts commit f94168a41e41dc67a3724a33d0e10a45423085dc.
---
 MANIFEST.in                                   |  1 -
 spacy/about.py                                |  2 +-
 spacy/cli/convert.py                          |  3 +-
 spacy/cli/package.py                          |  2 +-
 spacy/cli/templates/quickstart_training.jinja |  2 +-
 spacy/errors.py                               |  5 --
 spacy/lang/az/__init__.py                     |  5 ++
 spacy/lang/el/lemmatizer.py                   |  2 +-
 spacy/lang/ru/lemmatizer.py                   |  5 +-
 spacy/lang/uk/lemmatizer.py                   |  6 ++-
 spacy/matcher/phrasematcher.pyx               |  2 -
 spacy/ml/models/multi_task.py                 |  3 +-
 spacy/pipeline/entity_linker.py               |  3 +-
 spacy/pipeline/entityruler.py                 | 46 ++++++++++---------
 spacy/pipeline/textcat.py                     |  2 -
 spacy/pipeline/trainable_pipe.pyx             |  3 +-
 spacy/tests/doc/test_doc_api.py               | 40 ++++------------
 spacy/tests/lang/test_initialize.py           | 13 +++---
 spacy/tests/matcher/test_matcher_api.py       |  1 -
 spacy/tests/matcher/test_matcher_logic.py     |  1 -
 spacy/tests/parser/test_ner.py                |  2 +-
 spacy/tests/pipeline/test_entity_linker.py    |  2 -
 spacy/tests/pipeline/test_entity_ruler.py     | 14 ------
 spacy/tests/pipeline/test_pipe_factories.py   | 31 +++++--------
 spacy/tests/pipeline/test_textcat.py          |  6 ---
 spacy/tests/regression/test_issue8216.py      | 34 --------------
 spacy/tests/tokenizer/test_tokenizer.py       |  3 +-
 spacy/tests/training/test_new_example.py      | 21 ---------
 spacy/tokens/doc.pyx                          |  6 +--
 spacy/training/example.pyx                    |  2 +-
 spacy/util.py                                 | 30 +++++++++---
 website/docs/api/textcategorizer.md           |  9 ++--
 website/meta/languages.json                   |  7 +--
 33 files changed, 105 insertions(+), 209 deletions(-)
 delete mode 100644 spacy/tests/regression/test_issue8216.py

diff --git a/MANIFEST.in b/MANIFEST.in
index 99fc174bd..8008b4507 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,4 +8,3 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
-recursive-exclude spacy *.cpp
diff --git a/spacy/about.py b/spacy/about.py
index 123e5ea7c..c351076c5 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.7"
+__version__ = "3.0.6"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"
diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py
index c84aa6431..d13a4fc80 100644
--- a/spacy/cli/convert.py
+++ b/spacy/cli/convert.py
@@ -115,8 +115,7 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        with input_loc.open("r", encoding="utf-8") as infile:
-            input_data = infile.read()
+        input_data = input_loc.open("r", encoding="utf-8").read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 275476307..eaffde1d7 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja
index c139bcaae..e43c21bbd 100644
--- a/spacy/cli/templates/quickstart_training.jinja
+++ b/spacy/cli/templates/quickstart_training.jinja
@@ -418,7 +418,7 @@ compound = 1.001
 
 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = ${paths.vectors}
+vectors = null
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}
diff --git a/spacy/errors.py b/spacy/errors.py
index de04ca641..e8eccaece 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -518,11 +518,6 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
-    E867 = ("The 'textcat' component requires at least two labels because it "
-            "uses mutually exclusive classes where exactly one label is True "
-            "for each doc. For binary classification tasks, you can use two "
-            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
-            "can use the 'textcat_multilabel' component with one label.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "
diff --git a/spacy/lang/az/__init__.py b/spacy/lang/az/__init__.py
index 2937e2ecf..6a4288d1e 100644
--- a/spacy/lang/az/__init__.py
+++ b/spacy/lang/az/__init__.py
@@ -1,11 +1,16 @@
+from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .stop_words import STOP_WORDS
+from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 
 
 class AzerbaijaniDefaults(Language.Defaults):
+    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
+    token_match = TOKEN_MATCH
+    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Azerbaijani(Language):
diff --git a/spacy/lang/el/lemmatizer.py b/spacy/lang/el/lemmatizer.py
index 631848af4..a049601dc 100644
--- a/spacy/lang/el/lemmatizer.py
+++ b/spacy/lang/el/lemmatizer.py
@@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
             forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        forms = list(dict.fromkeys(forms))
+        forms = list(set(forms))
         self.cache[cache_key] = forms
         return forms
diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py
index 5a49a4e00..63aa94a36 100644
--- a/spacy/lang/ru/lemmatizer.py
+++ b/spacy/lang/ru/lemmatizer.py
@@ -12,6 +12,7 @@ PUNCT_RULES = {"«": '"', "»": '"'}
 
 
 class RussianLemmatizer(Lemmatizer):
+    _morph = None
 
     def __init__(
         self,
@@ -30,8 +31,8 @@ class RussianLemmatizer(Lemmatizer):
                     "The Russian lemmatizer mode 'pymorphy2' requires the "
                     "pymorphy2 library. Install it with: pip install pymorphy2"
                 ) from None
-            if getattr(self, "_morph", None) is None:
-                self._morph = MorphAnalyzer()
+            if RussianLemmatizer._morph is None:
+                RussianLemmatizer._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:
diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py
index 1fb030e06..e1fdf39fc 100644
--- a/spacy/lang/uk/lemmatizer.py
+++ b/spacy/lang/uk/lemmatizer.py
@@ -7,6 +7,8 @@ from ...vocab import Vocab
 
 
 class UkrainianLemmatizer(RussianLemmatizer):
+    _morph = None
+
     def __init__(
         self,
         vocab: Vocab,
@@ -25,6 +27,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                     "pymorphy2 library and dictionaries. Install them with: "
                     "pip install pymorphy2 pymorphy2-dicts-uk"
                 ) from None
-            if getattr(self, "_morph", None) is None:
-                self._morph = MorphAnalyzer(lang="uk")
+            if UkrainianLemmatizer._morph is None:
+                UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index d8486b84b..e5ff2202c 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -50,8 +50,6 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
             self.attr = attr
         else:
-            if attr is None:
-                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"
diff --git a/spacy/ml/models/multi_task.py b/spacy/ml/models/multi_task.py
index 97bef2d0e..d4d2d638b 100644
--- a/spacy/ml/models/multi_task.py
+++ b/spacy/ml/models/multi_task.py
@@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
 
-from ...util import registry, OOV_RANK
+from ...util import registry
 from ...errors import Errors
 from ...attrs import ID
 
@@ -70,7 +70,6 @@ def get_vectors_loss(ops, docs, prediction, distance):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
-    target[ids == OOV_RANK] = 0
     d_target, loss = distance(prediction, target)
     return loss, d_target
 
diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py
index 243ad9094..a03b6b384 100644
--- a/spacy/pipeline/entity_linker.py
+++ b/spacy/pipeline/entity_linker.py
@@ -481,8 +481,7 @@ class EntityLinker(TrainablePipe):
 
         def load_model(p):
             try:
-                with p.open("rb") as infile:
-                    self.model.from_bytes(infile.read())
+                self.model.from_bytes(p.open("rb").read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
 
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 761ff12bf..a74d2f303 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -3,7 +3,6 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
 from collections import defaultdict
 from pathlib import Path
 import srsly
-import warnings
 
 from .pipe import Pipe
 from ..training import Example
@@ -103,12 +102,17 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
-        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        self.phrase_matcher_attr = phrase_matcher_attr
-        self.phrase_matcher = PhraseMatcher(
-            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-        )
+        if phrase_matcher_attr is not None:
+            if phrase_matcher_attr.upper() == "TEXT":
+                phrase_matcher_attr = "ORTH"
+            self.phrase_matcher_attr = phrase_matcher_attr
+            self.phrase_matcher = PhraseMatcher(
+                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+            )
+        else:
+            self.phrase_matcher_attr = None
+            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -142,9 +146,7 @@ class EntityRuler(Pipe):
 
     def match(self, doc: Doc):
         self._require_patterns()
-        with warnings.catch_warnings():
-            warnings.filterwarnings("ignore", message="\\[W036")
-            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
         )
@@ -279,7 +281,7 @@ class EntityRuler(Pipe):
                     current_index = i
                     break
             subsequent_pipes = [
-                pipe for pipe in self.nlp.pipe_names[current_index :]
+                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
             ]
         except ValueError:
             subsequent_pipes = []
@@ -315,22 +317,20 @@ class EntityRuler(Pipe):
                 pattern = entry["pattern"]
                 if isinstance(pattern, Doc):
                     self.phrase_patterns[label].append(pattern)
-                    self.phrase_matcher.add(label, [pattern])
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
-                    self.matcher.add(label, [pattern])
                 else:
                     raise ValueError(Errors.E097.format(pattern=pattern))
+            for label, patterns in self.token_patterns.items():
+                self.matcher.add(label, patterns)
+            for label, patterns in self.phrase_patterns.items():
+                self.phrase_matcher.add(label, patterns)
 
     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
-        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
-        self.phrase_matcher = PhraseMatcher(
-            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
-        )
 
     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
@@ -381,9 +381,10 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab, attr=self.phrase_matcher_attr
-            )
+            if self.phrase_matcher_attr is not None:
+                self.phrase_matcher = PhraseMatcher(
+                    self.nlp.vocab, attr=self.phrase_matcher_attr
+                )
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
         else:
             self.add_patterns(cfg)
@@ -434,9 +435,10 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
 
-            self.phrase_matcher = PhraseMatcher(
-                self.nlp.vocab, attr=self.phrase_matcher_attr
-            )
+            if self.phrase_matcher_attr is not None:
+                self.phrase_matcher = PhraseMatcher(
+                    self.nlp.vocab, attr=self.phrase_matcher_attr
+                )
             from_disk(path, deserializers_patterns, {})
         return self
 
diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py
index 8edd99361..1d652a483 100644
--- a/spacy/pipeline/textcat.py
+++ b/spacy/pipeline/textcat.py
@@ -332,8 +332,6 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
-        if len(self.labels) < 2:
-            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx
index fe51f38e5..64e33f800 100644
--- a/spacy/pipeline/trainable_pipe.pyx
+++ b/spacy/pipeline/trainable_pipe.pyx
@@ -324,8 +324,7 @@ cdef class TrainablePipe(Pipe):
 
         def load_model(p):
             try:
-                with open(p, "rb") as mfile:
-                    self.model.from_bytes(mfile.read())
+                self.model.from_bytes(p.open("rb").read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None
 
diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py
index 3dafb6956..3aae063d3 100644
--- a/spacy/tests/doc/test_doc_api.py
+++ b/spacy/tests/doc/test_doc_api.py
@@ -351,25 +351,17 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = [
-        "Merging the docs is fun.",
-        "",
-        "They don't think alike. ",
-        "Another doc.",
-    ]
+    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    en_docs[3].spans["group"] = [en_docs[3][0:1]]
-    span_group_texts = sorted(
-        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
-    )
+    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
-    en_docs[0][2]._.is_ambiguous = True  # docs
-    en_docs[2][3]._.is_ambiguous = True  # think
+    en_docs[0][2]._.is_ambiguous = True # docs
+    en_docs[2][3]._.is_ambiguous = True # think
     assert Doc.from_docs([]) is None
     assert de_doc is not Doc.from_docs([de_doc])
     assert str(de_doc) == str(Doc.from_docs([de_doc]))
@@ -379,8 +371,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
-    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
+    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert str(m_doc) == " ".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -392,12 +384,11 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
-    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(m_doc.text) == sum(len(t) for t in en_texts)
-    assert m_doc.text == "".join(en_texts_without_empty)
+    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
+    assert str(m_doc) == "".join(en_texts)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -406,12 +397,11 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
-    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
+    assert str(m_doc) == " ".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -424,16 +414,6 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
 
-    # empty but set spans keys are preserved
-    en_docs = [en_tokenizer(text) for text in en_texts]
-    m_doc = Doc.from_docs(en_docs)
-    assert "group" not in m_doc.spans
-    for doc in en_docs:
-        doc.spans["group"] = []
-    m_doc = Doc.from_docs(en_docs)
-    assert "group" in m_doc.spans
-    assert len(m_doc.spans["group"]) == 0
-
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]
diff --git a/spacy/tests/lang/test_initialize.py b/spacy/tests/lang/test_initialize.py
index 36f4a75e0..46f1f2bd1 100644
--- a/spacy/tests/lang/test_initialize.py
+++ b/spacy/tests/lang/test_initialize.py
@@ -4,13 +4,12 @@ from spacy.util import get_lang_class
 
 # fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ko, th, vi, zh
-LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
-             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
-             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
-             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
-             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
-             "tr", "tt", "uk", "ur", "xx", "yo"]
+# excluded: ja, ru, th, uk, vi, zh
+LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
+             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
+             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
+             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
+             "yo"]
 # fmt: on
 
 
diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
index e0f655bbe..d3772a931 100644
--- a/spacy/tests/matcher/test_matcher_api.py
+++ b/spacy/tests/matcher/test_matcher_api.py
@@ -481,7 +481,6 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     assert len(matches) == 1
 
 
-@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)
diff --git a/spacy/tests/matcher/test_matcher_logic.py b/spacy/tests/matcher/test_matcher_logic.py
index 36708edd0..9f575fe05 100644
--- a/spacy/tests/matcher/test_matcher_logic.py
+++ b/spacy/tests/matcher/test_matcher_logic.py
@@ -180,7 +180,6 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
     assert texts == ["zero", "one", "two"]
 
 
-@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_remove():
     nlp = English()
     matcher = Matcher(nlp.vocab)
diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py
index d7c37fbd1..bebadf7e9 100644
--- a/spacy/tests/parser/test_ner.py
+++ b/spacy/tests/parser/test_ner.py
@@ -252,12 +252,12 @@ def test_ruler_before_ner():
     # 1 : Entity Ruler - should set "this" to B and everything else to empty
     patterns = [{"label": "THING", "pattern": "This"}]
     ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
 
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
     nlp.initialize()
-    ruler.add_patterns(patterns)
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]
diff --git a/spacy/tests/pipeline/test_entity_linker.py b/spacy/tests/pipeline/test_entity_linker.py
index 13c8cb72e..a7f9364e9 100644
--- a/spacy/tests/pipeline/test_entity_linker.py
+++ b/spacy/tests/pipeline/test_entity_linker.py
@@ -324,7 +324,6 @@ def test_append_alias(nlp):
     assert len(mykb.get_alias_candidates("douglas")) == 3
 
 
-@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_append_invalid_alias(nlp):
     """Test that append an alias will throw an error if prior probs are exceeding 1"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -343,7 +342,6 @@ def test_append_invalid_alias(nlp):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
 
-@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
     vector_length = 1
diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py
index dc0ca0301..a382532d2 100644
--- a/spacy/tests/pipeline/test_entity_ruler.py
+++ b/spacy/tests/pipeline/test_entity_ruler.py
@@ -89,20 +89,6 @@ def test_entity_ruler_init_clear(nlp, patterns):
     assert len(ruler.labels) == 0
 
 
-def test_entity_ruler_clear(nlp, patterns):
-    """Test that initialization clears patterns."""
-    ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
-    assert len(ruler.labels) == 4
-    doc = nlp("hello world")
-    assert len(doc.ents) == 1
-    ruler.clear()
-    assert len(ruler.labels) == 0
-    with pytest.warns(UserWarning):
-        doc = nlp("hello world")
-    assert len(doc.ents) == 0
-
-
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)
diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py
index c5cc62661..a7071abfd 100644
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -334,31 +334,24 @@ def test_language_factories_invalid():
 
 
 @pytest.mark.parametrize(
-    "weights,override,expected",
+    "weights,expected",
     [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
         (
             [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
-            {},
             {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
         ),
         (
-            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
-            {},
-            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
+            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
+            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
-        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
-        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
     ],
 )
-def test_language_factories_combine_score_weights(weights, override, expected):
-    result = combine_score_weights(weights, override)
+def test_language_factories_combine_score_weights(weights, expected):
+    result = combine_score_weights(weights)
     assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected
 
@@ -384,17 +377,17 @@ def test_language_factories_scores():
     # Test with custom defaults
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.3
+    config["training"]["score_weights"]["b3"] = 1.0
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
+    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
+    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
     assert score_weights == expected
 
 
diff --git a/spacy/tests/pipeline/test_textcat.py b/spacy/tests/pipeline/test_textcat.py
index c251f3dfd..43dfff147 100644
--- a/spacy/tests/pipeline/test_textcat.py
+++ b/spacy/tests/pipeline/test_textcat.py
@@ -108,12 +108,6 @@ def test_label_types(name):
     textcat.add_label("answer")
     with pytest.raises(ValueError):
         textcat.add_label(9)
-    # textcat requires at least two labels
-    if name == "textcat":
-        with pytest.raises(ValueError):
-            nlp.initialize()
-    else:
-        nlp.initialize()
 
 
 @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])
diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py
deleted file mode 100644
index 528d4b6f9..000000000
--- a/spacy/tests/regression/test_issue8216.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import pytest
-
-from spacy import registry
-from spacy.language import Language
-from spacy.pipeline import EntityRuler
-
-
-@pytest.fixture
-def nlp():
-    return Language()
-
-
-@pytest.fixture
-@registry.misc("entity_ruler_patterns")
-def patterns():
-    return [
-        {"label": "HELLO", "pattern": "hello world"},
-        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
-        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
-        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
-        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
-        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
-    ]
-
-
-def test_entity_ruler_fix8216(nlp, patterns):
-    """Test that patterns don't get added excessively."""
-    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
-    ruler.add_patterns(patterns)
-    pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
-    assert pattern_count > 0
-    ruler.add_patterns([])
-    after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
-    assert after_count == pattern_count
diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py
index c1ba1df36..6cfeaf014 100644
--- a/spacy/tests/tokenizer/test_tokenizer.py
+++ b/spacy/tests/tokenizer/test_tokenizer.py
@@ -84,8 +84,7 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize("file_name", ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
     loc = ensure_path(__file__).parent / file_name
-    with loc.open("r", encoding="utf8") as infile:
-        text = infile.read()
+    text = loc.open("r", encoding="utf8").read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100
diff --git a/spacy/tests/training/test_new_example.py b/spacy/tests/training/test_new_example.py
index 4dd90f416..ba58ea96d 100644
--- a/spacy/tests/training/test_new_example.py
+++ b/spacy/tests/training/test_new_example.py
@@ -182,27 +182,6 @@ def test_Example_from_dict_with_entities(annots):
     assert example.reference[5].ent_type_ == "LOC"
 
 
-def test_Example_from_dict_with_empty_entities():
-    annots = {
-        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
-        "entities": [],
-    }
-    vocab = Vocab()
-    predicted = Doc(vocab, words=annots["words"])
-    example = Example.from_dict(predicted, annots)
-    # entities as empty list sets everything to O
-    assert example.reference.has_annotation("ENT_IOB")
-    assert len(list(example.reference.ents)) == 0
-    assert all(token.ent_iob_ == "O" for token in example.reference)
-    # various unset/missing entities leaves entities unset
-    annots["entities"] = None
-    example = Example.from_dict(predicted, annots)
-    assert not example.reference.has_annotation("ENT_IOB")
-    annots.pop("entities", None)
-    example = Example.from_dict(predicted, annots)
-    assert not example.reference.has_annotation("ENT_IOB")
-
-
 @pytest.mark.parametrize(
     "annots",
     [
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index a6732c7c9..7a50d3d53 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -1141,10 +1141,6 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
-                # if a spans key is in any doc, include it in the merged doc
-                # even if it is empty
-                if key not in concat_spans:
-                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1154,7 +1150,7 @@ cdef class Doc:
                         span.text, # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
diff --git a/spacy/training/example.pyx b/spacy/training/example.pyx
index 1c80dc268..07a83bfec 100644
--- a/spacy/training/example.pyx
+++ b/spacy/training/example.pyx
@@ -416,7 +416,7 @@ def _fix_legacy_dict_data(example_dict):
     token_dict = example_dict.get("token_annotation", {})
     doc_dict = example_dict.get("doc_annotation", {})
     for key, value in example_dict.items():
-        if value is not None:
+        if value:
             if key in ("token_annotation", "doc_annotation"):
                 pass
             elif key == "ids":
diff --git a/spacy/util.py b/spacy/util.py
index 52b48dcdb..928106dbb 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1370,14 +1370,32 @@ def combine_score_weights(
         should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
-    # We divide each weight by the total weight sum.
     # We first need to extract all None/null values for score weights that
     # shouldn't be shown in the table *or* be weighted
-    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
-    weight_sum = sum([v if v else 0.0 for v in result.values()])
-    for key, value in result.items():
-        if value and weight_sum > 0:
-            result[key] = round(value / weight_sum, 2)
+    result = {}
+    all_weights = []
+    for w_dict in weights:
+        filtered_weights = {}
+        for key, value in w_dict.items():
+            value = overrides.get(key, value)
+            if value is None:
+                result[key] = None
+            else:
+                filtered_weights[key] = value
+        all_weights.append(filtered_weights)
+    for w_dict in all_weights:
+        # We need to account for weights that don't sum to 1.0 and normalize
+        # the score weights accordingly, then divide score by the number of
+        # components.
+        total = sum(w_dict.values())
+        for key, value in w_dict.items():
+            if total == 0:
+                weight = 0.0
+            else:
+                weight = round(value / total / len(all_weights), 2)
+            prev_weight = result.get(key, 0.0)
+            prev_weight = 0.0 if prev_weight is None else prev_weight
+            result[key] = prev_weight + weight
     return result
 
 
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index baa30ae01..fdd235b85 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -10,12 +10,11 @@ api_trainable: true
 ---
 
 The text categorizer predicts **categories over a whole document**. and comes in
-two flavors: `textcat` and `textcat_multilabel`. When you need to predict
+two flavours: `textcat` and `textcat_multilabel`. When you need to predict
 exactly one true label per document, use the `textcat` which has mutually
 exclusive labels. If you want to perform multi-label classification and predict
-zero, one or more true labels per document, use the `textcat_multilabel`
-component instead. For a binary classification task, you can use `textcat` with
-**two** labels or `textcat_multilabel` with **one** label.
+zero, one or more labels per document, use the `textcat_multilabel` component
+instead.
 
 Both components are documented on this page.
 
@@ -190,7 +189,7 @@ This method was previously called `begin_training`.
 | _keyword-only_   |                                                                                                                                                                                                                                                                                                                                                                                                            |
 | `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~                                                                                                                                                                                                                                                                                                                                       |
 | `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~                                                                                                                                                                             |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~                                                                                                                                                                                                             |
 
 ## TextCategorizer.predict {#predict tag="method"}
 
diff --git a/website/meta/languages.json b/website/meta/languages.json
index f6ac45b15..e05718047 100644
--- a/website/meta/languages.json
+++ b/website/meta/languages.json
@@ -262,12 +262,7 @@
         },
         {
             "code": "mk",
-            "name": "Macedonian",
-            "models": [
-                "mk_core_news_sm",
-                "mk_core_news_md",
-                "mk_core_news_lg"
-            ]
+            "name": "Macedonian"
         },
         {
             "code": "ml",