Merge pull request #8787 from adrianeboyd/chore/backport-v3.0.7

Backport bug fixes to v3.0.x
Commit 034ac0acf4 by Adriane Boyd, 2021-07-21 16:53:50 +02:00, committed via GitHub
33 changed files with 209 additions and 105 deletions

View File

@@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json
 recursive-include spacy/lang *.json.gz
 recursive-include spacy/cli *.json *.yml
 recursive-include licenses *
+recursive-exclude spacy *.cpp

View File

@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.6"
+__version__ = "3.0.7"
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
 __projects__ = "https://github.com/explosion/projects"

View File

@@ -115,7 +115,8 @@ def convert(
     ner_map = srsly.read_json(ner_map) if ner_map is not None else None
     doc_files = []
     for input_loc in walk_directory(Path(input_path), converter):
-        input_data = input_loc.open("r", encoding="utf-8").read()
+        with input_loc.open("r", encoding="utf-8") as infile:
+            input_data = infile.read()
         # Use converter function to convert data
         func = CONVERTERS[converter]
         docs = func(

View File

@@ -18,7 +18,7 @@ def package_cli(
     output_dir: Path = Arg(..., help="Output parent directory", exists=True, file_okay=False),
     code_paths: str = Opt("", "--code", "-c", help="Comma-separated paths to Python file with additional code (registered functions) to be included in the package"),
     meta_path: Optional[Path] = Opt(None, "--meta-path", "--meta", "-m", help="Path to meta.json", exists=True, dir_okay=False),
-    create_meta: bool = Opt(False, "--create-meta", "-c", "-C", help="Create meta.json, even if one exists"),
+    create_meta: bool = Opt(False, "--create-meta", "-C", help="Create meta.json, even if one exists"),
     name: Optional[str] = Opt(None, "--name", "-n", help="Package name to override meta"),
     version: Optional[str] = Opt(None, "--version", "-v", help="Package version to override meta"),
     build: str = Opt("sdist", "--build", "-b", help="Comma-separated formats to build: sdist and/or wheel, or none."),
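The duplicate "-c" shorthand clashed with the "--code" option, so "-C" is now the only short flag for "--create-meta". A hedged command-line sketch using the flags visible in this hunk (directory and file names are placeholders, not taken from this PR):

    python -m spacy package ./my_pipeline ./packages --code ./custom_functions.py --create-meta --build sdist,wheel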

View File

@@ -418,7 +418,8 @@ compound = 1.001
 
 [initialize]
 {% if use_transformer or optimize == "efficiency" or not word_vectors -%}
-vectors = null
+vectors = ${paths.vectors}
 {% else -%}
 vectors = "{{ word_vectors }}"
 {% endif -%}

View File

@@ -518,6 +518,11 @@ class Errors:
     E202 = ("Unsupported alignment mode '{mode}'. Supported modes: {modes}.")
 
     # New errors added in v3.x
+    E867 = ("The 'textcat' component requires at least two labels because it "
+            "uses mutually exclusive classes where exactly one label is True "
+            "for each doc. For binary classification tasks, you can use two "
+            "labels with 'textcat' (LABEL / NOT_LABEL) or alternatively, you "
+            "can use the 'textcat_multilabel' component with one label.")
     E870 = ("Could not serialize the DocBin because it is too large. Consider "
             "splitting up your documents into several doc bins and serializing "
             "each separately. spacy.Corpus.v1 will search recursively for all "

View File

@@ -1,16 +1,11 @@
-from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS, TOKEN_MATCH
 from .stop_words import STOP_WORDS
-from .syntax_iterators import SYNTAX_ITERATORS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 
 
 class AzerbaijaniDefaults(Language.Defaults):
-    tokenizer_exceptions = TOKENIZER_EXCEPTIONS
     lex_attr_getters = LEX_ATTRS
     stop_words = STOP_WORDS
-    token_match = TOKEN_MATCH
-    syntax_iterators = SYNTAX_ITERATORS
 
 
 class Azerbaijani(Language):

View File

@@ -57,6 +57,6 @@ class GreekLemmatizer(Lemmatizer):
             forms.extend(oov_forms)
         if not forms:
             forms.append(string)
-        forms = list(set(forms))
+        forms = list(dict.fromkeys(forms))
         self.cache[cache_key] = forms
         return forms
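Switching from set() to dict.fromkeys() keeps the first-seen order of the candidate forms, so the deduplicated (and cached) lemma list is deterministic across runs. A minimal, spaCy-independent illustration:

    forms = ["b", "a", "b", "c", "a"]
    print(list(set(forms)))            # order depends on hashing and can vary between runs
    print(list(dict.fromkeys(forms)))  # always ['b', 'a', 'c']: first occurrence wins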

View File

@@ -12,7 +12,6 @@ PUNCT_RULES = {"«": '"', "»": '"'}
 
 
 class RussianLemmatizer(Lemmatizer):
-    _morph = None
 
     def __init__(
         self,
@@ -31,8 +30,8 @@ class RussianLemmatizer(Lemmatizer):
                 "The Russian lemmatizer mode 'pymorphy2' requires the "
                 "pymorphy2 library. Install it with: pip install pymorphy2"
             ) from None
-        if RussianLemmatizer._morph is None:
-            RussianLemmatizer._morph = MorphAnalyzer()
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer()
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
 
     def pymorphy2_lemmatize(self, token: Token) -> List[str]:

View File

@@ -7,8 +7,6 @@ from ...vocab import Vocab
 
 
 class UkrainianLemmatizer(RussianLemmatizer):
-    _morph = None
-
     def __init__(
         self,
         vocab: Vocab,
@@ -27,6 +25,6 @@ class UkrainianLemmatizer(RussianLemmatizer):
                 "pymorphy2 library and dictionaries. Install them with: "
                 "pip install pymorphy2 pymorphy2-dicts-uk"
             ) from None
-        if UkrainianLemmatizer._morph is None:
-            UkrainianLemmatizer._morph = MorphAnalyzer(lang="uk")
+        if getattr(self, "_morph", None) is None:
+            self._morph = MorphAnalyzer(lang="uk")
         super().__init__(vocab, model, name, mode=mode, overwrite=overwrite)
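Both lemmatizers now cache the MorphAnalyzer per instance (via getattr) rather than on the class, so each lemmatizer object builds and owns its own analyzer. A generic sketch of the difference, with object() standing in for MorphAnalyzer (which would require pymorphy2 to be installed):

    class ClassLevelCache:
        _morph = None                              # one slot shared by every instance

        def __init__(self):
            if ClassLevelCache._morph is None:
                ClassLevelCache._morph = object()  # first instance wins, later ones reuse it

    class InstanceLevelCache:
        def __init__(self):
            if getattr(self, "_morph", None) is None:
                self._morph = object()             # each instance creates its own analyzer

    print(ClassLevelCache()._morph is ClassLevelCache()._morph)        # True
    print(InstanceLevelCache()._morph is InstanceLevelCache()._morph)  # False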

View File

@@ -50,6 +50,8 @@ cdef class PhraseMatcher:
         if isinstance(attr, (int, long)):
             self.attr = attr
         else:
+            if attr is None:
+                attr = "ORTH"
             attr = attr.upper()
             if attr == "TEXT":
                 attr = "ORTH"

View File

@@ -3,7 +3,7 @@ from thinc.api import chain, Maxout, LayerNorm, Softmax, Linear, zero_init, Mode
 from thinc.api import MultiSoftmax, list2array
 from thinc.api import to_categorical, CosineDistance, L2Distance
 
-from ...util import registry
+from ...util import registry, OOV_RANK
 from ...errors import Errors
 from ...attrs import ID
 
@@ -70,6 +70,7 @@ def get_vectors_loss(ops, docs, prediction, distance):
     # and look them up all at once. This prevents data copying.
     ids = ops.flatten([doc.to_array(ID).ravel() for doc in docs])
     target = docs[0].vocab.vectors.data[ids]
+    target[ids == OOV_RANK] = 0
     d_target, loss = distance(prediction, target)
     return loss, d_target
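The pretraining vectors objective now zeroes the target rows that belong to out-of-vocabulary tokens (identified by OOV_RANK from spacy.util) instead of training against stale table entries. The masking itself, as a small NumPy sketch with toy shapes and a toy OOV value:

    import numpy

    FAKE_OOV_RANK = 3                                    # stand-in for spaCy's real OOV_RANK constant
    vectors = numpy.arange(12, dtype="f").reshape(4, 3)  # toy vector table, one row per rank
    ids = numpy.asarray([2, 3, 1, 3])                    # one row id per token; rank 3 marks OOV here
    target = vectors[ids]                                # fancy indexing copies the rows
    target[ids == FAKE_OOV_RANK] = 0                     # OOV tokens get an all-zero target
    print(target)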

View File

@@ -481,7 +481,8 @@ class EntityLinker(TrainablePipe):
 
         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with p.open("rb") as infile:
+                    self.model.from_bytes(infile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None

View File

@@ -3,6 +3,7 @@ from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable,
 from collections import defaultdict
 from pathlib import Path
 import srsly
+import warnings
 
 from .pipe import Pipe
 from ..training import Example
@@ -102,17 +103,12 @@ class EntityRuler(Pipe):
         self.overwrite = overwrite_ents
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
+        self._validate = validate
         self.matcher = Matcher(nlp.vocab, validate=validate)
-        if phrase_matcher_attr is not None:
-            if phrase_matcher_attr.upper() == "TEXT":
-                phrase_matcher_attr = "ORTH"
-            self.phrase_matcher_attr = phrase_matcher_attr
-            self.phrase_matcher = PhraseMatcher(
-                nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
-            )
-        else:
-            self.phrase_matcher_attr = None
-            self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate)
+        self.phrase_matcher_attr = phrase_matcher_attr
+        self.phrase_matcher = PhraseMatcher(
+            nlp.vocab, attr=self.phrase_matcher_attr, validate=validate
+        )
         self.ent_id_sep = ent_id_sep
         self._ent_ids = defaultdict(dict)
         if patterns is not None:
@@ -146,6 +142,8 @@ class EntityRuler(Pipe):
 
     def match(self, doc: Doc):
         self._require_patterns()
-        matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
+        with warnings.catch_warnings():
+            warnings.filterwarnings("ignore", message="\\[W036")
+            matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc))
         matches = set(
             [(m_id, start, end) for m_id, start, end in matches if start != end]
@@ -281,7 +279,7 @@ class EntityRuler(Pipe):
                     current_index = i
                     break
             subsequent_pipes = [
-                pipe for pipe in self.nlp.pipe_names[current_index + 1 :]
+                pipe for pipe in self.nlp.pipe_names[current_index:]
             ]
         except ValueError:
             subsequent_pipes = []
@@ -317,20 +315,22 @@ class EntityRuler(Pipe):
                 pattern = entry["pattern"]
                 if isinstance(pattern, Doc):
                     self.phrase_patterns[label].append(pattern)
+                    self.phrase_matcher.add(label, [pattern])
                 elif isinstance(pattern, list):
                     self.token_patterns[label].append(pattern)
+                    self.matcher.add(label, [pattern])
                 else:
                     raise ValueError(Errors.E097.format(pattern=pattern))
-            for label, patterns in self.token_patterns.items():
-                self.matcher.add(label, patterns)
-            for label, patterns in self.phrase_patterns.items():
-                self.phrase_matcher.add(label, patterns)
 
     def clear(self) -> None:
         """Reset all patterns."""
         self.token_patterns = defaultdict(list)
         self.phrase_patterns = defaultdict(list)
         self._ent_ids = defaultdict(dict)
+        self.matcher = Matcher(self.nlp.vocab, validate=self._validate)
+        self.phrase_matcher = PhraseMatcher(
+            self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate
+        )
 
     def _require_patterns(self) -> None:
         """Raise a warning if this component has no patterns defined."""
@@ -381,7 +381,6 @@ class EntityRuler(Pipe):
             self.add_patterns(cfg.get("patterns", cfg))
             self.overwrite = cfg.get("overwrite", False)
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None)
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
@@ -435,7 +434,6 @@ class EntityRuler(Pipe):
             self.phrase_matcher_attr = cfg.get("phrase_matcher_attr")
             self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP)
-            if self.phrase_matcher_attr is not None:
-                self.phrase_matcher = PhraseMatcher(
-                    self.nlp.vocab, attr=self.phrase_matcher_attr
-                )
+            self.phrase_matcher = PhraseMatcher(
+                self.nlp.vocab, attr=self.phrase_matcher_attr
+            )
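Patterns are now added straight to the underlying Matcher and PhraseMatcher as they arrive, and clear() rebuilds both matchers, so clearing the ruler genuinely removes its patterns. A hedged round-trip sketch (assumes spaCy v3.0.7 and a blank English pipeline; the label and phrase are placeholders):

    import spacy

    nlp = spacy.blank("en")
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns([{"label": "ORG", "pattern": "Acme Corp"}])
    print([ent.label_ for ent in nlp("I work at Acme Corp").ents])  # ['ORG']

    ruler.clear()             # now also resets the internal matchers
    print(len(ruler.labels))  # 0; running the pipeline again finds no entities (and warns)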

View File

@@ -332,6 +332,8 @@ class TextCategorizer(TrainablePipe):
         else:
             for label in labels:
                 self.add_label(label)
+        if len(self.labels) < 2:
+            raise ValueError(Errors.E867)
         if positive_label is not None:
             if positive_label not in self.labels:
                 err = Errors.E920.format(pos_label=positive_label, labels=self.labels)
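Because `textcat` assigns exactly one mutually exclusive class per document, initialization now raises E867 when fewer than two labels are defined. A hedged sketch of the two supported binary setups (label names are placeholders):

    import spacy

    # Option 1: exclusive classes, so a binary task needs both labels.
    nlp = spacy.blank("en")
    textcat = nlp.add_pipe("textcat")
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")   # with only one label, initialize() would raise E867
    nlp.initialize()

    # Option 2: a single label with the non-exclusive component.
    nlp_ml = spacy.blank("en")
    textcat_ml = nlp_ml.add_pipe("textcat_multilabel")
    textcat_ml.add_label("POSITIVE")
    nlp_ml.initialize()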

View File

@@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe):
 
         def load_model(p):
             try:
-                self.model.from_bytes(p.open("rb").read())
+                with open(p, "rb") as mfile:
+                    self.model.from_bytes(mfile.read())
             except AttributeError:
                 raise ValueError(Errors.E149) from None

View File

@@ -351,13 +351,21 @@ def test_doc_from_array_morph(en_vocab):
 
 @pytest.mark.usefixtures("clean_underscore")
 def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
-    en_texts = ["Merging the docs is fun.", "", "They don't think alike."]
+    en_texts = [
+        "Merging the docs is fun.",
+        "",
+        "They don't think alike. ",
+        "Another doc.",
+    ]
     en_texts_without_empty = [t for t in en_texts if len(t)]
     de_text = "Wie war die Frage?"
     en_docs = [en_tokenizer(text) for text in en_texts]
     en_docs[0].spans["group"] = [en_docs[0][1:4]]
     en_docs[2].spans["group"] = [en_docs[2][1:4]]
-    span_group_texts = sorted([en_docs[0][1:4].text, en_docs[2][1:4].text])
+    en_docs[3].spans["group"] = [en_docs[3][0:1]]
+    span_group_texts = sorted(
+        [en_docs[0][1:4].text, en_docs[2][1:4].text, en_docs[3][0:1].text]
+    )
     de_doc = de_tokenizer(de_text)
     Token.set_extension("is_ambiguous", default=False)
     en_docs[0][2]._.is_ambiguous = True  # docs
@@ -371,8 +379,8 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     m_doc = Doc.from_docs(en_docs)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -384,11 +392,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert not any([t._.is_ambiguous for t in m_doc[3:8]])
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
     assert len(en_texts_without_empty) == len(list(m_doc.sents))
-    assert len(str(m_doc)) == sum(len(t) for t in en_texts)
-    assert str(m_doc) == "".join(en_texts)
+    assert len(m_doc.text) == sum(len(t) for t in en_texts)
+    assert m_doc.text == "".join(en_texts_without_empty)
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and not bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -397,11 +406,12 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
     assert m_doc[9].idx == think_idx
     assert "group" in m_doc.spans
     assert span_group_texts == sorted([s.text for s in m_doc.spans["group"]])
+    assert bool(m_doc[11].whitespace_)
 
     m_doc = Doc.from_docs(en_docs, attrs=["lemma", "length", "pos"])
-    assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
+    assert len(m_doc.text) > len(en_texts[0]) + len(en_texts[1])
     # space delimiter considered, although spacy attribute was missing
-    assert str(m_doc) == " ".join(en_texts_without_empty)
+    assert m_doc.text == " ".join([t.strip() for t in en_texts_without_empty])
     p_token = m_doc[len(en_docs[0]) - 1]
     assert p_token.text == "." and bool(p_token.whitespace_)
     en_docs_tokens = [t for doc in en_docs for t in doc]
@@ -414,6 +424,16 @@ def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
 
     # can merge empty docs
     doc = Doc.from_docs([en_tokenizer("")] * 10)
+
+    # empty but set spans keys are preserved
+    en_docs = [en_tokenizer(text) for text in en_texts]
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" not in m_doc.spans
+    for doc in en_docs:
+        doc.spans["group"] = []
+    m_doc = Doc.from_docs(en_docs)
+    assert "group" in m_doc.spans
+    assert len(m_doc.spans["group"]) == 0
 
 def test_doc_api_from_docs_ents(en_tokenizer):
     texts = ["Merging the docs is fun.", "They don't think alike."]

View File

@@ -4,12 +4,13 @@ from spacy.util import get_lang_class
 # fmt: off
 # Only include languages with no external dependencies
-# excluded: ja, ru, th, uk, vi, zh
-LANGUAGES = ["af", "ar", "bg", "bn", "ca", "cs", "da", "de", "el", "en", "es",
-             "et", "fa", "fi", "fr", "ga", "he", "hi", "hr", "hu", "id", "is",
-             "it", "kn", "lt", "lv", "nb", "nl", "pl", "pt", "ro", "si", "sk",
-             "sl", "sq", "sr", "sv", "ta", "te", "tl", "tn", "tr", "tt", "ur",
-             "yo"]
+# excluded: ja, ko, th, vi, zh
+LANGUAGES = ["af", "am", "ar", "az", "bg", "bn", "ca", "cs", "da", "de", "el",
+             "en", "es", "et", "eu", "fa", "fi", "fr", "ga", "gu", "he", "hi",
+             "hr", "hu", "hy", "id", "is", "it", "kn", "ky", "lb", "lt", "lv",
+             "mk", "ml", "mr", "nb", "ne", "nl", "pl", "pt", "ro", "ru", "sa",
+             "si", "sk", "sl", "sq", "sr", "sv", "ta", "te", "ti", "tl", "tn",
+             "tr", "tt", "uk", "ur", "xx", "yo"]
 # fmt: on

View File

@@ -481,6 +481,7 @@ def test_matcher_schema_token_attributes(en_vocab, pattern, text):
     assert len(matches) == 1
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_valid_callback(en_vocab):
     """Test that on_match can only be None or callable."""
     matcher = Matcher(en_vocab)

View File

@@ -180,6 +180,7 @@ def test_matcher_sets_return_correct_tokens(en_vocab):
     assert texts == ["zero", "one", "two"]
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_matcher_remove():
     nlp = English()
     matcher = Matcher(nlp.vocab)

View File

@@ -252,12 +252,12 @@ def test_ruler_before_ner():
     # 1 : Entity Ruler - should set "this" to B and everything else to empty
     patterns = [{"label": "THING", "pattern": "This"}]
     ruler = nlp.add_pipe("entity_ruler")
-    ruler.add_patterns(patterns)
 
     # 2: untrained NER - should set everything else to O
     untrained_ner = nlp.add_pipe("ner")
     untrained_ner.add_label("MY_LABEL")
     nlp.initialize()
+    ruler.add_patterns(patterns)
     doc = nlp("This is Antti Korhonen speaking in Finland")
     expected_iobs = ["B", "O", "O", "O", "O", "O", "O"]
     expected_types = ["THING", "", "", "", "", "", ""]

View File

@@ -324,6 +324,7 @@ def test_append_alias(nlp):
     assert len(mykb.get_alias_candidates("douglas")) == 3
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_append_invalid_alias(nlp):
     """Test that append an alias will throw an error if prior probs are exceeding 1"""
     mykb = KnowledgeBase(nlp.vocab, entity_vector_length=1)
@@ -342,6 +343,7 @@ def test_append_invalid_alias(nlp):
         mykb.append_alias(alias="douglas", entity="Q1", prior_prob=0.2)
 
 
+@pytest.mark.filterwarnings("ignore:\\[W036")
 def test_preserving_links_asdoc(nlp):
     """Test that Span.as_doc preserves the existing entity links"""
     vector_length = 1

View File

@@ -89,6 +89,20 @@ def test_entity_ruler_init_clear(nlp, patterns):
     assert len(ruler.labels) == 0
 
 
+def test_entity_ruler_clear(nlp, patterns):
+    """Test that initialization clears patterns."""
+    ruler = nlp.add_pipe("entity_ruler")
+    ruler.add_patterns(patterns)
+    assert len(ruler.labels) == 4
+    doc = nlp("hello world")
+    assert len(doc.ents) == 1
+    ruler.clear()
+    assert len(ruler.labels) == 0
+    with pytest.warns(UserWarning):
+        doc = nlp("hello world")
+    assert len(doc.ents) == 0
+
+
 def test_entity_ruler_existing(nlp, patterns):
     ruler = nlp.add_pipe("entity_ruler")
     ruler.add_patterns(patterns)

View File

@@ -334,24 +334,31 @@ def test_language_factories_invalid():
 
 @pytest.mark.parametrize(
-    "weights,expected",
+    "weights,override,expected",
     [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
         (
             [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
+            {},
             {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
         ),
         (
-            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
-            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
+            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
+            {},
+            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
         ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
+        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
     ],
 )
-def test_language_factories_combine_score_weights(weights, expected):
-    result = combine_score_weights(weights)
+def test_language_factories_combine_score_weights(weights, override, expected):
+    result = combine_score_weights(weights, override)
     assert sum(result.values()) in (0.99, 1.0, 0.0)
     assert result == expected
@@ -377,17 +384,17 @@ def test_language_factories_scores():
     # Test with custom defaults
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.0
+    config["training"]["score_weights"]["b3"] = 1.3
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
     assert score_weights == expected
     # Test with null values
     config = nlp.config.copy()
     config["training"]["score_weights"]["a1"] = None
     nlp = English.from_config(config)
     score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
     assert score_weights == expected

View File

@@ -108,6 +108,12 @@ def test_label_types(name):
     textcat.add_label("answer")
     with pytest.raises(ValueError):
        textcat.add_label(9)
+    # textcat requires at least two labels
+    if name == "textcat":
+        with pytest.raises(ValueError):
+            nlp.initialize()
+    else:
+        nlp.initialize()
 
 
 @pytest.mark.parametrize("name", ["textcat", "textcat_multilabel"])

View File

@@ -0,0 +1,34 @@
+import pytest
+from spacy import registry
+from spacy.language import Language
+from spacy.pipeline import EntityRuler
+
+
+@pytest.fixture
+def nlp():
+    return Language()
+
+
+@pytest.fixture
+@registry.misc("entity_ruler_patterns")
+def patterns():
+    return [
+        {"label": "HELLO", "pattern": "hello world"},
+        {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]},
+        {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]},
+        {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]},
+        {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"},
+        {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"},
+    ]
+
+
+def test_entity_ruler_fix8216(nlp, patterns):
+    """Test that patterns don't get added excessively."""
+    ruler = nlp.add_pipe("entity_ruler", config={"validate": True})
+    ruler.add_patterns(patterns)
+    pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert pattern_count > 0
+    ruler.add_patterns([])
+    after_count = sum(len(mm) for mm in ruler.matcher._patterns.values())
+    assert after_count == pattern_count

View File

@@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n
 @pytest.mark.parametrize("file_name", ["sun.txt"])
 def test_tokenizer_handle_text_from_file(tokenizer, file_name):
     loc = ensure_path(__file__).parent / file_name
-    text = loc.open("r", encoding="utf8").read()
+    with loc.open("r", encoding="utf8") as infile:
+        text = infile.read()
     assert len(text) != 0
     tokens = tokenizer(text)
     assert len(tokens) > 100

View File

@@ -182,6 +182,27 @@ def test_Example_from_dict_with_entities(annots):
     assert example.reference[5].ent_type_ == "LOC"
 
 
+def test_Example_from_dict_with_empty_entities():
+    annots = {
+        "words": ["I", "like", "New", "York", "and", "Berlin", "."],
+        "entities": [],
+    }
+    vocab = Vocab()
+    predicted = Doc(vocab, words=annots["words"])
+    example = Example.from_dict(predicted, annots)
+    # entities as empty list sets everything to O
+    assert example.reference.has_annotation("ENT_IOB")
+    assert len(list(example.reference.ents)) == 0
+    assert all(token.ent_iob_ == "O" for token in example.reference)
+    # various unset/missing entities leaves entities unset
+    annots["entities"] = None
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+    annots.pop("entities", None)
+    example = Example.from_dict(predicted, annots)
+    assert not example.reference.has_annotation("ENT_IOB")
+
+
 @pytest.mark.parametrize(
     "annots",
     [

View File

@@ -1141,6 +1141,10 @@ cdef class Doc:
                 else:
                     warnings.warn(Warnings.W102.format(key=key, value=value))
             for key in doc.spans:
+                # if a spans key is in any doc, include it in the merged doc
+                # even if it is empty
+                if key not in concat_spans:
+                    concat_spans[key] = []
                 for span in doc.spans[key]:
                     concat_spans[key].append((
                         span.start_char + char_offset,
@@ -1150,7 +1154,7 @@ cdef class Doc:
                         span.text,  # included as a check
                     ))
             char_offset += len(doc.text)
-            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space:
+            if len(doc) > 0 and ensure_whitespace and not doc[-1].is_space and not bool(doc[-1].whitespace_):
                 char_offset += 1
 
         arrays = [doc.to_array(attrs) for doc in docs]
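Two behaviors change here: a spans key present on any input doc is carried over to the merged doc even when its group is empty, and no artificial space is appended after a doc whose last token already carries trailing whitespace. A hedged sketch (blank English pipeline; the texts are placeholders):

    import spacy
    from spacy.tokens import Doc

    nlp = spacy.blank("en")
    doc1, doc2 = nlp("First doc. "), nlp("Second doc.")
    doc1.spans["key"] = []                  # empty group, but the key should survive merging
    merged = Doc.from_docs([doc1, doc2])
    print("key" in merged.spans)            # True; empty keys were previously dropped
    print(merged.text)                      # "First doc. Second doc." with a single space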

View File

@@ -416,7 +416,7 @@ def _fix_legacy_dict_data(example_dict):
     token_dict = example_dict.get("token_annotation", {})
     doc_dict = example_dict.get("doc_annotation", {})
     for key, value in example_dict.items():
-        if value:
+        if value is not None:
             if key in ("token_annotation", "doc_annotation"):
                 pass
             elif key == "ids":
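The check moves from truthiness to an explicit None test so that present-but-empty annotations such as "entities": [] are still processed (an empty entity list should mark every token as O rather than leave the annotation unset, which is what the new test above exercises). The distinction in isolation:

    annots = {"entities": [], "links": None}
    kept_old = {k: v for k, v in annots.items() if v}              # drops both keys
    kept_new = {k: v for k, v in annots.items() if v is not None}  # keeps "entities": []
    print(kept_old, kept_new)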

View File

@@ -1370,32 +1370,14 @@ def combine_score_weights(
         should be preserved.
     RETURNS (Dict[str, float]): The combined and normalized weights.
     """
+    # We divide each weight by the total weight sum.
     # We first need to extract all None/null values for score weights that
     # shouldn't be shown in the table *or* be weighted
-    result = {}
-    all_weights = []
-    for w_dict in weights:
-        filtered_weights = {}
-        for key, value in w_dict.items():
-            value = overrides.get(key, value)
-            if value is None:
-                result[key] = None
-            else:
-                filtered_weights[key] = value
-        all_weights.append(filtered_weights)
-    for w_dict in all_weights:
-        # We need to account for weights that don't sum to 1.0 and normalize
-        # the score weights accordingly, then divide score by the number of
-        # components.
-        total = sum(w_dict.values())
-        for key, value in w_dict.items():
-            if total == 0:
-                weight = 0.0
-            else:
-                weight = round(value / total / len(all_weights), 2)
-            prev_weight = result.get(key, 0.0)
-            prev_weight = 0.0 if prev_weight is None else prev_weight
-            result[key] = prev_weight + weight
+    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
+    weight_sum = sum([v if v else 0.0 for v in result.values()])
+    for key, value in result.items():
+        if value and weight_sum > 0:
+            result[key] = round(value / weight_sum, 2)
     return result
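The rewritten helper pools every component's weights into one dict, applies the overrides, and normalizes by the combined total in a single pass (rounded to two decimals) instead of normalizing per component and then summing. A worked sketch of the new math with toy weights and no overrides, matching one of the updated test cases:

    weights = [{"a": 1.0}, {"b": 50}, {"c": 100}]
    merged = {k: v for w in weights for k, v in w.items()}   # {"a": 1.0, "b": 50, "c": 100}
    total = sum(v for v in merged.values() if v)             # 151.0
    print({k: round(v / total, 2) if v else v for k, v in merged.items()})
    # {'a': 0.01, 'b': 0.33, 'c': 0.66}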

View File

@@ -10,11 +10,12 @@ api_trainable: true
 ---
 
 The text categorizer predicts **categories over a whole document**. and comes in
-two flavours: `textcat` and `textcat_multilabel`. When you need to predict
+two flavors: `textcat` and `textcat_multilabel`. When you need to predict
 exactly one true label per document, use the `textcat` which has mutually
 exclusive labels. If you want to perform multi-label classification and predict
-zero, one or more labels per document, use the `textcat_multilabel` component
-instead.
+zero, one or more true labels per document, use the `textcat_multilabel`
+component instead. For a binary classification task, you can use `textcat` with
+**two** labels or `textcat_multilabel` with **one** label.
 
 Both components are documented on this page.
@@ -189,7 +190,7 @@ This method was previously called `begin_training`.
 | _keyword-only_   |  |
 | `nlp`            | The current `nlp` object. Defaults to `None`. ~~Optional[Language]~~ |
 | `labels`         | The label information to add to the component, as provided by the [`label_data`](#label_data) property after initialization. To generate a reusable JSON file from your data, you should run the [`init labels`](/api/cli#init-labels) command. If no labels are provided, the `get_examples` callback is used to extract the labels from the data, which may be a lot slower. ~~Optional[Iterable[str]]~~ |
-| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
+| `positive_label` | The positive label for a binary task with exclusive classes, `None` otherwise and by default. This parameter is only used during scoring. It is not available when using the `textcat_multilabel` component. ~~Optional[str]~~ |
 
 ## TextCategorizer.predict {#predict tag="method"}

View File

@@ -262,7 +262,12 @@
         },
         {
             "code": "mk",
-            "name": "Macedonian"
+            "name": "Macedonian",
+            "models": [
+                "mk_core_news_sm",
+                "mk_core_news_md",
+                "mk_core_news_lg"
+            ]
         },
         {
             "code": "ml",