Merge remote-tracking branch 'upstream/develop' into feature/coref

2025-08-09 14:44:52 +03:00 · 2021-05-18 17:00:17 +09:00 · 2021-05-18 17:00:17 +09:00 · a33d29441a
commit a33d29441a
parent e303628205 1d59fdbd39
24 changed files with 554 additions and 121 deletions
--- a/licenses/3rd_party_licenses.txt
+++ b/licenses/3rd_party_licenses.txt
@@ -43,8 +43,8 @@ scikit-learn

 * Files: scorer.py

-The following implementation of roc_auc_score() is adapted from
-scikit-learn, which is distributed under the following license:
+The implementation of roc_auc_score() is adapted from scikit-learn, which is
+distributed under the following license:

 New BSD License

@@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
 DAMAGE.
+
+
+pyvi
+----
+
+* Files: lang/vi/__init__.py
+
+The MIT License (MIT)
+Copyright (c) 2016 Viet-Trung Tran
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -112,7 +112,9 @@ def package(
        msg.fail("Invalid pipeline meta.json")
        print("\n".join(errors))
        sys.exit(1)
-    model_name = meta["lang"] + "_" + meta["name"]
+    model_name = meta["name"]
+    if not model_name.startswith(meta['lang'] + "_"):
+        model_name = f"{meta['lang']}_{model_name}"
    model_name_v = model_name + "-" + meta["version"]
    main_path = output_dir / model_name_v
    package_path = main_path / model_name
@@ -294,7 +296,7 @@ def setup_package():

 if __name__ == '__main__':
    setup_package()
-""".strip()
+""".lstrip()


 TEMPLATE_MANIFEST = """
@@ -314,4 +316,4 @@ __version__ = get_model_meta(Path(__file__).parent)['version']

 def load(**overrides):
    return load_model_from_init_py(__file__, **overrides)
-""".strip()
+""".lstrip()
--- a/spacy/default_config.cfg
+++ b/spacy/default_config.cfg
@@ -80,6 +80,8 @@ eval_frequency = 200
 score_weights = {}
 # Names of pipeline components that shouldn't be updated during training
 frozen_components = []
+# Names of pipeline components that should set annotations during training
+annotating_components = []
 # Location in the config where the dev corpus is defined
 dev_corpus = "corpora.dev"
 # Location in the config where the train corpus is defined
--- a/spacy/kb.pxd
+++ b/spacy/kb.pxd
@@ -28,7 +28,7 @@ cdef class Candidate:

 cdef class KnowledgeBase:
    cdef Pool mem
-    cpdef readonly Vocab vocab
+    cdef readonly Vocab vocab
    cdef int64_t entity_vector_length

    # This maps 64bit keys (hash of unique entity string)
--- a/spacy/lang/it/stop_words.py
+++ b/spacy/lang/it/stop_words.py
@@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua
 subito successivamente successivo sue sugl sugli sui sul sull sulla sulle
 sullo suo suoi

-tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta
+tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta
 troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto

 uguali ulteriore ultimo un una uno uomo
--- a/spacy/lang/vi/__init__.py
+++ b/spacy/lang/vi/__init__.py
@@ -1,8 +1,15 @@
+from typing import Any, Dict, Union
+from pathlib import Path
+import re
+import srsly
+import string
+
 from .stop_words import STOP_WORDS
 from .lex_attrs import LEX_ATTRS
 from ...language import Language
 from ...tokens import Doc
 from ...util import DummyTokenizer, registry, load_config_from_str
+from ... import util


 DEFAULT_CONFIG = """
@@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer):

    def __call__(self, text: str) -> Doc:
        if self.use_pyvi:
-            words, spaces = self.ViTokenizer.spacy_tokenize(text)
+            words = self.pyvi_tokenize(text)
+            words, spaces = util.get_words_and_spaces(words, text)
            return Doc(self.vocab, words=words, spaces=spaces)
        else:
-            words = []
-            spaces = []
-            for token in self.tokenizer(text):
-                words.extend(list(token.text))
-                spaces.extend([False] * len(token.text))
-                spaces[-1] = bool(token.whitespace_)
+            words, spaces = util.get_words_and_spaces(text.split(), text)
            return Doc(self.vocab, words=words, spaces=spaces)

+    # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from
+    # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran.
+    # See licenses/3rd_party_licenses.txt
+    def pyvi_sylabelize_with_ws(self, text):
+        """Modified from pyvi to preserve whitespace and skip unicode
+        normalization."""
+        specials = [r"==>", r"->", r"\.\.\.", r">>"]
+        digit = r"\d+([\.,_]\d+)+"
+        email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)"
+        web = r"\w+://[^\s]+"
+        word = r"\w+"
+        non_word = r"[^\w\s]"
+        abbreviations = [
+            r"[A-ZĐ]+\.",
+            r"Tp\.",
+            r"Mr\.",
+            r"Mrs\.",
+            r"Ms\.",
+            r"Dr\.",
+            r"ThS\.",
+        ]
+
+        patterns = []
+        patterns.extend(abbreviations)
+        patterns.extend(specials)
+        patterns.extend([web, email])
+        patterns.extend([digit, non_word, word])
+
+        patterns = r"(\s+|" + "|".join(patterns) + ")"
+        tokens = re.findall(patterns, text, re.UNICODE)
+
+        return [token[0] for token in tokens]
+
+    def pyvi_tokenize(self, text):
+        """Modified from pyvi to preserve text and whitespace."""
+        if len(text) == 0:
+            return []
+        elif text.isspace():
+            return [text]
+        segs = self.pyvi_sylabelize_with_ws(text)
+        words = []
+        preceding_ws = []
+        for i, token in enumerate(segs):
+            if not token.isspace():
+                words.append(token)
+                preceding_ws.append(
+                    "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1]
+                )
+        labels = self.ViTokenizer.ViTokenizer.model.predict(
+            [self.ViTokenizer.ViTokenizer.sent2features(words, False)]
+        )
+        token = words[0]
+        tokens = []
+        for i in range(1, len(labels[0])):
+            if (
+                labels[0][i] == "I_W"
+                and words[i] not in string.punctuation
+                and words[i - 1] not in string.punctuation
+                and not words[i][0].isdigit()
+                and not words[i - 1][0].isdigit()
+                and not (words[i][0].istitle() and not words[i - 1][0].istitle())
+            ):
+                token = token + preceding_ws[i] + words[i]
+            else:
+                tokens.append(token)
+                token = words[i]
+        tokens.append(token)
+        return tokens
+
+    def _get_config(self) -> Dict[str, Any]:
+        return {"use_pyvi": self.use_pyvi}
+
+    def _set_config(self, config: Dict[str, Any] = {}) -> None:
+        self.use_pyvi = config.get("use_pyvi", False)
+
+    def to_bytes(self, **kwargs) -> bytes:
+        serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())}
+        return util.to_bytes(serializers, [])
+
+    def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer":
+        deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))}
+        util.from_bytes(data, deserializers, [])
+        return self
+
+    def to_disk(self, path: Union[str, Path], **kwargs) -> None:
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())}
+        return util.to_disk(path, serializers, [])
+
+    def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer":
+        path = util.ensure_path(path)
+        serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))}
+        util.from_disk(path, serializers, [])
+        return self
+

 class VietnameseDefaults(Language.Defaults):
    config = load_config_from_str(DEFAULT_CONFIG)
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -1074,6 +1074,7 @@ class Language:
        losses: Optional[Dict[str, float]] = None,
        component_cfg: Optional[Dict[str, Dict[str, Any]]] = None,
        exclude: Iterable[str] = SimpleFrozenList(),
+        annotates: Iterable[str] = SimpleFrozenList(),
    ):
        """Update the models in the pipeline.

@@ -1081,10 +1082,13 @@ class Language:
        _: Should not be set - serves to catch backwards-incompatible scripts.
        drop (float): The dropout rate.
        sgd (Optimizer): An optimizer.
-        losses (Dict[str, float]): Dictionary to update with the loss, keyed by component.
+        losses (Dict[str, float]): Dictionary to update with the loss, keyed by
+            component.
        component_cfg (Dict[str, Dict]): Config parameters for specific pipeline
            components, keyed by component name.
        exclude (Iterable[str]): Names of components that shouldn't be updated.
+        annotates (Iterable[str]): Names of components that should set
+            annotations on the predicted examples after updating.
        RETURNS (Dict[str, float]): The updated losses dictionary

        DOCS: https://spacy.io/api/language#update
@@ -1103,15 +1107,16 @@ class Language:
            sgd = self._optimizer
        if component_cfg is None:
            component_cfg = {}
+        pipe_kwargs = {}
        for i, (name, proc) in enumerate(self.pipeline):
            component_cfg.setdefault(name, {})
+            pipe_kwargs[name] = deepcopy(component_cfg[name])
            component_cfg[name].setdefault("drop", drop)
+            pipe_kwargs[name].setdefault("batch_size", self.batch_size)
        for name, proc in self.pipeline:
-            if name in exclude or not hasattr(proc, "update"):
-                continue
-            proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
-        if sgd not in (None, False):
-            for name, proc in self.pipeline:
+            if name not in exclude and hasattr(proc, "update"):
+                proc.update(examples, sgd=None, losses=losses, **component_cfg[name])
+            if sgd not in (None, False):
                if (
                    name not in exclude
                    and hasattr(proc, "is_trainable")
@@ -1119,6 +1124,18 @@ class Language:
                    and proc.model not in (True, False, None)
                ):
                    proc.finish_update(sgd)
+            if name in annotates:
+                for doc, eg in zip(
+                    _pipe(
+                        (eg.predicted for eg in examples),
+                        proc=proc,
+                        name=name,
+                        default_error_handler=self.default_error_handler,
+                        kwargs=pipe_kwargs[name],
+                    ),
+                    examples,
+                ):
+                    eg.predicted = doc
        return losses

    def rehearse(
--- a/spacy/morphology.pxd
+++ b/spacy/morphology.pxd
@@ -1,14 +1,11 @@
 from cymem.cymem cimport Pool
-from preshed.maps cimport PreshMap, PreshMapArray
-from libc.stdint cimport uint64_t
-from murmurhash cimport mrmr
+from preshed.maps cimport PreshMap
 cimport numpy as np
+from libc.stdint cimport uint64_t

-from .structs cimport TokenC, MorphAnalysisC
+from .structs cimport MorphAnalysisC
 from .strings cimport StringStore
-from .typedefs cimport hash_t, attr_t, flags_t
-from .parts_of_speech cimport univ_pos_t
-from . cimport symbols
+from .typedefs cimport attr_t, hash_t


 cdef class Morphology:
@@ -16,14 +13,6 @@ cdef class Morphology:
    cdef readonly StringStore strings
    cdef PreshMap tags # Keyed by hash, value is pointer to tag

-    cdef public object lemmatizer
-    cdef readonly object tag_map
-    cdef readonly object tag_names
-    cdef readonly object reverse_index
-    cdef readonly object _exc
-    cdef readonly PreshMapArray _cache
-    cdef readonly int n_tags
-
    cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except *
    cdef int insert(self, MorphAnalysisC tag) except -1

--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -1,20 +1,11 @@
 # cython: infer_types
-from libc.string cimport memset
-
-import srsly
-from collections import Counter
 import numpy
 import warnings

-from .attrs cimport POS, IS_SPACE
-from .parts_of_speech cimport SPACE
-from .lexeme cimport Lexeme
+from .attrs cimport POS

-from .strings import get_string_id
-from .attrs import LEMMA, intify_attrs
 from .parts_of_speech import IDS as POS_IDS
-from .errors import Errors, Warnings
-from .util import ensure_path
+from .errors import Warnings
 from . import symbols


--- a/spacy/schemas.py
+++ b/spacy/schemas.py
@@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel):
    optimizer: Optimizer = Field(..., title="The optimizer to use")
    logger: Logger = Field(..., title="The logger to track training progress")
    frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training")
+    annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training")
    before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk")
    # fmt: on

--- a/spacy/tests/conftest.py
+++ b/spacy/tests/conftest.py
@@ -286,6 +286,12 @@ def ur_tokenizer():
    return get_lang_class("ur")().tokenizer


+@pytest.fixture(scope="session")
+def vi_tokenizer():
+    pytest.importorskip("pyvi")
+    return get_lang_class("vi")().tokenizer
+
+
 @pytest.fixture(scope="session")
 def yo_tokenizer():
    return get_lang_class("yo")().tokenizer
--- a/spacy/tests/lang/vi/__init__.py
+++ b/spacy/tests/lang/vi/__init__.py
--- a/spacy/tests/lang/vi/test_serialize.py
+++ b/spacy/tests/lang/vi/test_serialize.py
@@ -0,0 +1,33 @@
+from spacy.lang.vi import Vietnamese
+from ...util import make_tempdir
+
+
+def test_vi_tokenizer_serialize(vi_tokenizer):
+    tokenizer_bytes = vi_tokenizer.to_bytes()
+    nlp = Vietnamese()
+    nlp.tokenizer.from_bytes(tokenizer_bytes)
+    assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+    assert nlp.tokenizer.use_pyvi is True
+
+    with make_tempdir() as d:
+        file_path = d / "tokenizer"
+        vi_tokenizer.to_disk(file_path)
+        nlp = Vietnamese()
+        nlp.tokenizer.from_disk(file_path)
+        assert tokenizer_bytes == nlp.tokenizer.to_bytes()
+        assert nlp.tokenizer.use_pyvi is True
+
+    # mode is (de)serialized correctly
+    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
+    nlp_bytes = nlp.to_bytes()
+    nlp_r = Vietnamese()
+    nlp_r.from_bytes(nlp_bytes)
+    assert nlp_bytes == nlp_r.to_bytes()
+    assert nlp_r.tokenizer.use_pyvi == False
+
+    with make_tempdir() as d:
+        nlp.to_disk(d)
+        nlp_r = Vietnamese()
+        nlp_r.from_disk(d)
+        assert nlp_bytes == nlp_r.to_bytes()
+        assert nlp_r.tokenizer.use_pyvi == False
--- a/spacy/tests/lang/vi/test_tokenizer.py
+++ b/spacy/tests/lang/vi/test_tokenizer.py
@@ -0,0 +1,47 @@
+import pytest
+
+from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS
+from spacy.lang.vi import Vietnamese
+
+
+# fmt: off
+TOKENIZER_TESTS = [
+    ("Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn  bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']),
+]
+# fmt: on
+
+
+@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS)
+def test_vi_tokenizer(vi_tokenizer, text, expected_tokens):
+    tokens = [token.text for token in vi_tokenizer(text)]
+    assert tokens == expected_tokens
+
+
+def test_vi_tokenizer_extra_spaces(vi_tokenizer):
+    # note: three spaces after "I"
+    tokens = vi_tokenizer("I   like cheese.")
+    assert tokens[1].orth_ == "  "
+
+
+@pytest.mark.parametrize("text", NAUGHTY_STRINGS)
+def test_vi_tokenizer_naughty_strings(vi_tokenizer, text):
+    tokens = vi_tokenizer(text)
+    assert tokens.text_with_ws == text
+
+
+def test_vi_tokenizer_emptyish_texts(vi_tokenizer):
+    doc = vi_tokenizer("")
+    assert len(doc) == 0
+    doc = vi_tokenizer(" ")
+    assert len(doc) == 1
+    doc = vi_tokenizer("\n\n\n \t\t \n\n\n")
+    assert len(doc) == 1
+
+
+def test_vi_tokenizer_no_pyvi():
+    """Test for whitespace tokenization without pyvi"""
+    nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}})
+    text = "Đây là một văn  bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này"
+    doc = nlp(text)
+    assert [t.text for t in doc if not t.is_space] == text.split()
+    assert doc[4].text == " "
--- a/spacy/tests/pipeline/test_annotates_on_update.py
+++ b/spacy/tests/pipeline/test_annotates_on_update.py
@@ -0,0 +1,113 @@
+from typing import Callable, Iterable, Iterator
+import pytest
+import io
+
+from thinc.api import Config
+from spacy.language import Language
+from spacy.training import Example
+from spacy.training.loop import train
+from spacy.lang.en import English
+from spacy.util import registry, load_model_from_config
+
+
+@pytest.fixture
+def config_str():
+    return """
+    [nlp]
+    lang = "en"
+    pipeline = ["sentencizer","assert_sents"]
+    disabled = []
+    before_creation = null
+    after_creation = null
+    after_pipeline_creation = null
+    batch_size = 1000
+    tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
+
+    [components]
+
+    [components.assert_sents]
+    factory = "assert_sents"
+
+    [components.sentencizer]
+    factory = "sentencizer"
+    punct_chars = null
+
+    [training]
+    dev_corpus = "corpora.dev"
+    train_corpus = "corpora.train"
+    annotating_components = ["sentencizer"]
+    max_steps = 2
+
+    [corpora]
+
+    [corpora.dev]
+    @readers = "unannotated_corpus"
+
+    [corpora.train]
+    @readers = "unannotated_corpus"
+    """
+
+
+def test_annotates_on_update():
+    # The custom component checks for sentence annotation
+    @Language.factory("assert_sents", default_config={})
+    def assert_sents(nlp, name):
+        return AssertSents(name)
+
+    class AssertSents:
+        def __init__(self, name, **cfg):
+            self.name = name
+            pass
+
+        def __call__(self, doc):
+            if not doc.has_annotation("SENT_START"):
+                raise ValueError("No sents")
+            return doc
+
+        def update(self, examples, *, drop=0.0, sgd=None, losses=None):
+            for example in examples:
+                if not example.predicted.has_annotation("SENT_START"):
+                    raise ValueError("No sents")
+            return {}
+
+    nlp = English()
+    nlp.add_pipe("sentencizer")
+    nlp.add_pipe("assert_sents")
+
+    # When the pipeline runs, annotations are set
+    doc = nlp("This is a sentence.")
+
+    examples = []
+    for text in ["a a", "b b", "c c"]:
+        examples.append(Example(nlp.make_doc(text), nlp(text)))
+
+    for example in examples:
+        assert not example.predicted.has_annotation("SENT_START")
+
+    # If updating without setting annotations, assert_sents will raise an error
+    with pytest.raises(ValueError):
+        nlp.update(examples)
+
+    # Updating while setting annotations for the sentencizer succeeds
+    nlp.update(examples, annotates=["sentencizer"])
+
+
+def test_annotating_components_from_config(config_str):
+    @registry.readers("unannotated_corpus")
+    def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]:
+        return UnannotatedCorpus()
+
+    class UnannotatedCorpus:
+        def __call__(self, nlp: Language) -> Iterator[Example]:
+            for text in ["a a", "b b", "c c"]:
+                doc = nlp.make_doc(text)
+                yield Example(doc, doc)
+
+    orig_config = Config().from_str(config_str)
+    nlp = load_model_from_config(orig_config, auto_fill=True, validate=True)
+    assert nlp.config["training"]["annotating_components"] == ["sentencizer"]
+    train(nlp)
+
+    nlp.config["training"]["annotating_components"] = []
+    with pytest.raises(ValueError):
+        train(nlp)
--- a/spacy/tests/pipeline/test_pipe_factories.py
+++ b/spacy/tests/pipeline/test_pipe_factories.py
@@ -334,24 +334,31 @@ def test_language_factories_invalid():


 @pytest.mark.parametrize(
-    "weights,expected",
+    "weights,override,expected",
    [
-        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}),
-        ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}),
+        ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}),
        (
            [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}],
+            {},
            {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17},
        ),
        (
-            [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}],
-            {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25},
+            [{"a": 100, "b": 300}, {"c": 50, "d": 50}],
+            {},
+            {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1},
        ),
-        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}),
-        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}),
+        ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}),
+        ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}),
+        ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}),
    ],
 )
-def test_language_factories_combine_score_weights(weights, expected):
-    result = combine_score_weights(weights)
+def test_language_factories_combine_score_weights(weights, override, expected):
+    result = combine_score_weights(weights, override)
    assert sum(result.values()) in (0.99, 1.0, 0.0)
    assert result == expected

@@ -377,17 +384,17 @@ def test_language_factories_scores():
    # Test with custom defaults
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = 0.0
-    config["training"]["score_weights"]["b3"] = 1.0
+    config["training"]["score_weights"]["b3"] = 1.3
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34}
+    expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65}
    assert score_weights == expected
    # Test with null values
    config = nlp.config.copy()
    config["training"]["score_weights"]["a1"] = None
    nlp = English.from_config(config)
    score_weights = nlp.config["training"]["score_weights"]
-    expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35}
+    expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66}
    assert score_weights == expected


--- a/spacy/tests/pipeline/test_pipe_methods.py
+++ b/spacy/tests/pipeline/test_pipe_methods.py
@@ -1,7 +1,9 @@
 import pytest
 from spacy.language import Language
 from spacy.pipeline import TrainablePipe
+from spacy.training import Example
 from spacy.util import SimpleFrozenList, get_arg_names
+from spacy.lang.en import English


 @pytest.fixture
@@ -417,3 +419,41 @@ def test_pipe_methods_initialize():
    assert "test" in nlp.config["initialize"]["components"]
    nlp.remove_pipe("test")
    assert "test" not in nlp.config["initialize"]["components"]
+
+
+def test_update_with_annotates():
+    name = "test_with_annotates"
+    results = {}
+
+    def make_component(name):
+        results[name] = ""
+
+        def component(doc):
+            nonlocal results
+            results[name] += doc.text
+            return doc
+
+        return component
+
+    c1 = Language.component(f"{name}1", func=make_component(f"{name}1"))
+    c2 = Language.component(f"{name}2", func=make_component(f"{name}2"))
+
+    components = set([f"{name}1", f"{name}2"])
+
+    nlp = English()
+    texts = ["a", "bb", "ccc"]
+    examples = []
+    for text in texts:
+        examples.append(Example(nlp.make_doc(text), nlp.make_doc(text)))
+
+    for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]:
+        for key in results:
+            results[key] = ""
+        nlp = English(vocab=nlp.vocab)
+        nlp.add_pipe(f"{name}1")
+        nlp.add_pipe(f"{name}2")
+        nlp.update(examples, annotates=components_to_annotate)
+        for component in components_to_annotate:
+            assert results[component] == "".join(eg.predicted.text for eg in examples)
+        for component in components - set(components_to_annotate):
+            assert results[component] == ""
--- a/spacy/tokenizer.pxd
+++ b/spacy/tokenizer.pxd
@@ -14,7 +14,7 @@ cdef class Tokenizer:
    cdef Pool mem
    cdef PreshMap _cache
    cdef PreshMap _specials
-    cpdef readonly Vocab vocab
+    cdef readonly Vocab vocab

    cdef object _token_match
    cdef object _url_match
--- a/spacy/training/loop.py
+++ b/spacy/training/loop.py
@@ -74,6 +74,8 @@ def train(

    # Components that shouldn't be updated during training
    frozen_components = T["frozen_components"]
+    # Components that should set annotations on update
+    annotating_components = T["annotating_components"]
    # Create iterator, which yields out info after each optimization step.
    training_step_iterator = train_while_improving(
        nlp,
@@ -86,11 +88,17 @@ def train(
        max_steps=T["max_steps"],
        eval_frequency=T["eval_frequency"],
        exclude=frozen_components,
+        annotating_components=annotating_components,
    )
    clean_output_dir(output_path)
    stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n")
    if frozen_components:
        stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n")
+    if annotating_components:
+        stdout.write(
+            msg.info(f"Set annotations on update for: {annotating_components}")
+            + "\n"
+        )
    stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n")
    with nlp.select_pipes(disable=frozen_components):
        log_step, finalize_logger = train_logger(nlp, stdout, stderr)
@@ -142,6 +150,7 @@ def train_while_improving(
    patience: int,
    max_steps: int,
    exclude: List[str],
+    annotating_components: List[str],
 ):
    """Train until an evaluation stops improving. Works as a generator,
    with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`,
@@ -193,7 +202,12 @@ def train_while_improving(
        dropout = next(dropouts)
        for subbatch in subdivide_batch(batch, accumulate_gradient):
            nlp.update(
-                subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude
+                subbatch,
+                drop=dropout,
+                losses=losses,
+                sgd=False,
+                exclude=exclude,
+                annotates=annotating_components,
            )
        # TODO: refactor this so we don't have to run it separately in here
        for name, proc in nlp.pipeline:
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -1369,32 +1369,14 @@ def combine_score_weights(
        should be preserved.
    RETURNS (Dict[str, float]): The combined and normalized weights.
    """
+    # We divide each weight by the total weight sum.
    # We first need to extract all None/null values for score weights that
    # shouldn't be shown in the table *or* be weighted
-    result = {}
-    all_weights = []
-    for w_dict in weights:
-        filtered_weights = {}
-        for key, value in w_dict.items():
-            value = overrides.get(key, value)
-            if value is None:
-                result[key] = None
-            else:
-                filtered_weights[key] = value
-        all_weights.append(filtered_weights)
-    for w_dict in all_weights:
-        # We need to account for weights that don't sum to 1.0 and normalize
-        # the score weights accordingly, then divide score by the number of
-        # components.
-        total = sum(w_dict.values())
-        for key, value in w_dict.items():
-            if total == 0:
-                weight = 0.0
-            else:
-                weight = round(value / total / len(all_weights), 2)
-            prev_weight = result.get(key, 0.0)
-            prev_weight = 0.0 if prev_weight is None else prev_weight
-            result[key] = prev_weight + weight
+    result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()}
+    weight_sum = sum([v if v else 0.0 for v in result.values()])
+    for key, value in result.items():
+        if value and weight_sum > 0:
+            result[key] = round(value / weight_sum, 2)
    return result


--- a/spacy/vocab.pxd
+++ b/spacy/vocab.pxd
@@ -25,12 +25,12 @@ cdef struct _Cached:

 cdef class Vocab:
    cdef Pool mem
-    cpdef readonly StringStore strings
-    cpdef public Morphology morphology
-    cpdef public object vectors
-    cpdef public object _lookups
-    cpdef public object writing_system
-    cpdef public object get_noun_chunks
+    cdef readonly StringStore strings
+    cdef public Morphology morphology
+    cdef public object vectors
+    cdef public object _lookups
+    cdef public object writing_system
+    cdef public object get_noun_chunks
    cdef readonly int length
    cdef public object data_dir
    cdef public object lex_attr_getters
--- a/website/docs/api/data-formats.md
+++ b/website/docs/api/data-formats.md
@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions.
 This section defines settings and controls for the training and evaluation
 process that are used when you run [`spacy train`](/api/cli#train).

-| Name                  | Description                                                                                                                                                                                                                                                                                                                         |
-| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                              |
-| `batcher`             | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~                                                                                                                        |
-| `before_to_disk`      | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
-| `dev_corpus`          | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
-| `dropout`             | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
-| `eval_frequency`      | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
-| `frozen_components`   | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
-| `gpu_allocator`       | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
-| `logger`              | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ |
-| `max_epochs`          | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~                                                                                             |
-| `max_steps`           | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                   |
-| `optimizer`           | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
-| `patience`            | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~                                                                                                                                                                                                        |
-| `score_weights`       | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
-| `seed`                | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
-| `train_corpus`        | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |
+| Name                    | Description                                                                                                                                                                                                                                                                                                                         |
+| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `accumulate_gradient`   | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~                                                                                                                                                                                                                                                              |
+| `batcher`               | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc]], Iterator[List[Doc]]]~~                                                                                                                        |
+| `before_to_disk`        | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~                                                                                           |
+| `dev_corpus`            | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~                                                                                                                                                                                                                                     |
+| `dropout`               | The dropout rate. Defaults to `0.1`. ~~float~~                                                                                                                                                                                                                                                                                      |
+| `eval_frequency`        | How often to evaluate during training (steps). Defaults to `200`. ~~int~~                                                                                                                                                                                                                                                           |
+| `frozen_components`     | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                      |
+| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. ~~List[str]~~                                                                                                                                          |
+| `gpu_allocator`         | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~                                                                                                                                                                                   |
+| `logger`                | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]~~   |
+| `max_epochs`            | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~                                                                                             |
+| `max_steps`             | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~                                                                                                                                                                                                                   |
+| `optimizer`             | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~                                                                                                                                             |
+| `patience`              | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~                                                                                                                                                                                                        |
+| `score_weights`         | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~                                                                                                                                                       |
+| `seed`                  | The random seed. Defaults to variable `${system.seed}`. ~~int~~                                                                                                                                                                                                                                                                     |
+| `train_corpus`          | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~                                                                                                                                                                                                                                 |

 ### pretraining {#config-pretraining tag="section,optional"}

--- a/website/docs/api/transformer.md
+++ b/website/docs/api/transformer.md
@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients.
 > losses = trf.update(examples, sgd=optimizer)
 > ```

-| Name              | Description                                                                                                                                                                      |
-| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `examples`        | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
-| _keyword-only_    |                                                                                                                                                                                  |
-| `drop`            | The dropout rate. ~~float~~                                                                                                                                                      |
-| `sgd`             | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                    |
-| `losses`          | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                         |
-| **RETURNS**       | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                            |
+| Name           | Description                                                                                                                                                                      |
+| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `examples`     | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ |
+| _keyword-only_ |                                                                                                                                                                                  |
+| `drop`         | The dropout rate. ~~float~~                                                                                                                                                      |
+| `sgd`          | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~                                                                    |
+| `losses`       | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~                                                         |
+| **RETURNS**    | The updated `losses` dictionary. ~~Dict[str, float]~~                                                                                                                            |

 ## Transformer.create_optimizer {#create_optimizer tag="method"}

@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the
 depending on the sentence lengths. However, it does provide the transformer with
 more meaningful windows to attend over.

+To set sentence boundaries with the `sentencizer` during training, add a
+`sentencizer` to the beginning of the pipeline and include it in
+[`[training.annotating_components]`](/usage/training#annotating-components) to
+have it set the sentence boundaries before the `transformer` component runs.
+
 ### strided_spans.v1 {#strided_spans tag="registered function"}

 > #### Example config
--- a/website/docs/usage/training.md
+++ b/website/docs/usage/training.md
@ -414,11 +414,11 @@ as-is. They are also excluded when calling
 > #### Note on frozen components
 >
 > Even though frozen components are not **updated** during training, they will
-> still **run** during training and evaluation. This is very important, because
-> they may still impact your model's performance – for instance, a sentence
-> boundary detector can impact what the parser or entity recognizer considers a
-> valid parse. So the evaluation results should always reflect what your
-> pipeline will produce at runtime.
+> still **run** during evaluation. This is very important, because they may
+> still impact your model's performance – for instance, a sentence boundary
+> detector can impact what the parser or entity recognizer considers a valid
+> parse. So the evaluation results should always reflect what your pipeline will
+> produce at runtime.

 ```ini
 [nlp]
@ -455,6 +455,64 @@ replace_listeners = ["model.tok2vec"]

 </Infobox>

+### Using predictions from preceding components {#annotating-components new="3.1"}
+
+By default, components are updated in isolation during training, which means
+that they don't see the predictions of any earlier components in the pipeline. A
+component receives [`Example.predicted`](/api/example) as input and compares its
+predictions to [`Example.reference`](/api/example) without saving its
+annotations in the `predicted` doc.
+
+If certain components should instead **set their annotations** during training,
+use the setting `annotating_components` in the `[training]` block to specify a
+list of components. For example, the feature `DEP` from the parser could be used
+as a tagger feature by including `DEP` in the tok2vec `attrs` and including
+`parser` in `annotating_components`:
+
+```ini
+### config.cfg (excerpt) {highlight="7,12"}
+[nlp]
+pipeline = ["parser", "tagger"]
+
+[components.tagger.model.tok2vec.embed]
+@architectures = "spacy.MultiHashEmbed.v1"
+width = ${components.tagger.model.tok2vec.encode.width}
+attrs = ["NORM","DEP"]
+rows = [5000,2500]
+include_static_vectors = false
+
+[training]
+annotating_components = ["parser"]
+```
+
+Any component in the pipeline can be included as an annotating component,
+including frozen components. Frozen components can set annotations during
+training just as they would set annotations during evaluation or when the final
+pipeline is run. The config excerpt below shows how a frozen `ner` component and
+a `sentencizer` can provide the required `doc.ents` and `doc.sents` for the
+entity linker during training:
+
+```ini
+### config.cfg (excerpt)
+[nlp]
+pipeline = ["sentencizer", "ner", "entity_linker"]
+
+[components.ner]
+source = "en_core_web_sm"
+
+[training]
+frozen_components = ["ner"]
+annotating_components = ["sentencizer", "ner"]
+```
+
+<Infobox variant="warning" title="Training speed with annotating components" id="annotating-components-speed">
+
+Be aware that non-frozen annotating components with statistical models will
+**run twice** on each batch, once to update the model and once to apply the
+now-updated model to the predicted docs.
+
+</Infobox>
+
 ### Using registered functions {#config-functions}

 The training configuration defined in the config file doesn't have to only