From 95e3cf576bef4bf44a9736d3564fe87a1c742cc7 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 26 Apr 2021 16:53:21 +0200 Subject: [PATCH 01/21] Optionally append lang for packaged model name (#7417) * Add empty lines at the end of Python files * Only prepend the lang code if it's not there already * Update spacy/cli/package.py * fix whitespace stripping --- spacy/cli/package.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index eaffde1d7..5b8daf048 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -112,7 +112,9 @@ def package( msg.fail("Invalid pipeline meta.json") print("\n".join(errors)) sys.exit(1) - model_name = meta["lang"] + "_" + meta["name"] + model_name = meta["name"] + if not model_name.startswith(meta['lang'] + "_"): + model_name = f"{meta['lang']}_{model_name}" model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v package_path = main_path / model_name @@ -294,7 +296,7 @@ def setup_package(): if __name__ == '__main__': setup_package() -""".strip() +""".lstrip() TEMPLATE_MANIFEST = """ @@ -314,4 +316,4 @@ __version__ = get_model_meta(Path(__file__).parent)['version'] def load(**overrides): return load_model_from_init_py(__file__, **overrides) -""".strip() +""".lstrip() From e0b29f8ef7e4693355e481795af04413ccdf0d55 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 26 Apr 2021 16:53:38 +0200 Subject: [PATCH 02/21] Fix scoring normalization (#7629) * fix scoring normalization * score weights by total sum instead of per component * cleanup * more cleanup --- spacy/tests/pipeline/test_pipe_factories.py | 31 +++++++++++++-------- spacy/util.py | 30 ++++---------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index a7071abfd..c5cc62661 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -334,24 +334,31 @@ def test_language_factories_invalid(): @pytest.mark.parametrize( - "weights,expected", + "weights,override,expected", [ - ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {"a": 0.33, "b": 0.33, "c": 0.33}), - ([{"a": 1.0}, {"b": 50}, {"c": 123}], {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 1.0}, {"c": 1.0}], {}, {"a": 0.33, "b": 0.33, "c": 0.33}), + ([{"a": 1.0}, {"b": 50}, {"c": 100}], {}, {"a": 0.01, "b": 0.33, "c": 0.66}), ( [{"a": 0.7, "b": 0.3}, {"c": 1.0}, {"d": 0.5, "e": 0.5}], + {}, {"a": 0.23, "b": 0.1, "c": 0.33, "d": 0.17, "e": 0.17}, ), ( - [{"a": 100, "b": 400}, {"c": 0.5, "d": 0.5}], - {"a": 0.1, "b": 0.4, "c": 0.25, "d": 0.25}, + [{"a": 100, "b": 300}, {"c": 50, "d": 50}], + {}, + {"a": 0.2, "b": 0.6, "c": 0.1, "d": 0.1}, ), - ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.25, "b": 0.75}), - ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"a": 0.0, "b": 0.0, "c": 0.0}), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {}, {"a": 0.33, "b": 0.67}), + ([{"a": 0.5, "b": 0.0}], {}, {"a": 1.0, "b": 0.0}), + ([{"a": 0.5, "b": 0.5}, {"b": 1.0}], {"a": 0.0}, {"a": 0.0, "b": 1.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {}, {"a": 0.0, "b": 0.0, "c": 0.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 1.0}], {}, {"a": 0.0, "b": 0.0, "c": 1.0}), + ([{"a": 0.0, "b": 0.0}, {"c": 0.0}], {"c": 0.2}, {"a": 0.0, "b": 0.0, "c": 1.0}), + ([{"a": 0.5, "b": 0.5, "c": 1.0, "d": 1.0}], {"a": 0.0, "b": 0.0}, {"a": 0.0, "b": 0.0, "c": 0.5, "d": 0.5}), ], ) -def 
test_language_factories_combine_score_weights(weights, expected): - result = combine_score_weights(weights) +def test_language_factories_combine_score_weights(weights, override, expected): + result = combine_score_weights(weights, override) assert sum(result.values()) in (0.99, 1.0, 0.0) assert result == expected @@ -377,17 +384,17 @@ def test_language_factories_scores(): # Test with custom defaults config = nlp.config.copy() config["training"]["score_weights"]["a1"] = 0.0 - config["training"]["score_weights"]["b3"] = 1.0 + config["training"]["score_weights"]["b3"] = 1.3 nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": 0.0, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.34} + expected = {"a1": 0.0, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.65} assert score_weights == expected # Test with null values config = nlp.config.copy() config["training"]["score_weights"]["a1"] = None nlp = English.from_config(config) score_weights = nlp.config["training"]["score_weights"] - expected = {"a1": None, "a2": 0.5, "b1": 0.03, "b2": 0.12, "b3": 0.35} + expected = {"a1": None, "a2": 0.12, "b1": 0.05, "b2": 0.17, "b3": 0.66} assert score_weights == expected diff --git a/spacy/util.py b/spacy/util.py index 512c6b742..0166bd190 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -1369,32 +1369,14 @@ def combine_score_weights( should be preserved. RETURNS (Dict[str, float]): The combined and normalized weights. """ + # We divide each weight by the total weight sum. # We first need to extract all None/null values for score weights that # shouldn't be shown in the table *or* be weighted - result = {} - all_weights = [] - for w_dict in weights: - filtered_weights = {} - for key, value in w_dict.items(): - value = overrides.get(key, value) - if value is None: - result[key] = None - else: - filtered_weights[key] = value - all_weights.append(filtered_weights) - for w_dict in all_weights: - # We need to account for weights that don't sum to 1.0 and normalize - # the score weights accordingly, then divide score by the number of - # components. 
- total = sum(w_dict.values()) - for key, value in w_dict.items(): - if total == 0: - weight = 0.0 - else: - weight = round(value / total / len(all_weights), 2) - prev_weight = result.get(key, 0.0) - prev_weight = 0.0 if prev_weight is None else prev_weight - result[key] = prev_weight + weight + result = {key: overrides.get(key, value) for w_dict in weights for (key, value) in w_dict.items()} + weight_sum = sum([v if v else 0.0 for v in result.values()]) + for key, value in result.items(): + if value and weight_sum > 0: + result[key] = round(value / weight_sum, 2) return result From c105ed10fd5d9eb924f767911dfc6400e0386505 Mon Sep 17 00:00:00 2001 From: Jacopo Farina Date: Mon, 26 Apr 2021 16:53:43 +0200 Subject: [PATCH 03/21] Remove torino from stop words (#7634) Torino is the proper name of a city and the token has no other meaning --- spacy/lang/it/stop_words.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/it/stop_words.py b/spacy/lang/it/stop_words.py index e97613912..4178ed452 100644 --- a/spacy/lang/it/stop_words.py +++ b/spacy/lang/it/stop_words.py @@ -72,7 +72,7 @@ steste stesti stette stettero stetti stia stiamo stiano stiate sto su sua subito successivamente successivo sue sugl sugli sui sul sull sulla sulle sullo suo suoi -tale tali talvolta tanto te tempo ti titolo torino tra tranne tre trenta +tale tali talvolta tanto te tempo ti titolo tra tranne tre trenta troppo trovato tu tua tue tuo tuoi tutta tuttavia tutte tutti tutto uguali ulteriore ultimo un una uno uomo From 95c08336567788827deabfa3fcc500c03e382a20 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 16:53:53 +0200 Subject: [PATCH 04/21] Add training option to set annotations on update (#7767) * Add training option to set annotations on update Add a `[training]` option called `set_annotations_on_update` to specify a list of components for which the predicted annotations should be set on `example.predicted` immediately after that component has been updated. The predicted annotations can be accessed by later components in the pipeline during the processing of the batch in the same `update` call. 
* Rename to annotates / annotating_components * Add test for `annotating_components` when training from config * Add documentation --- spacy/default_config.cfg | 2 + spacy/language.py | 29 ++++- spacy/schemas.py | 1 + .../pipeline/test_annotates_on_update.py | 113 ++++++++++++++++++ spacy/tests/pipeline/test_pipe_methods.py | 40 +++++++ spacy/training/loop.py | 16 ++- website/docs/api/data-formats.md | 37 +++--- website/docs/api/transformer.md | 21 ++-- website/docs/usage/training.md | 68 ++++++++++- 9 files changed, 289 insertions(+), 38 deletions(-) create mode 100644 spacy/tests/pipeline/test_annotates_on_update.py diff --git a/spacy/default_config.cfg b/spacy/default_config.cfg index 7f092d5f5..ceb7357fc 100644 --- a/spacy/default_config.cfg +++ b/spacy/default_config.cfg @@ -80,6 +80,8 @@ eval_frequency = 200 score_weights = {} # Names of pipeline components that shouldn't be updated during training frozen_components = [] +# Names of pipeline components that should set annotations during training +annotating_components = [] # Location in the config where the dev corpus is defined dev_corpus = "corpora.dev" # Location in the config where the train corpus is defined diff --git a/spacy/language.py b/spacy/language.py index 6f6470533..1a447c11b 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -1074,6 +1074,7 @@ class Language: losses: Optional[Dict[str, float]] = None, component_cfg: Optional[Dict[str, Dict[str, Any]]] = None, exclude: Iterable[str] = SimpleFrozenList(), + annotates: Iterable[str] = SimpleFrozenList(), ): """Update the models in the pipeline. @@ -1081,10 +1082,13 @@ class Language: _: Should not be set - serves to catch backwards-incompatible scripts. drop (float): The dropout rate. sgd (Optimizer): An optimizer. - losses (Dict[str, float]): Dictionary to update with the loss, keyed by component. + losses (Dict[str, float]): Dictionary to update with the loss, keyed by + component. component_cfg (Dict[str, Dict]): Config parameters for specific pipeline components, keyed by component name. exclude (Iterable[str]): Names of components that shouldn't be updated. + annotates (Iterable[str]): Names of components that should set + annotations on the predicted examples after updating. 
RETURNS (Dict[str, float]): The updated losses dictionary DOCS: https://spacy.io/api/language#update @@ -1103,15 +1107,16 @@ class Language: sgd = self._optimizer if component_cfg is None: component_cfg = {} + pipe_kwargs = {} for i, (name, proc) in enumerate(self.pipeline): component_cfg.setdefault(name, {}) + pipe_kwargs[name] = deepcopy(component_cfg[name]) component_cfg[name].setdefault("drop", drop) + pipe_kwargs[name].setdefault("batch_size", self.batch_size) for name, proc in self.pipeline: - if name in exclude or not hasattr(proc, "update"): - continue - proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) - if sgd not in (None, False): - for name, proc in self.pipeline: + if name not in exclude and hasattr(proc, "update"): + proc.update(examples, sgd=None, losses=losses, **component_cfg[name]) + if sgd not in (None, False): if ( name not in exclude and hasattr(proc, "is_trainable") @@ -1119,6 +1124,18 @@ class Language: and proc.model not in (True, False, None) ): proc.finish_update(sgd) + if name in annotates: + for doc, eg in zip( + _pipe( + (eg.predicted for eg in examples), + proc=proc, + name=name, + default_error_handler=self.default_error_handler, + kwargs=pipe_kwargs[name], + ), + examples, + ): + eg.predicted = doc return losses def rehearse( diff --git a/spacy/schemas.py b/spacy/schemas.py index 2f25c785f..92315399d 100644 --- a/spacy/schemas.py +++ b/spacy/schemas.py @@ -313,6 +313,7 @@ class ConfigSchemaTraining(BaseModel): optimizer: Optimizer = Field(..., title="The optimizer to use") logger: Logger = Field(..., title="The logger to track training progress") frozen_components: List[str] = Field(..., title="Pipeline components that shouldn't be updated during training") + annotating_components: List[str] = Field(..., title="Pipeline components that should set annotations during training") before_to_disk: Optional[Callable[["Language"], "Language"]] = Field(..., title="Optional callback to modify nlp object after training, before it's saved to disk") # fmt: on diff --git a/spacy/tests/pipeline/test_annotates_on_update.py b/spacy/tests/pipeline/test_annotates_on_update.py new file mode 100644 index 000000000..b17855d85 --- /dev/null +++ b/spacy/tests/pipeline/test_annotates_on_update.py @@ -0,0 +1,113 @@ +from typing import Callable, Iterable, Iterator +import pytest +import io + +from thinc.api import Config +from spacy.language import Language +from spacy.training import Example +from spacy.training.loop import train +from spacy.lang.en import English +from spacy.util import registry, load_model_from_config + + +@pytest.fixture +def config_str(): + return """ + [nlp] + lang = "en" + pipeline = ["sentencizer","assert_sents"] + disabled = [] + before_creation = null + after_creation = null + after_pipeline_creation = null + batch_size = 1000 + tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"} + + [components] + + [components.assert_sents] + factory = "assert_sents" + + [components.sentencizer] + factory = "sentencizer" + punct_chars = null + + [training] + dev_corpus = "corpora.dev" + train_corpus = "corpora.train" + annotating_components = ["sentencizer"] + max_steps = 2 + + [corpora] + + [corpora.dev] + @readers = "unannotated_corpus" + + [corpora.train] + @readers = "unannotated_corpus" + """ + + +def test_annotates_on_update(): + # The custom component checks for sentence annotation + @Language.factory("assert_sents", default_config={}) + def assert_sents(nlp, name): + return AssertSents(name) + + class AssertSents: + def __init__(self, name, **cfg): 
+ self.name = name + pass + + def __call__(self, doc): + if not doc.has_annotation("SENT_START"): + raise ValueError("No sents") + return doc + + def update(self, examples, *, drop=0.0, sgd=None, losses=None): + for example in examples: + if not example.predicted.has_annotation("SENT_START"): + raise ValueError("No sents") + return {} + + nlp = English() + nlp.add_pipe("sentencizer") + nlp.add_pipe("assert_sents") + + # When the pipeline runs, annotations are set + doc = nlp("This is a sentence.") + + examples = [] + for text in ["a a", "b b", "c c"]: + examples.append(Example(nlp.make_doc(text), nlp(text))) + + for example in examples: + assert not example.predicted.has_annotation("SENT_START") + + # If updating without setting annotations, assert_sents will raise an error + with pytest.raises(ValueError): + nlp.update(examples) + + # Updating while setting annotations for the sentencizer succeeds + nlp.update(examples, annotates=["sentencizer"]) + + +def test_annotating_components_from_config(config_str): + @registry.readers("unannotated_corpus") + def create_unannotated_corpus() -> Callable[[Language], Iterable[Example]]: + return UnannotatedCorpus() + + class UnannotatedCorpus: + def __call__(self, nlp: Language) -> Iterator[Example]: + for text in ["a a", "b b", "c c"]: + doc = nlp.make_doc(text) + yield Example(doc, doc) + + orig_config = Config().from_str(config_str) + nlp = load_model_from_config(orig_config, auto_fill=True, validate=True) + assert nlp.config["training"]["annotating_components"] == ["sentencizer"] + train(nlp) + + nlp.config["training"]["annotating_components"] = [] + with pytest.raises(ValueError): + train(nlp) diff --git a/spacy/tests/pipeline/test_pipe_methods.py b/spacy/tests/pipeline/test_pipe_methods.py index 9af8395a6..0b84db4c0 100644 --- a/spacy/tests/pipeline/test_pipe_methods.py +++ b/spacy/tests/pipeline/test_pipe_methods.py @@ -1,7 +1,9 @@ import pytest from spacy.language import Language from spacy.pipeline import TrainablePipe +from spacy.training import Example from spacy.util import SimpleFrozenList, get_arg_names +from spacy.lang.en import English @pytest.fixture @@ -417,3 +419,41 @@ def test_pipe_methods_initialize(): assert "test" in nlp.config["initialize"]["components"] nlp.remove_pipe("test") assert "test" not in nlp.config["initialize"]["components"] + + +def test_update_with_annotates(): + name = "test_with_annotates" + results = {} + + def make_component(name): + results[name] = "" + + def component(doc): + nonlocal results + results[name] += doc.text + return doc + + return component + + c1 = Language.component(f"{name}1", func=make_component(f"{name}1")) + c2 = Language.component(f"{name}2", func=make_component(f"{name}2")) + + components = set([f"{name}1", f"{name}2"]) + + nlp = English() + texts = ["a", "bb", "ccc"] + examples = [] + for text in texts: + examples.append(Example(nlp.make_doc(text), nlp.make_doc(text))) + + for components_to_annotate in [[], [f"{name}1"], [f"{name}1", f"{name}2"], [f"{name}2", f"{name}1"]]: + for key in results: + results[key] = "" + nlp = English(vocab=nlp.vocab) + nlp.add_pipe(f"{name}1") + nlp.add_pipe(f"{name}2") + nlp.update(examples, annotates=components_to_annotate) + for component in components_to_annotate: + assert results[component] == "".join(eg.predicted.text for eg in examples) + for component in components - set(components_to_annotate): + assert results[component] == "" diff --git a/spacy/training/loop.py b/spacy/training/loop.py index ecfa12fdb..85aa458f0 100644 --- 
a/spacy/training/loop.py +++ b/spacy/training/loop.py @@ -74,6 +74,8 @@ def train( # Components that shouldn't be updated during training frozen_components = T["frozen_components"] + # Components that should set annotations on update + annotating_components = T["annotating_components"] # Create iterator, which yields out info after each optimization step. training_step_iterator = train_while_improving( nlp, @@ -86,11 +88,17 @@ def train( max_steps=T["max_steps"], eval_frequency=T["eval_frequency"], exclude=frozen_components, + annotating_components=annotating_components, ) clean_output_dir(output_path) stdout.write(msg.info(f"Pipeline: {nlp.pipe_names}") + "\n") if frozen_components: stdout.write(msg.info(f"Frozen components: {frozen_components}") + "\n") + if annotating_components: + stdout.write( + msg.info(f"Set annotations on update for: {annotating_components}") + + "\n" + ) stdout.write(msg.info(f"Initial learn rate: {optimizer.learn_rate}") + "\n") with nlp.select_pipes(disable=frozen_components): log_step, finalize_logger = train_logger(nlp, stdout, stderr) @@ -142,6 +150,7 @@ def train_while_improving( patience: int, max_steps: int, exclude: List[str], + annotating_components: List[str], ): """Train until an evaluation stops improving. Works as a generator, with each iteration yielding a tuple `(batch, info, is_best_checkpoint)`, @@ -193,7 +202,12 @@ def train_while_improving( dropout = next(dropouts) for subbatch in subdivide_batch(batch, accumulate_gradient): nlp.update( - subbatch, drop=dropout, losses=losses, sgd=False, exclude=exclude + subbatch, + drop=dropout, + losses=losses, + sgd=False, + exclude=exclude, + annotates=annotating_components, ) # TODO: refactor this so we don't have to run it separately in here for name, proc in nlp.pipeline: diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 0c2a4c9f3..576ab8394 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -182,24 +182,25 @@ single corpus once and then divide it up into `train` and `dev` partitions. This section defines settings and controls for the training and evaluation process that are used when you run [`spacy train`](/api/cli#train). -| Name | Description | -| --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | -| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | -| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | -| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | -| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | -| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. 
~~int~~ | -| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | -| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | -| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | -| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | -| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | -| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | -| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | -| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | -| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | -| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | +| Name | Description | +| ----------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `accumulate_gradient` | Whether to divide the batch up into substeps. Defaults to `1`. ~~int~~ | +| `batcher` | Callable that takes an iterator of [`Doc`](/api/doc) objects and yields batches of `Doc`s. Defaults to [`batch_by_words`](/api/top-level#batch_by_words). ~~Callable[[Iterator[Doc], Iterator[List[Doc]]]]~~ | +| `before_to_disk` | Optional callback to modify `nlp` object right before it is saved to disk during and after training. Can be used to remove or reset config values or disable components. Defaults to `null`. ~~Optional[Callable[[Language], Language]]~~ | +| `dev_corpus` | Dot notation of the config location defining the dev corpus. Defaults to `corpora.dev`. ~~str~~ | +| `dropout` | The dropout rate. Defaults to `0.1`. ~~float~~ | +| `eval_frequency` | How often to evaluate during training (steps). Defaults to `200`. ~~int~~ | +| `frozen_components` | Pipeline component names that are "frozen" and shouldn't be initialized or updated during training. See [here](/usage/training#config-components) for details. Defaults to `[]`. ~~List[str]~~ | +| `annotating_components` | Pipeline component names that should set annotations on the predicted docs during training. See [here](/usage/training#annotating-components) for details. Defaults to `[]`. 
~~List[str]~~ | +| `gpu_allocator` | Library for cupy to route GPU memory allocation to. Can be `"pytorch"` or `"tensorflow"`. Defaults to variable `${system.gpu_allocator}`. ~~str~~ | +| `logger` | Callable that takes the `nlp` and stdout and stderr `IO` objects, sets up the logger, and returns two new callables to log a training step and to finalize the logger. Defaults to [`ConsoleLogger`](/api/top-level#ConsoleLogger). ~~Callable[[Language, IO, IO], [Tuple[Callable[[Dict[str, Any]], None], Callable[[], None]]]]~~ | +| `max_epochs` | Maximum number of epochs to train for. `0` means an unlimited number of epochs. `-1` means that the train corpus should be streamed rather than loaded into memory with no shuffling within the training loop. Defaults to `0`. ~~int~~ | +| `max_steps` | Maximum number of update steps to train for. `0` means an unlimited number of steps. Defaults to `20000`. ~~int~~ | +| `optimizer` | The optimizer. The learning rate schedule and other settings can be configured as part of the optimizer. Defaults to [`Adam`](https://thinc.ai/docs/api-optimizers#adam). ~~Optimizer~~ | +| `patience` | How many steps to continue without improvement in evaluation score. `0` disables early stopping. Defaults to `1600`. ~~int~~ | +| `score_weights` | Score names shown in metrics mapped to their weight towards the final weighted score. See [here](/usage/training#metrics) for details. Defaults to `{}`. ~~Dict[str, float]~~ | +| `seed` | The random seed. Defaults to variable `${system.seed}`. ~~int~~ | +| `train_corpus` | Dot notation of the config location defining the train corpus. Defaults to `corpora.train`. ~~str~~ | ### pretraining {#config-pretraining tag="section,optional"} diff --git a/website/docs/api/transformer.md b/website/docs/api/transformer.md index 5aaa1d23e..4698529a1 100644 --- a/website/docs/api/transformer.md +++ b/website/docs/api/transformer.md @@ -245,14 +245,14 @@ and call the optimizer, while the others simply increment the gradients. > losses = trf.update(examples, sgd=optimizer) > ``` -| Name | Description | -| ----------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | -| _keyword-only_ | | -| `drop` | The dropout rate. ~~float~~ | -| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | -| `losses` | Optional record of the loss during training. Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | -| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | +| Name | Description | +| -------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `examples` | A batch of [`Example`](/api/example) objects. Only the [`Example.predicted`](/api/example#predicted) `Doc` object is used, the reference `Doc` is ignored. ~~Iterable[Example]~~ | +| _keyword-only_ | | +| `drop` | The dropout rate. ~~float~~ | +| `sgd` | An optimizer. Will be created via [`create_optimizer`](#create_optimizer) if not set. ~~Optional[Optimizer]~~ | +| `losses` | Optional record of the loss during training. 
Updated using the component name as the key. ~~Optional[Dict[str, float]]~~ | +| **RETURNS** | The updated `losses` dictionary. ~~Dict[str, float]~~ | ## Transformer.create_optimizer {#create_optimizer tag="method"} @@ -493,6 +493,11 @@ This requires sentence boundaries to be set (e.g. by the depending on the sentence lengths. However, it does provide the transformer with more meaningful windows to attend over. +To set sentence boundaries with the `sentencizer` during training, add a +`sentencizer` to the beginning of the pipeline and include it in +[`[training.annotating_components]`](/usage/training#annotating-components) to +have it set the sentence boundaries before the `transformer` component runs. + ### strided_spans.v1 {#strided_spans tag="registered function"} > #### Example config diff --git a/website/docs/usage/training.md b/website/docs/usage/training.md index 9f929fe19..1b345050c 100644 --- a/website/docs/usage/training.md +++ b/website/docs/usage/training.md @@ -414,11 +414,11 @@ as-is. They are also excluded when calling > #### Note on frozen components > > Even though frozen components are not **updated** during training, they will -> still **run** during training and evaluation. This is very important, because -> they may still impact your model's performance – for instance, a sentence -> boundary detector can impact what the parser or entity recognizer considers a -> valid parse. So the evaluation results should always reflect what your -> pipeline will produce at runtime. +> still **run** during evaluation. This is very important, because they may +> still impact your model's performance – for instance, a sentence boundary +> detector can impact what the parser or entity recognizer considers a valid +> parse. So the evaluation results should always reflect what your pipeline will +> produce at runtime. ```ini [nlp] @@ -455,6 +455,64 @@ replace_listeners = ["model.tok2vec"] +### Using predictions from preceding components {#annotating-components new="3.1"} + +By default, components are updated in isolation during training, which means +that they don't see the predictions of any earlier components in the pipeline. A +component receives [`Example.predicted`](/api/example) as input and compares its +predictions to [`Example.reference`](/api/example) without saving its +annotations in the `predicted` doc. + +Instead, if certain components should **set their annotations** during training, +use the setting `annotating_components` in the `[training]` block to specify a +list of components. For example, the feature `DEP` from the parser could be used +as a tagger feature by including `DEP` in the tok2vec `attrs` and including +`parser` in `annotating_components`: + +```ini +### config.cfg (excerpt) {highlight="7,12"} +[nlp] +pipeline = ["parser", "tagger"] + +[components.tagger.model.tok2vec.embed] +@architectures = "spacy.MultiHashEmbed.v1" +width = ${components.tagger.model.tok2vec.encode.width} +attrs = ["NORM","DEP"] +rows = [5000,2500] +include_static_vectors = false + +[training] +annotating_components = ["parser"] +``` + +Any component in the pipeline can be included as an annotating component, +including frozen components. Frozen components can set annotations during +training just as they would set annotations during evaluation or when the final +pipeline is run. 
The config excerpt below shows how a frozen `ner` component and +a `sentencizer` can provide the required `doc.sents` and `doc.ents` for the +entity linker during training: + +```ini +### config.cfg (excerpt) +[nlp] +pipeline = ["sentencizer", "ner", "entity_linker"] + +[components.ner] +source = "en_core_web_sm" + +[training] +frozen_components = ["ner"] +annotating_components = ["sentencizer", "ner"] +``` + + + +Be aware that non-frozen annotating components with statistical models will +**run twice** on each batch, once to update the model and once to apply the +now-updated model to the predicted docs. + + + ### Using registered functions {#config-functions} The training configuration defined in the config file doesn't have to only From ceee1ecf1735830abe5bfe0e22ac5ecd83e4eebc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 16:54:02 +0200 Subject: [PATCH 05/21] Replace cpdef variables with cdef (#7834) --- spacy/kb.pxd | 2 +- spacy/tokenizer.pxd | 2 +- spacy/vocab.pxd | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/spacy/kb.pxd b/spacy/kb.pxd index 4a71b26a2..a823dbe1e 100644 --- a/spacy/kb.pxd +++ b/spacy/kb.pxd @@ -28,7 +28,7 @@ cdef class Candidate: cdef class KnowledgeBase: cdef Pool mem - cpdef readonly Vocab vocab + cdef readonly Vocab vocab cdef int64_t entity_vector_length # This maps 64bit keys (hash of unique entity string) diff --git a/spacy/tokenizer.pxd b/spacy/tokenizer.pxd index 2a44d7729..719e8e6f5 100644 --- a/spacy/tokenizer.pxd +++ b/spacy/tokenizer.pxd @@ -14,7 +14,7 @@ cdef class Tokenizer: cdef Pool mem cdef PreshMap _cache cdef PreshMap _specials - cpdef readonly Vocab vocab + cdef readonly Vocab vocab cdef object _token_match cdef object _url_match diff --git a/spacy/vocab.pxd b/spacy/vocab.pxd index b5bcf7658..9067476f7 100644 --- a/spacy/vocab.pxd +++ b/spacy/vocab.pxd @@ -25,12 +25,12 @@ cdef struct _Cached: cdef class Vocab: cdef Pool mem - cpdef readonly StringStore strings - cpdef public Morphology morphology - cpdef public object vectors - cpdef public object _lookups - cpdef public object writing_system - cpdef public object get_noun_chunks + cdef readonly StringStore strings + cdef public Morphology morphology + cdef public object vectors + cdef public object _lookups + cdef public object writing_system + cdef public object get_noun_chunks cdef readonly int length cdef public object data_dir cdef public object lex_attr_getters From ae855a46259c6a76a0ab1dc317bb46c111fd1809 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 16:54:23 +0200 Subject: [PATCH 06/21] Clean up Morphology imports and definitions (#7441) * Clean up Morphology imports and definitions * Whitespace formatting --- spacy/morphology.pxd | 19 ++++--------------- spacy/morphology.pyx | 13 ++----------- 2 files changed, 6 insertions(+), 26 deletions(-) diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd index 4fe8f7428..8d449d065 100644 --- a/spacy/morphology.pxd +++ b/spacy/morphology.pxd @@ -1,14 +1,11 @@ from cymem.cymem cimport Pool -from preshed.maps cimport PreshMap, PreshMapArray -from libc.stdint cimport uint64_t -from murmurhash cimport mrmr +from preshed.maps cimport PreshMap cimport numpy as np +from libc.stdint cimport uint64_t -from .structs cimport TokenC, MorphAnalysisC +from .structs cimport MorphAnalysisC from .strings cimport StringStore -from .typedefs cimport hash_t, attr_t, flags_t -from .parts_of_speech cimport univ_pos_t -from . 
cimport symbols +from .typedefs cimport attr_t, hash_t cdef class Morphology: @@ -16,14 +13,6 @@ cdef class Morphology: cdef readonly StringStore strings cdef PreshMap tags # Keyed by hash, value is pointer to tag - cdef public object lemmatizer - cdef readonly object tag_map - cdef readonly object tag_names - cdef readonly object reverse_index - cdef readonly object _exc - cdef readonly PreshMapArray _cache - cdef readonly int n_tags - cdef MorphAnalysisC create_morph_tag(self, field_feature_pairs) except * cdef int insert(self, MorphAnalysisC tag) except -1 diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index e8469223a..c3ffc46a1 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -1,20 +1,11 @@ # cython: infer_types -from libc.string cimport memset - -import srsly -from collections import Counter import numpy import warnings -from .attrs cimport POS, IS_SPACE -from .parts_of_speech cimport SPACE -from .lexeme cimport Lexeme +from .attrs cimport POS -from .strings import get_string_id -from .attrs import LEMMA, intify_attrs from .parts_of_speech import IDS as POS_IDS -from .errors import Errors, Warnings -from .util import ensure_path +from .errors import Warnings from . import symbols From 946a4284bead9bc15aaafc50d69c82a8f253da33 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 26 Apr 2021 17:06:32 +0200 Subject: [PATCH 07/21] Set spacy-legacy to >=3.0.5 (#7897) Set `spacy-legacy` to `>=3.0.5` due to `spacy.StaticVectors.v1` init bug. --- requirements.txt | 2 +- setup.cfg | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 1947dd2de..a8a15a01b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Our libraries -spacy-legacy>=3.0.4,<3.1.0 +spacy-legacy>=3.0.5,<3.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 thinc>=8.0.3,<8.1.0 diff --git a/setup.cfg b/setup.cfg index 9e1293335..2fedd8f5c 100644 --- a/setup.cfg +++ b/setup.cfg @@ -37,7 +37,7 @@ setup_requires = thinc>=8.0.3,<8.1.0 install_requires = # Our libraries - spacy-legacy>=3.0.4,<3.1.0 + spacy-legacy>=3.0.5,<3.1.0 murmurhash>=0.28.0,<1.1.0 cymem>=2.0.2,<2.1.0 preshed>=3.0.2,<3.1.0 From 1d59fdbd39876eb2aac03f66c03529fc6c40b5bc Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 17 May 2021 10:16:20 +0200 Subject: [PATCH 08/21] Update Vietnamese tokenizer (#8099) * Adapt tokenization methods from `pyvi` to preserve text encoding and whitespace * Add serialization support similar to Chinese and Japanese Note: as for Chinese and Japanese, some settings are duplicated in `config.cfg` and `tokenizer/cfg`. 
--- licenses/3rd_party_licenses.txt | 31 ++++++- spacy/lang/vi/__init__.py | 112 ++++++++++++++++++++++++-- spacy/tests/conftest.py | 6 ++ spacy/tests/lang/vi/__init__.py | 0 spacy/tests/lang/vi/test_serialize.py | 33 ++++++++ spacy/tests/lang/vi/test_tokenizer.py | 47 +++++++++++ 6 files changed, 220 insertions(+), 9 deletions(-) create mode 100644 spacy/tests/lang/vi/__init__.py create mode 100644 spacy/tests/lang/vi/test_serialize.py create mode 100644 spacy/tests/lang/vi/test_tokenizer.py diff --git a/licenses/3rd_party_licenses.txt b/licenses/3rd_party_licenses.txt index 3702ad131..7bc3d4547 100644 --- a/licenses/3rd_party_licenses.txt +++ b/licenses/3rd_party_licenses.txt @@ -43,8 +43,8 @@ scikit-learn * Files: scorer.py -The following implementation of roc_auc_score() is adapted from -scikit-learn, which is distributed under the following license: +The implementation of roc_auc_score() is adapted from scikit-learn, which is +distributed under the following license: New BSD License @@ -77,3 +77,30 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +pyvi +---- + +* Files: lang/vi/__init__.py + +The MIT License (MIT) +Copyright (c) 2016 Viet-Trung Tran + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 1328de495..b6d873a13 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -1,8 +1,15 @@ +from typing import Any, Dict, Union +from pathlib import Path +import re +import srsly +import string + from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS from ...language import Language from ...tokens import Doc from ...util import DummyTokenizer, registry, load_config_from_str +from ... 
import util DEFAULT_CONFIG = """ @@ -40,17 +47,108 @@ class VietnameseTokenizer(DummyTokenizer): def __call__(self, text: str) -> Doc: if self.use_pyvi: - words, spaces = self.ViTokenizer.spacy_tokenize(text) + words = self.pyvi_tokenize(text) + words, spaces = util.get_words_and_spaces(words, text) return Doc(self.vocab, words=words, spaces=spaces) else: - words = [] - spaces = [] - for token in self.tokenizer(text): - words.extend(list(token.text)) - spaces.extend([False] * len(token.text)) - spaces[-1] = bool(token.whitespace_) + words, spaces = util.get_words_and_spaces(text.split(), text) return Doc(self.vocab, words=words, spaces=spaces) + # The methods pyvi_sylabelize_with_ws and pyvi_tokenize are adapted from + # pyvi v0.1, MIT License, Copyright (c) 2016 Viet-Trung Tran. + # See licenses/3rd_party_licenses.txt + def pyvi_sylabelize_with_ws(self, text): + """Modified from pyvi to preserve whitespace and skip unicode + normalization.""" + specials = [r"==>", r"->", r"\.\.\.", r">>"] + digit = r"\d+([\.,_]\d+)+" + email = r"([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)" + web = r"\w+://[^\s]+" + word = r"\w+" + non_word = r"[^\w\s]" + abbreviations = [ + r"[A-ZĐ]+\.", + r"Tp\.", + r"Mr\.", + r"Mrs\.", + r"Ms\.", + r"Dr\.", + r"ThS\.", + ] + + patterns = [] + patterns.extend(abbreviations) + patterns.extend(specials) + patterns.extend([web, email]) + patterns.extend([digit, non_word, word]) + + patterns = r"(\s+|" + "|".join(patterns) + ")" + tokens = re.findall(patterns, text, re.UNICODE) + + return [token[0] for token in tokens] + + def pyvi_tokenize(self, text): + """Modified from pyvi to preserve text and whitespace.""" + if len(text) == 0: + return [] + elif text.isspace(): + return [text] + segs = self.pyvi_sylabelize_with_ws(text) + words = [] + preceding_ws = [] + for i, token in enumerate(segs): + if not token.isspace(): + words.append(token) + preceding_ws.append( + "" if (i == 0 or not segs[i - 1].isspace()) else segs[i - 1] + ) + labels = self.ViTokenizer.ViTokenizer.model.predict( + [self.ViTokenizer.ViTokenizer.sent2features(words, False)] + ) + token = words[0] + tokens = [] + for i in range(1, len(labels[0])): + if ( + labels[0][i] == "I_W" + and words[i] not in string.punctuation + and words[i - 1] not in string.punctuation + and not words[i][0].isdigit() + and not words[i - 1][0].isdigit() + and not (words[i][0].istitle() and not words[i - 1][0].istitle()) + ): + token = token + preceding_ws[i] + words[i] + else: + tokens.append(token) + token = words[i] + tokens.append(token) + return tokens + + def _get_config(self) -> Dict[str, Any]: + return {"use_pyvi": self.use_pyvi} + + def _set_config(self, config: Dict[str, Any] = {}) -> None: + self.use_pyvi = config.get("use_pyvi", False) + + def to_bytes(self, **kwargs) -> bytes: + serializers = {"cfg": lambda: srsly.json_dumps(self._get_config())} + return util.to_bytes(serializers, []) + + def from_bytes(self, data: bytes, **kwargs) -> "VietnameseTokenizer": + deserializers = {"cfg": lambda b: self._set_config(srsly.json_loads(b))} + util.from_bytes(data, deserializers, []) + return self + + def to_disk(self, path: Union[str, Path], **kwargs) -> None: + path = util.ensure_path(path) + serializers = {"cfg": lambda p: srsly.write_json(p, self._get_config())} + return util.to_disk(path, serializers, []) + + def from_disk(self, path: Union[str, Path], **kwargs) -> "VietnameseTokenizer": + path = util.ensure_path(path) + serializers = {"cfg": lambda p: self._set_config(srsly.read_json(p))} + util.from_disk(path, 
serializers, []) + return self + class VietnameseDefaults(Language.Defaults): config = load_config_from_str(DEFAULT_CONFIG) diff --git a/spacy/tests/conftest.py b/spacy/tests/conftest.py index 04e254c50..404783197 100644 --- a/spacy/tests/conftest.py +++ b/spacy/tests/conftest.py @@ -286,6 +286,12 @@ def ur_tokenizer(): return get_lang_class("ur")().tokenizer +@pytest.fixture(scope="session") +def vi_tokenizer(): + pytest.importorskip("pyvi") + return get_lang_class("vi")().tokenizer + + @pytest.fixture(scope="session") def yo_tokenizer(): return get_lang_class("yo")().tokenizer diff --git a/spacy/tests/lang/vi/__init__.py b/spacy/tests/lang/vi/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/spacy/tests/lang/vi/test_serialize.py b/spacy/tests/lang/vi/test_serialize.py new file mode 100644 index 000000000..3ee5333fb --- /dev/null +++ b/spacy/tests/lang/vi/test_serialize.py @@ -0,0 +1,33 @@ +from spacy.lang.vi import Vietnamese +from ...util import make_tempdir + + +def test_vi_tokenizer_serialize(vi_tokenizer): + tokenizer_bytes = vi_tokenizer.to_bytes() + nlp = Vietnamese() + nlp.tokenizer.from_bytes(tokenizer_bytes) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.use_pyvi is True + + with make_tempdir() as d: + file_path = d / "tokenizer" + vi_tokenizer.to_disk(file_path) + nlp = Vietnamese() + nlp.tokenizer.from_disk(file_path) + assert tokenizer_bytes == nlp.tokenizer.to_bytes() + assert nlp.tokenizer.use_pyvi is True + + # mode is (de)serialized correctly + nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}}) + nlp_bytes = nlp.to_bytes() + nlp_r = Vietnamese() + nlp_r.from_bytes(nlp_bytes) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.use_pyvi == False + + with make_tempdir() as d: + nlp.to_disk(d) + nlp_r = Vietnamese() + nlp_r.from_disk(d) + assert nlp_bytes == nlp_r.to_bytes() + assert nlp_r.tokenizer.use_pyvi == False diff --git a/spacy/tests/lang/vi/test_tokenizer.py b/spacy/tests/lang/vi/test_tokenizer.py new file mode 100644 index 000000000..3d0642d1e --- /dev/null +++ b/spacy/tests/lang/vi/test_tokenizer.py @@ -0,0 +1,47 @@ +import pytest + +from ...tokenizer.test_naughty_strings import NAUGHTY_STRINGS +from spacy.lang.vi import Vietnamese + + +# fmt: off +TOKENIZER_TESTS = [ + ("Đây là một văn bản bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này", ['Đây', 'là', 'một', 'văn bản', 'bằng', 'tiếng', 'Việt', 'Sau', 'đó', ',', 'đây', 'là', 'một', 'văn bản', 'khác', 'bằng', 'ngôn ngữ', 'này']), +] +# fmt: on + + +@pytest.mark.parametrize("text,expected_tokens", TOKENIZER_TESTS) +def test_vi_tokenizer(vi_tokenizer, text, expected_tokens): + tokens = [token.text for token in vi_tokenizer(text)] + assert tokens == expected_tokens + + +def test_vi_tokenizer_extra_spaces(vi_tokenizer): + # note: three spaces after "I" + tokens = vi_tokenizer("I like cheese.") + assert tokens[1].orth_ == " " + + +@pytest.mark.parametrize("text", NAUGHTY_STRINGS) +def test_vi_tokenizer_naughty_strings(vi_tokenizer, text): + tokens = vi_tokenizer(text) + assert tokens.text_with_ws == text + + +def test_vi_tokenizer_emptyish_texts(vi_tokenizer): + doc = vi_tokenizer("") + assert len(doc) == 0 + doc = vi_tokenizer(" ") + assert len(doc) == 1 + doc = vi_tokenizer("\n\n\n \t\t \n\n\n") + assert len(doc) == 1 + + +def test_vi_tokenizer_no_pyvi(): + """Test for whitespace tokenization without pyvi""" + nlp = Vietnamese.from_config({"nlp": {"tokenizer": {"use_pyvi": False}}}) + text = "Đây là một văn bản 
bằng tiếng Việt Sau đó, đây là một văn bản khác bằng ngôn ngữ này" + doc = nlp(text) + assert [t.text for t in doc if not t.is_space] == text.split() + assert doc[4].text == " " From 4e69fcaa50c582e9b4808c453d5cab70f1224071 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 19 May 2021 12:00:07 +0200 Subject: [PATCH 09/21] Disable GPU CI tests (#8143) --- azure-pipelines.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index bea65cae2..5840b916b 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -82,18 +82,18 @@ jobs: python_version: '$(python.version)' architecture: 'x64' - - job: "TestGPU" - dependsOn: "Validate" - strategy: - matrix: - Python38LinuxX64_GPU: - python.version: '3.8' - pool: - name: "LinuxX64_GPU" - steps: - - template: .github/azure-steps.yml - parameters: - python_version: '$(python.version)' - architecture: 'x64' - gpu: true - num_build_jobs: 24 +# - job: "TestGPU" +# dependsOn: "Validate" +# strategy: +# matrix: +# Python38LinuxX64_GPU: +# python.version: '3.8' +# pool: +# name: "LinuxX64_GPU" +# steps: +# - template: .github/azure-steps.yml +# parameters: +# python_version: '$(python.version)' +# architecture: 'x64' +# gpu: true +# num_build_jobs: 24 From cd6bd91c3a17d99674b5ed8c3b1092696ee59373 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Thu, 20 May 2021 14:48:09 +0200 Subject: [PATCH 10/21] Switch default train corpus max_length to 0 in quickstart (#8142) The behavior of `spacy.Corpus.v1` is unexpected enough for `max_length != 0` that `0` is a better default for users creating a new config with the quickstart. If not, documents are skipped, sometimes the entire corpus is skipped, and sometimes documents are (quite unexpectedly for your average user) split into sentences. 
--- spacy/cli/templates/quickstart_training.jinja | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/cli/templates/quickstart_training.jinja b/spacy/cli/templates/quickstart_training.jinja index e43c21bbd..0d422318b 100644 --- a/spacy/cli/templates/quickstart_training.jinja +++ b/spacy/cli/templates/quickstart_training.jinja @@ -372,7 +372,7 @@ factory = "{{ pipe }}" [corpora.train] @readers = "spacy.Corpus.v1" path = ${paths.train} -max_length = {{ 500 if hardware == "gpu" else 2000 }} +max_length = 0 [corpora.dev] @readers = "spacy.Corpus.v1" From 04239e94c71bf8e4512676085a92fabcfcb42bb4 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 31 May 2021 16:36:17 +0900 Subject: [PATCH 11/21] Use a context manager when reading model (fix #7036) (#8244) --- spacy/pipeline/trainable_pipe.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/spacy/pipeline/trainable_pipe.pyx b/spacy/pipeline/trainable_pipe.pyx index 64e33f800..fe51f38e5 100644 --- a/spacy/pipeline/trainable_pipe.pyx +++ b/spacy/pipeline/trainable_pipe.pyx @@ -324,7 +324,8 @@ cdef class TrainablePipe(Pipe): def load_model(p): try: - self.model.from_bytes(p.open("rb").read()) + with open(p, "rb") as mfile: + self.model.from_bytes(mfile.read()) except AttributeError: raise ValueError(Errors.E149) from None From d54631f68b2dc739bb6dd215ddc3ac14ee2465c6 Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Mon, 31 May 2021 18:04:29 +0900 Subject: [PATCH 12/21] Fix other open calls without context managers (#8245) --- spacy/cli/convert.py | 3 ++- spacy/pipeline/entity_linker.py | 3 ++- spacy/tests/tokenizer/test_tokenizer.py | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/spacy/cli/convert.py b/spacy/cli/convert.py index d13a4fc80..c84aa6431 100644 --- a/spacy/cli/convert.py +++ b/spacy/cli/convert.py @@ -115,7 +115,8 @@ def convert( ner_map = srsly.read_json(ner_map) if ner_map is not None else None doc_files = [] for input_loc in walk_directory(Path(input_path), converter): - input_data = input_loc.open("r", encoding="utf-8").read() + with input_loc.open("r", encoding="utf-8") as infile: + input_data = infile.read() # Use converter function to convert data func = CONVERTERS[converter] docs = func( diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 66070916e..21d5e9db1 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -439,7 +439,8 @@ class EntityLinker(TrainablePipe): def load_model(p): try: - self.model.from_bytes(p.open("rb").read()) + with p.open("rb") as infile: + self.model.from_bytes(infile.read()) except AttributeError: raise ValueError(Errors.E149) from None diff --git a/spacy/tests/tokenizer/test_tokenizer.py b/spacy/tests/tokenizer/test_tokenizer.py index 6cfeaf014..c1ba1df36 100644 --- a/spacy/tests/tokenizer/test_tokenizer.py +++ b/spacy/tests/tokenizer/test_tokenizer.py @@ -84,7 +84,8 @@ Phasellus tincidunt, augue quis porta finibus, massa sapien consectetur augue, n @pytest.mark.parametrize("file_name", ["sun.txt"]) def test_tokenizer_handle_text_from_file(tokenizer, file_name): loc = ensure_path(__file__).parent / file_name - text = loc.open("r", encoding="utf8").read() + with loc.open("r", encoding="utf8") as infile: + text = infile.read() assert len(text) != 0 tokens = tokenizer(text) assert len(tokens) > 100 From d959603d517f49dd90cf36378b93797eef02f67f Mon Sep 17 00:00:00 2001 From: Paul O'Leary McCann Date: Thu, 3 Jun 2021 16:05:26 +0900 Subject: [PATCH 13/21] Don't add 
duplicate patterns all the time in EntityRuler (fix #8216) (#8246) * Don't add duplicate patterns (fix #8216) * Refactor EntityRuler init This simplifies the EntityRuler init code. This is helpful as prep for allowing the EntityRuler to reset itself. * Make EntityRuler.clear reset matchers Includes a new test for this. * Tidy PhraseMatcher instantiation Since the attr can be None safely now, the guard if is no longer required here. Also renamed the `_validate` attr. Maybe it's not needed? * Fix NER test * Add test to make sure patterns aren't increasing * Move test to regression tests --- spacy/matcher/phrasematcher.pyx | 2 ++ spacy/pipeline/entityruler.py | 39 ++++++++++------------- spacy/tests/parser/test_ner.py | 2 +- spacy/tests/pipeline/test_entity_ruler.py | 13 ++++++++ spacy/tests/regression/test_issue8216.py | 34 ++++++++++++++++++++ 5 files changed, 67 insertions(+), 23 deletions(-) create mode 100644 spacy/tests/regression/test_issue8216.py diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx index e5ff2202c..d8486b84b 100644 --- a/spacy/matcher/phrasematcher.pyx +++ b/spacy/matcher/phrasematcher.pyx @@ -50,6 +50,8 @@ cdef class PhraseMatcher: if isinstance(attr, (int, long)): self.attr = attr else: + if attr is None: + attr = "ORTH" attr = attr.upper() if attr == "TEXT": attr = "ORTH" diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 4e61dbca7..03730f772 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -101,17 +101,12 @@ class EntityRuler(Pipe): self.overwrite = overwrite_ents self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) + self._validate = validate self.matcher = Matcher(nlp.vocab, validate=validate) - if phrase_matcher_attr is not None: - if phrase_matcher_attr.upper() == "TEXT": - phrase_matcher_attr = "ORTH" - self.phrase_matcher_attr = phrase_matcher_attr - self.phrase_matcher = PhraseMatcher( - nlp.vocab, attr=self.phrase_matcher_attr, validate=validate - ) - else: - self.phrase_matcher_attr = None - self.phrase_matcher = PhraseMatcher(nlp.vocab, validate=validate) + self.phrase_matcher_attr = phrase_matcher_attr + self.phrase_matcher = PhraseMatcher( + nlp.vocab, attr=self.phrase_matcher_attr, validate=validate + ) self.ent_id_sep = ent_id_sep self._ent_ids = defaultdict(dict) if patterns is not None: @@ -315,20 +310,22 @@ class EntityRuler(Pipe): pattern = entry["pattern"] if isinstance(pattern, Doc): self.phrase_patterns[label].append(pattern) + self.phrase_matcher.add(label, [pattern]) elif isinstance(pattern, list): self.token_patterns[label].append(pattern) + self.matcher.add(label, [pattern]) else: raise ValueError(Errors.E097.format(pattern=pattern)) - for label, patterns in self.token_patterns.items(): - self.matcher.add(label, patterns) - for label, patterns in self.phrase_patterns.items(): - self.phrase_matcher.add(label, patterns) def clear(self) -> None: """Reset all patterns.""" self.token_patterns = defaultdict(list) self.phrase_patterns = defaultdict(list) self._ent_ids = defaultdict(dict) + self.matcher = Matcher(self.nlp.vocab, validate=self._validate) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate + ) def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep @@ -374,10 +371,9 @@ class EntityRuler(Pipe): self.add_patterns(cfg.get("patterns", cfg)) self.overwrite = cfg.get("overwrite", False) 
self.phrase_matcher_attr = cfg.get("phrase_matcher_attr", None) - if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) else: self.add_patterns(cfg) @@ -428,10 +424,9 @@ class EntityRuler(Pipe): self.phrase_matcher_attr = cfg.get("phrase_matcher_attr") self.ent_id_sep = cfg.get("ent_id_sep", DEFAULT_ENT_ID_SEP) - if self.phrase_matcher_attr is not None: - self.phrase_matcher = PhraseMatcher( - self.nlp.vocab, attr=self.phrase_matcher_attr - ) + self.phrase_matcher = PhraseMatcher( + self.nlp.vocab, attr=self.phrase_matcher_attr + ) from_disk(path, deserializers_patterns, {}) return self diff --git a/spacy/tests/parser/test_ner.py b/spacy/tests/parser/test_ner.py index dffdff1ec..1b9d0b255 100644 --- a/spacy/tests/parser/test_ner.py +++ b/spacy/tests/parser/test_ner.py @@ -252,12 +252,12 @@ def test_ruler_before_ner(): # 1 : Entity Ruler - should set "this" to B and everything else to empty patterns = [{"label": "THING", "pattern": "This"}] ruler = nlp.add_pipe("entity_ruler") - ruler.add_patterns(patterns) # 2: untrained NER - should set everything else to O untrained_ner = nlp.add_pipe("ner") untrained_ner.add_label("MY_LABEL") nlp.initialize() + ruler.add_patterns(patterns) doc = nlp("This is Antti Korhonen speaking in Finland") expected_iobs = ["B", "O", "O", "O", "O", "O", "O"] expected_types = ["THING", "", "", "", "", "", ""] diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 2f6da79d6..79ad44abd 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -78,6 +78,19 @@ def test_entity_ruler_init_clear(nlp, patterns): assert len(ruler.labels) == 0 +def test_entity_ruler_clear(nlp, patterns): + """Test that initialization clears patterns.""" + ruler = nlp.add_pipe("entity_ruler") + ruler.add_patterns(patterns) + assert len(ruler.labels) == 4 + doc = nlp("hello world") + assert len(doc.ents) == 1 + ruler.clear() + assert len(ruler.labels) == 0 + doc = nlp("hello world") + assert len(doc.ents) == 0 + + def test_entity_ruler_existing(nlp, patterns): ruler = nlp.add_pipe("entity_ruler") ruler.add_patterns(patterns) diff --git a/spacy/tests/regression/test_issue8216.py b/spacy/tests/regression/test_issue8216.py new file mode 100644 index 000000000..528d4b6f9 --- /dev/null +++ b/spacy/tests/regression/test_issue8216.py @@ -0,0 +1,34 @@ +import pytest + +from spacy import registry +from spacy.language import Language +from spacy.pipeline import EntityRuler + + +@pytest.fixture +def nlp(): + return Language() + + +@pytest.fixture +@registry.misc("entity_ruler_patterns") +def patterns(): + return [ + {"label": "HELLO", "pattern": "hello world"}, + {"label": "BYE", "pattern": [{"LOWER": "bye"}, {"LOWER": "bye"}]}, + {"label": "HELLO", "pattern": [{"ORTH": "HELLO"}]}, + {"label": "COMPLEX", "pattern": [{"ORTH": "foo", "OP": "*"}]}, + {"label": "TECH_ORG", "pattern": "Apple", "id": "a1"}, + {"label": "TECH_ORG", "pattern": "Microsoft", "id": "a2"}, + ] + + +def test_entity_ruler_fix8216(nlp, patterns): + """Test that patterns don't get added excessively.""" + ruler = nlp.add_pipe("entity_ruler", config={"validate": True}) + ruler.add_patterns(patterns) + pattern_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert pattern_count > 0 + 
ruler.add_patterns([]) + after_count = sum(len(mm) for mm in ruler.matcher._patterns.values()) + assert after_count == pattern_count From 07082c96921f9df9f10f0429379518ae83ff6829 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Jun 2021 14:56:07 +0200 Subject: [PATCH 14/21] Exclude generated .cpp files from package (#8271) --- MANIFEST.in | 1 + 1 file changed, 1 insertion(+) diff --git a/MANIFEST.in b/MANIFEST.in index 8008b4507..99fc174bd 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,3 +8,4 @@ recursive-exclude spacy/lang *.json recursive-include spacy/lang *.json.gz recursive-include spacy/cli *.json *.yml recursive-include licenses * +recursive-exclude spacy *.cpp From f0277bdeabbcfd59da0242172bc334f821420f87 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 31 May 2021 10:20:27 +0200 Subject: [PATCH 15/21] Show warning if entity_ruler runs without patterns (#7807) * Show warning if entity_ruler runs without patterns * Show warning if matcher runs without patterns * fix wording * unit test for warning once (WIP) * warn W036 only once * cleanup * create filter_warning helper --- spacy/__init__.py | 6 ++--- spacy/errors.py | 30 ++++++++++++++++++++++- spacy/matcher/matcher.pyx | 6 +++++ spacy/pipeline/entityruler.py | 9 ++++++- spacy/tests/matcher/test_matcher_api.py | 9 +++++++ spacy/tests/pipeline/test_entity_ruler.py | 11 +++++++++ 6 files changed, 66 insertions(+), 5 deletions(-) diff --git a/spacy/__init__.py b/spacy/__init__.py index 1eef7e621..d07931cfd 100644 --- a/spacy/__init__.py +++ b/spacy/__init__.py @@ -1,10 +1,10 @@ from typing import Union, Iterable, Dict, Any from pathlib import Path -import warnings import sys -warnings.filterwarnings("ignore", message="numpy.dtype size changed") # noqa -warnings.filterwarnings("ignore", message="numpy.ufunc size changed") # noqa +# set library-specific custom warning handling before doing anything else +from .errors import setup_default_warnings +setup_default_warnings() # These are imported as part of the API from thinc.api import prefer_gpu, require_gpu, require_cpu # noqa: F401 diff --git a/spacy/errors.py b/spacy/errors.py index 7cf9e54e4..ce0d735af 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -1,3 +1,6 @@ +import warnings + + def add_codes(err_cls): """Add error codes to string messages via class attribute names.""" @@ -12,6 +15,30 @@ def add_codes(err_cls): return ErrorsWithCodes() +def setup_default_warnings(): + # ignore certain numpy warnings + filter_warning("ignore", error_msg="numpy.dtype size changed") # noqa + filter_warning("ignore", error_msg="numpy.ufunc size changed") # noqa + + # warn about entity_ruler & matcher having no patterns only once + for pipe in ["matcher", "entity_ruler"]: + filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) + + +def filter_warning(action: str, error_msg: str): + """Customize how spaCy should handle a certain warning. + + error_msg (str): e.g. 
"W006", or a full error message + action (str): "default", "error", "ignore", "always", "module" or "once" + """ + warnings.filterwarnings(action, message=_escape_warning_msg(error_msg)) + + +def _escape_warning_msg(msg): + """To filter with warnings.filterwarnings, the [] brackets need to be escaped""" + return msg.replace("[", "\\[").replace("]", "\\]") + + # fmt: off @add_codes @@ -80,8 +107,9 @@ class Warnings: "@misc = \"spacy.LookupsDataLoader.v1\"\n" "lang = ${{nlp.lang}}\n" "tables = [\"lexeme_norm\"]\n") - W035 = ('Discarding subpattern "{pattern}" due to an unrecognized ' + W035 = ("Discarding subpattern '{pattern}' due to an unrecognized " "attribute or operator.") + W036 = ("The component '{name}' does not have any patterns defined.") # New warnings added in v3.x W086 = ("Component '{listener}' will be (re)trained, but it needs the component " diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index dae12c3f6..6fd8bdb03 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -138,6 +138,11 @@ cdef class Matcher: self._filter[key] = greedy self._patterns[key].extend(patterns) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name="matcher")) + def remove(self, key): """Remove a rule from the matcher. A KeyError is raised if the key does not exist. @@ -215,6 +220,7 @@ cdef class Matcher: If with_alignments is set to True and as_spans is set to False, A list of `(match_id, start, end, alignments)` tuples is returned. """ + self._require_patterns() if isinstance(doclike, Doc): doc = doclike length = len(doc) diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py index 03730f772..78269f180 100644 --- a/spacy/pipeline/entityruler.py +++ b/spacy/pipeline/entityruler.py @@ -1,3 +1,4 @@ +import warnings from typing import Optional, Union, List, Dict, Tuple, Iterable, Any, Callable, Sequence from collections import defaultdict from pathlib import Path @@ -6,7 +7,7 @@ import srsly from .pipe import Pipe from ..training import Example from ..language import Language -from ..errors import Errors +from ..errors import Errors, Warnings from ..util import ensure_path, to_disk, from_disk, SimpleFrozenList from ..tokens import Doc, Span from ..matcher import Matcher, PhraseMatcher @@ -139,6 +140,7 @@ class EntityRuler(Pipe): error_handler(self.name, self, [doc], e) def match(self, doc: Doc): + self._require_patterns() matches = list(self.matcher(doc)) + list(self.phrase_matcher(doc)) matches = set( [(m_id, start, end) for m_id, start, end in matches if start != end] @@ -327,6 +329,11 @@ class EntityRuler(Pipe): self.nlp.vocab, attr=self.phrase_matcher_attr, validate=self._validate ) + def _require_patterns(self) -> None: + """Raise a warning if this component has no patterns defined.""" + if len(self) == 0: + warnings.warn(Warnings.W036.format(name=self.name)) + def _split_label(self, label: str) -> Tuple[str, str]: """Split Entity label into ent_label and ent_id if it contains self.ent_id_sep diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py index 094bf22a6..4e6b4bfae 100644 --- a/spacy/tests/matcher/test_matcher_api.py +++ b/spacy/tests/matcher/test_matcher_api.py @@ -33,6 +33,15 @@ def test_matcher_from_api_docs(en_vocab): assert len(patterns[0]) +def test_matcher_empty_patterns_warns(en_vocab): + matcher = Matcher(en_vocab) + assert len(matcher) == 0 + doc = Doc(en_vocab, 
words=["This", "is", "quite", "something"]) + with pytest.warns(UserWarning): + matcher(doc) + assert len(doc.ents) == 0 + + def test_matcher_from_usage_docs(en_vocab): text = "Wow 😀 This is really cool! 😂 😂" doc = Doc(en_vocab, words=text.split(" ")) diff --git a/spacy/tests/pipeline/test_entity_ruler.py b/spacy/tests/pipeline/test_entity_ruler.py index 79ad44abd..4a01ce183 100644 --- a/spacy/tests/pipeline/test_entity_ruler.py +++ b/spacy/tests/pipeline/test_entity_ruler.py @@ -46,6 +46,17 @@ def test_entity_ruler_init(nlp, patterns): assert doc.ents[1].label_ == "BYE" +def test_entity_ruler_no_patterns_warns(nlp): + ruler = EntityRuler(nlp) + assert len(ruler) == 0 + assert len(ruler.labels) == 0 + nlp.add_pipe("entity_ruler") + assert nlp.pipe_names == ["entity_ruler"] + with pytest.warns(UserWarning): + doc = nlp("hello world bye bye") + assert len(doc.ents) == 0 + + def test_entity_ruler_init_patterns(nlp, patterns): # initialize with patterns ruler = nlp.add_pipe("entity_ruler") From 9dfd3c9484a2cf332bed9f84473c2d419f621fb6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 4 Jun 2021 17:44:04 +0200 Subject: [PATCH 16/21] Use warnings.warn instead of logger.warning --- spacy/errors.py | 3 +++ spacy/language.py | 2 +- spacy/matcher/dependencymatcher.pyx | 4 ++-- spacy/pipeline/lemmatizer.py | 4 +++- spacy/tests/doc/test_doc_api.py | 9 ++------- spacy/tests/pipeline/test_lemmatizer.py | 8 +++----- spacy/tests/pipeline/test_pipe_factories.py | 6 +----- spacy/tokens/doc.pyx | 2 +- 8 files changed, 16 insertions(+), 22 deletions(-) diff --git a/spacy/errors.py b/spacy/errors.py index ce0d735af..2e8cc4494 100644 --- a/spacy/errors.py +++ b/spacy/errors.py @@ -24,6 +24,9 @@ def setup_default_warnings(): for pipe in ["matcher", "entity_ruler"]: filter_warning("once", error_msg=Warnings.W036.format(name=pipe)) + # warn once about lemmatizer without required POS + filter_warning("once", error_msg="[W108]") + def filter_warning(action: str, error_msg: str): """Customize how spaCy should handle a certain warning. 
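# A rough standalone sketch (not part of this patch) of why the "once" filters
# registered above escape the square brackets: warnings.filterwarnings() treats
# `message` as a regex matched against the start of the warning text, so a code
# prefix such as "[W108]" must be escaped to match literally. The warning text
# below is made up for the example.
import warnings

def filter_warning(action: str, error_msg: str) -> None:
    # Escape [] so "[W108]" is matched literally, not as a regex character class.
    escaped = error_msg.replace("[", "\\[").replace("]", "\\]")
    warnings.filterwarnings(action, message=escaped)

filter_warning("once", error_msg="[W108]")
for _ in range(3):
    # With the "once" action, only the first matching warning is reported.
    warnings.warn("[W108] illustrative lemmatizer warning text")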
diff --git a/spacy/language.py b/spacy/language.py index 1a447c11b..7786089a5 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -689,7 +689,7 @@ class Language: if self.vocab.vectors.shape != source.vocab.vectors.shape or \ self.vocab.vectors.key2row != source.vocab.vectors.key2row or \ self.vocab.vectors.to_bytes() != source.vocab.vectors.to_bytes(): - util.logger.warning(Warnings.W113.format(name=source_name)) + warnings.warn(Warnings.W113.format(name=source_name)) if not source_name in source.component_names: raise KeyError( Errors.E944.format( diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx index 0e601281a..b6e84a5da 100644 --- a/spacy/matcher/dependencymatcher.pyx +++ b/spacy/matcher/dependencymatcher.pyx @@ -4,6 +4,7 @@ from collections import defaultdict from itertools import product import numpy +import warnings from .matcher cimport Matcher from ..vocab cimport Vocab @@ -11,7 +12,6 @@ from ..tokens.doc cimport Doc from ..errors import Errors, Warnings from ..tokens import Span -from ..util import logger DELIMITER = "||" @@ -282,7 +282,7 @@ cdef class DependencyMatcher: keys_to_position_maps = defaultdict(lambda: defaultdict(list)) for match_id, start, end in self._matcher(doc): if start + 1 != end: - logger.warning(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) + warnings.warn(Warnings.W110.format(tokens=[t.text for t in doc[start:end]], pattern=self._matcher.get(match_id)[1][0][0])) token = doc[start] root = ([token] + list(token.ancestors))[-1] keys_to_position_maps[root.i][match_id].append(start) diff --git a/spacy/pipeline/lemmatizer.py b/spacy/pipeline/lemmatizer.py index cfe405efa..87504fade 100644 --- a/spacy/pipeline/lemmatizer.py +++ b/spacy/pipeline/lemmatizer.py @@ -2,6 +2,8 @@ from typing import Optional, List, Dict, Any, Callable, Iterable, Union, Tuple from thinc.api import Model from pathlib import Path +import warnings + from .pipe import Pipe from ..errors import Errors, Warnings from ..language import Language @@ -182,7 +184,7 @@ class Lemmatizer(Pipe): univ_pos = token.pos_.lower() if univ_pos in ("", "eol", "space"): if univ_pos == "": - logger.warning(Warnings.W108.format(text=string)) + warnings.warn(Warnings.W108.format(text=string)) return [string.lower()] # See Issue #435 for example of where this logic is requied. if self.is_base_form(token): diff --git a/spacy/tests/doc/test_doc_api.py b/spacy/tests/doc/test_doc_api.py index d7452a802..358724517 100644 --- a/spacy/tests/doc/test_doc_api.py +++ b/spacy/tests/doc/test_doc_api.py @@ -2,8 +2,6 @@ import weakref import pytest import numpy -import logging -import mock from spacy.lang.xx import MultiLanguage from spacy.tokens import Doc, Span, Token @@ -158,13 +156,10 @@ def test_doc_api_serialize(en_tokenizer, text): def inner_func(d1, d2): return "hello!" 
- logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: - _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_not_called() + _ = tokens.to_bytes() # noqa: F841 + with pytest.warns(UserWarning): tokens.user_hooks["similarity"] = inner_func _ = tokens.to_bytes() # noqa: F841 - mock_warning.assert_called_once() def test_doc_api_set_ents(en_tokenizer): diff --git a/spacy/tests/pipeline/test_lemmatizer.py b/spacy/tests/pipeline/test_lemmatizer.py index 3c16d3bcb..1bec8696c 100644 --- a/spacy/tests/pipeline/test_lemmatizer.py +++ b/spacy/tests/pipeline/test_lemmatizer.py @@ -1,6 +1,4 @@ import pytest -import logging -import mock import pickle from spacy import util, registry from spacy.lang.en import English @@ -59,10 +57,10 @@ def test_lemmatizer_config(nlp): # warning if no POS assigned doc = nlp.make_doc("coping") - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): doc = lemmatizer(doc) - mock_warning.assert_called_once() + # warns once by default + doc = lemmatizer(doc) # works with POS doc = nlp.make_doc("coping") diff --git a/spacy/tests/pipeline/test_pipe_factories.py b/spacy/tests/pipeline/test_pipe_factories.py index c5cc62661..b99e9a863 100644 --- a/spacy/tests/pipeline/test_pipe_factories.py +++ b/spacy/tests/pipeline/test_pipe_factories.py @@ -1,6 +1,4 @@ import pytest -import mock -import logging from spacy.language import Language from spacy.lang.en import English from spacy.lang.de import German @@ -437,10 +435,8 @@ def test_pipe_factories_from_source_language_subclass(): nlp = English() nlp.vocab.vectors.resize((1, 4)) nlp.vocab.vectors.add("cat", vector=[1, 2, 3, 4]) - logger = logging.getLogger("spacy") - with mock.patch.object(logger, "warning") as mock_warning: + with pytest.warns(UserWarning): nlp.add_pipe("tagger", source=source_nlp) - mock_warning.assert_called() def test_pipe_factories_from_source_custom(): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index aae0ff374..28f8debf3 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -1318,7 +1318,7 @@ cdef class Doc: if "user_data_values" not in exclude: serializers["user_data_values"] = lambda: srsly.msgpack_dumps(user_data_values) if "user_hooks" not in exclude and any((self.user_hooks, self.user_token_hooks, self.user_span_hooks)): - util.logger.warning(Warnings.W109) + warnings.warn(Warnings.W109) return util.to_dict(serializers, exclude) def from_dict(self, msg, *, exclude=tuple()): From d9be9e6cf9f892d97a26b11e022627abdb9dd07d Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Jun 2021 10:20:24 +0200 Subject: [PATCH 17/21] Move README.md and LICENSES_SOURCES in package (#8297) In addition to `LICENSE`, move the files `README.md` and `LICENSES_SOURCES` to the top directory in `spacy package` if present in the model directory. 
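As a rough standalone sketch of the behaviour described above (the real change
is in the diff below; the helper name here is hypothetical), the move amounts to:

import shutil
from pathlib import Path

FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"]

def move_docs_to_top(package_path: Path, main_path: Path, model_name_v: str) -> None:
    # Move LICENSE, LICENSES_SOURCES and README.md (when present) from the
    # copied model directory up to the top level of the generated package.
    for file_name in FILENAMES_DOCS:
        file_path = package_path / model_name_v / file_name
        if file_path.exists():
            shutil.move(str(file_path), str(main_path))

In the actual command this runs right after shutil.copytree has copied the
pipeline into the package directory.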
--- spacy/cli/package.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/spacy/cli/package.py b/spacy/cli/package.py index 5b8daf048..58e191f65 100644 --- a/spacy/cli/package.py +++ b/spacy/cli/package.py @@ -113,7 +113,7 @@ def package( print("\n".join(errors)) sys.exit(1) model_name = meta["name"] - if not model_name.startswith(meta['lang'] + "_"): + if not model_name.startswith(meta["lang"] + "_"): model_name = f"{meta['lang']}_{model_name}" model_name_v = model_name + "-" + meta["version"] main_path = output_dir / model_name_v @@ -130,9 +130,10 @@ def package( ) Path.mkdir(package_path, parents=True) shutil.copytree(str(input_dir), str(package_path / model_name_v)) - license_path = package_path / model_name_v / "LICENSE" - if license_path.exists(): - shutil.move(str(license_path), str(main_path)) + for file_name in FILENAMES_DOCS: + file_path = package_path / model_name_v / file_name + if file_path.exists(): + shutil.move(str(file_path), str(main_path)) imports = [] for code_path in code_paths: imports.append(code_path.stem) @@ -317,3 +318,6 @@ __version__ = get_model_meta(Path(__file__).parent)['version'] def load(**overrides): return load_model_from_init_py(__file__, **overrides) """.lstrip() + + +FILENAMES_DOCS = ["LICENSE", "LICENSES_SOURCES", "README.md"] From b98d216205068024407ae02c27c06da04ea88ff8 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Fri, 11 Jun 2021 10:21:22 +0200 Subject: [PATCH 18/21] Update Catalan language data (#8308) * Update Catalan language data Update Catalan language data based on contributions from the Text Mining Unit at the Barcelona Supercomputing Center: https://github.com/TeMU-BSC/spacy4release/tree/main/lang_data * Update tokenizer settings for UD Catalan AnCora Update for UD Catalan AnCora v2.7 with merged multi-word tokens. 
* Update test * Move prefix patternt to more generic infix pattern * Clean up --- setup.cfg | 2 +- spacy/lang/ca/__init__.py | 22 +++++++- spacy/lang/ca/lemmatizer.py | 81 +++++++++++++++++++++++++++ spacy/lang/ca/punctuation.py | 44 +++++++++++++-- spacy/lang/ca/syntax_iterators.py | 46 +++++++++++++++ spacy/lang/ca/tokenizer_exceptions.py | 7 +++ spacy/tests/lang/ca/test_text.py | 4 +- spacy/tests/lang/test_lemmatizers.py | 2 +- 8 files changed, 198 insertions(+), 10 deletions(-) create mode 100644 spacy/lang/ca/lemmatizer.py create mode 100644 spacy/lang/ca/syntax_iterators.py diff --git a/setup.cfg b/setup.cfg index 2fedd8f5c..cd55911fe 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,7 +65,7 @@ console_scripts = [options.extras_require] lookups = - spacy_lookups_data>=1.0.0,<1.1.0 + spacy_lookups_data>=1.0.1,<1.1.0 transformers = spacy_transformers>=1.0.1,<1.1.0 ray = diff --git a/spacy/lang/ca/__init__.py b/spacy/lang/ca/__init__.py index 970b23c1e..81f39b13c 100644 --- a/spacy/lang/ca/__init__.py +++ b/spacy/lang/ca/__init__.py @@ -1,15 +1,23 @@ +from typing import Optional + +from thinc.api import Model + from .tokenizer_exceptions import TOKENIZER_EXCEPTIONS -from .punctuation import TOKENIZER_INFIXES +from .punctuation import TOKENIZER_INFIXES, TOKENIZER_SUFFIXES from .stop_words import STOP_WORDS from .lex_attrs import LEX_ATTRS +from .syntax_iterators import SYNTAX_ITERATORS from ...language import Language +from .lemmatizer import CatalanLemmatizer class CatalanDefaults(Language.Defaults): tokenizer_exceptions = TOKENIZER_EXCEPTIONS infixes = TOKENIZER_INFIXES + suffixes = TOKENIZER_SUFFIXES stop_words = STOP_WORDS lex_attr_getters = LEX_ATTRS + syntax_iterators = SYNTAX_ITERATORS class Catalan(Language): @@ -17,4 +25,16 @@ class Catalan(Language): Defaults = CatalanDefaults +@Catalan.factory( + "lemmatizer", + assigns=["token.lemma"], + default_config={"model": None, "mode": "rule", "overwrite": False}, + default_score_weights={"lemma_acc": 1.0}, +) +def make_lemmatizer( + nlp: Language, model: Optional[Model], name: str, mode: str, overwrite: bool +): + return CatalanLemmatizer(nlp.vocab, model, name, mode=mode, overwrite=overwrite) + + __all__ = ["Catalan"] diff --git a/spacy/lang/ca/lemmatizer.py b/spacy/lang/ca/lemmatizer.py new file mode 100644 index 000000000..2518eb720 --- /dev/null +++ b/spacy/lang/ca/lemmatizer.py @@ -0,0 +1,81 @@ +from typing import List, Tuple + +from ...pipeline import Lemmatizer +from ...tokens import Token + + +class CatalanLemmatizer(Lemmatizer): + """ + Copied from French Lemmatizer + Catalan language lemmatizer applies the default rule based lemmatization + procedure with some modifications for better Catalan language support. + + The parts of speech 'ADV', 'PRON', 'DET', 'ADP' and 'AUX' are added to use + the rule-based lemmatization. As a last resort, the lemmatizer checks in + the lookup table. 
+ """ + + @classmethod + def get_lookups_config(cls, mode: str) -> Tuple[List[str], List[str]]: + if mode == "rule": + required = ["lemma_lookup", "lemma_rules", "lemma_exc", "lemma_index"] + return (required, []) + else: + return super().get_lookups_config(mode) + + def rule_lemmatize(self, token: Token) -> List[str]: + cache_key = (token.orth, token.pos) + if cache_key in self.cache: + return self.cache[cache_key] + string = token.text + univ_pos = token.pos_.lower() + if univ_pos in ("", "eol", "space"): + return [string.lower()] + elif "lemma_rules" not in self.lookups or univ_pos not in ( + "noun", + "verb", + "adj", + "adp", + "adv", + "aux", + "cconj", + "det", + "pron", + "punct", + "sconj", + ): + return self.lookup_lemmatize(token) + index_table = self.lookups.get_table("lemma_index", {}) + exc_table = self.lookups.get_table("lemma_exc", {}) + rules_table = self.lookups.get_table("lemma_rules", {}) + lookup_table = self.lookups.get_table("lemma_lookup", {}) + index = index_table.get(univ_pos, {}) + exceptions = exc_table.get(univ_pos, {}) + rules = rules_table.get(univ_pos, []) + string = string.lower() + forms = [] + if string in index: + forms.append(string) + self.cache[cache_key] = forms + return forms + forms.extend(exceptions.get(string, [])) + oov_forms = [] + if not forms: + for old, new in rules: + if string.endswith(old): + form = string[: len(string) - len(old)] + new + if not form: + pass + elif form in index or not form.isalpha(): + forms.append(form) + else: + oov_forms.append(form) + if not forms: + forms.extend(oov_forms) + if not forms and string in lookup_table.keys(): + forms.append(self.lookup_lemmatize(token)[0]) + if not forms: + forms.append(string) + forms = list(set(forms)) + self.cache[cache_key] = forms + return forms diff --git a/spacy/lang/ca/punctuation.py b/spacy/lang/ca/punctuation.py index d50b75589..39db08f17 100644 --- a/spacy/lang/ca/punctuation.py +++ b/spacy/lang/ca/punctuation.py @@ -1,12 +1,46 @@ -from ..punctuation import TOKENIZER_INFIXES -from ..char_classes import ALPHA +from ..char_classes import LIST_PUNCT, LIST_ELLIPSES, LIST_QUOTES, LIST_ICONS +from ..char_classes import CURRENCY +from ..char_classes import CONCAT_QUOTES, ALPHA_LOWER, ALPHA_UPPER, ALPHA, PUNCT +from ..char_classes import merge_chars, _units ELISION = " ' ’ ".strip().replace(" ", "").replace("\n", "") -_infixes = TOKENIZER_INFIXES + [ - r"(?<=[{a}][{el}])(?=[{a}])".format(a=ALPHA, el=ELISION) -] +_infixes = ( + LIST_ELLIPSES + + LIST_ICONS + + [ + r"(?<=[0-9])[+\-\*^](?=[0-9-])", + r"(?<=[{al}{q}])\.(?=[{au}{q}])".format( + al=ALPHA_LOWER, au=ALPHA_UPPER, q=CONCAT_QUOTES + ), + r"(?<=[{a}]),(?=[{a}])".format(a=ALPHA), + r"(?<=[{a}0-9])[:<>=/](?=[{a}])".format(a=ALPHA), + r"(?<=[{a}][{el}])(?=[{a}0-9])".format(a=ALPHA, el=ELISION), + ] +) + +_units = _units.replace("% ", "") +UNITS = merge_chars(_units) + +_suffixes = ( + LIST_PUNCT + + LIST_ELLIPSES + + LIST_QUOTES + + LIST_ICONS + + [r"-", "—", "–"] + + [ + r"(?<=[0-9])\+", + r"(?<=°[FfCcKk])\.", + r"(?<=[0-9])(?:{c})".format(c=CURRENCY), + r"(?<=[0-9])(?:{u})".format(u=UNITS), + r"(?<=[0-9{al}{e}{p}(?:{q})])\.".format( + al=ALPHA_LOWER, e=r"%²\-\+", q=CONCAT_QUOTES, p=PUNCT + ), + r"(?<=[{au}][{au}])\.".format(au=ALPHA_UPPER), + ] +) TOKENIZER_INFIXES = _infixes +TOKENIZER_SUFFIXES = _suffixes diff --git a/spacy/lang/ca/syntax_iterators.py b/spacy/lang/ca/syntax_iterators.py new file mode 100644 index 000000000..c70d53e80 --- /dev/null +++ b/spacy/lang/ca/syntax_iterators.py @@ -0,0 +1,46 @@ +from ...symbols import 
NOUN, PROPN +from ...errors import Errors + + +def noun_chunks(doclike): + """Detect base noun phrases from a dependency parse. Works on Doc and Span.""" + # fmt: off + labels = ["nsubj", "nsubj:pass", "obj", "obl", "iobj", "ROOT", "appos", "nmod", "nmod:poss"] + # fmt: on + doc = doclike.doc # Ensure works on both Doc and Span. + if not doc.has_annotation("DEP"): + raise ValueError(Errors.E029) + np_deps = [doc.vocab.strings[label] for label in labels] + np_label = doc.vocab.strings.add("NP") + prev_end = -1 + for i, word in enumerate(doclike): + if word.pos not in (NOUN, PROPN): + continue + # Prevent nested chunks from being produced + if word.left_edge.i <= prev_end: + continue + if word.dep in np_deps: + left = word.left_edge.i + right = word.right_edge.i + 1 + # leave prepositions and punctuation out of the left side of the chunk + if word.left_edge.pos_ == "ADP" or word.left_edge.pos_ == "PUNCT": + left = word.left_edge.i + 1 + prev_end = word.right_edge.i + # leave subordinated clauses and appositions out of the chunk + a = word.i + 1 + while a < word.right_edge.i: + paraula = doc[a] + if paraula.pos_ == "VERB": + right = paraula.left_edge.i + prev_end = paraula.left_edge.i - 1 + elif paraula.dep_ == "appos": + right = paraula.left_edge.i + 1 + prev_end = paraula.left_edge.i - 1 + a += 1 + # leave punctuation out of the right side of the chunk + if word.right_edge.pos_ == "PUNCT": + right = right - 1 + yield left, right, np_label + + +SYNTAX_ITERATORS = {"noun_chunks": noun_chunks} diff --git a/spacy/lang/ca/tokenizer_exceptions.py b/spacy/lang/ca/tokenizer_exceptions.py index b465e97ba..5f9a50f5e 100644 --- a/spacy/lang/ca/tokenizer_exceptions.py +++ b/spacy/lang/ca/tokenizer_exceptions.py @@ -24,6 +24,13 @@ for exc_data in [ {ORTH: "núm", NORM: "número"}, {ORTH: "St.", NORM: "sant"}, {ORTH: "Sta.", NORM: "santa"}, + {ORTH: "'l"}, + {ORTH: "'ls"}, + {ORTH: "'m"}, + {ORTH: "'n"}, + {ORTH: "'ns"}, + {ORTH: "'s"}, + {ORTH: "'t"}, ]: _exc[exc_data[ORTH]] = [exc_data] diff --git a/spacy/tests/lang/ca/test_text.py b/spacy/tests/lang/ca/test_text.py index 38f5fc708..55bad0e94 100644 --- a/spacy/tests/lang/ca/test_text.py +++ b/spacy/tests/lang/ca/test_text.py @@ -12,13 +12,13 @@ def test_ca_tokenizer_handles_long_text(ca_tokenizer): una gerra de cervesa. Ens asseiem -fotògraf i periodista- en una terrassa buida.""" tokens = ca_tokenizer(text) - assert len(tokens) == 138 + assert len(tokens) == 140 @pytest.mark.parametrize( "text,length", [ - ("Perquè va anar-hi?", 6), + ("Perquè va anar-hi?", 4), ("“Ah no?”", 5), ("""Sí! "Anem", va contestar el Joan Carles""", 11), ("Van córrer aprox. 
10km", 5), diff --git a/spacy/tests/lang/test_lemmatizers.py b/spacy/tests/lang/test_lemmatizers.py index e755da22d..e419f0a14 100644 --- a/spacy/tests/lang/test_lemmatizers.py +++ b/spacy/tests/lang/test_lemmatizers.py @@ -8,7 +8,7 @@ from spacy.util import get_lang_class # Only include languages with no external dependencies # excluded: ru, uk # excluded for custom tables: es, pl -LANGUAGES = ["bn", "el", "en", "fa", "fr", "nb", "nl", "sv"] +LANGUAGES = ["bn", "ca", "el", "en", "fa", "fr", "nb", "nl", "sv"] # fmt: on From 63d748f80eea38cd5412aa7935b56b8ed03009f6 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 14 Jun 2021 09:50:13 +0200 Subject: [PATCH 19/21] Add Catalan and Danish trf to website models (#8378) --- website/meta/languages.json | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/website/meta/languages.json b/website/meta/languages.json index e05718047..b605210c3 100644 --- a/website/meta/languages.json +++ b/website/meta/languages.json @@ -25,7 +25,13 @@ "code": "ca", "name": "Catalan", "example": "Això és una frase.", - "has_examples": true + "has_examples": true, + "models": [ + "ca_core_news_sm", + "ca_core_news_md", + "ca_core_news_lg", + "ca_core_news_trf" + ] }, { "code": "cs", @@ -40,7 +46,8 @@ "models": [ "da_core_news_sm", "da_core_news_md", - "da_core_news_lg" + "da_core_news_lg", + "da_core_news_trf" ] }, { @@ -545,4 +552,4 @@ "url": "https://github.com/UniversalDependencies/UD_French-Sequoia/blob/master/LICENSE.txt" } ] -} \ No newline at end of file +} From 6b69b8934b483862a3e7d57ace26ae05ce16d053 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Mon, 14 Jun 2021 11:17:35 +0200 Subject: [PATCH 20/21] Set version to v3.1.0.dev0 (#8379) --- spacy/about.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/about.py b/spacy/about.py index c351076c5..dc521045c 100644 --- a/spacy/about.py +++ b/spacy/about.py @@ -1,6 +1,6 @@ # fmt: off __title__ = "spacy" -__version__ = "3.0.6" +__version__ = "3.1.0.dev0" __download_url__ = "https://github.com/explosion/spacy-models/releases/download" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __projects__ = "https://github.com/explosion/projects" From 33240ed2c5c0a8d086df07e5e2d349f088a060d0 Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 16 Jun 2021 10:14:42 +0200 Subject: [PATCH 21/21] Temporarily skip model download test --- .github/azure-steps.yml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/azure-steps.yml b/.github/azure-steps.yml index 21d2654ad..df0691fd3 100644 --- a/.github/azure-steps.yml +++ b/.github/azure-steps.yml @@ -60,8 +60,8 @@ steps: displayName: "Run GPU tests" condition: eq(${{ parameters.gpu }}, true) - - script: | - python -m spacy download en_core_web_sm - python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')" - displayName: 'Test download CLI' - condition: eq(variables['python_version'], '3.8') +# - script: | +# python -m spacy download en_core_web_sm +# python -c "import spacy; nlp=spacy.load('en_core_web_sm'); doc=nlp('test')" +# displayName: 'Test download CLI' +# condition: eq(variables['python_version'], '3.8')
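While this step is disabled in CI, the equivalent check can still be run
locally; a rough Python sketch of what it exercises (assuming the small
English pipeline used in the commented-out step):

import spacy
from spacy.cli import download

download("en_core_web_sm")          # same as: python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")  # confirm the freshly installed package loads
doc = nlp("test")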