From c998cde7e26edf71fd0c5bbdf694d52fef7ee192 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 19:22:59 +0100 Subject: [PATCH 1/7] Auto-format [ci skip] --- spacy/_ml.py | 1 + 1 file changed, 1 insertion(+) diff --git a/spacy/_ml.py b/spacy/_ml.py index 168519bfe..c08dce100 100644 --- a/spacy/_ml.py +++ b/spacy/_ml.py @@ -97,6 +97,7 @@ def with_cpu(ops, model): """Wrap a model that should run on CPU, transferring inputs and outputs as necessary.""" model.to_cpu() + def with_cpu_forward(inputs, drop=0.): cpu_outputs, backprop = model.begin_update(_to_cpu(inputs), drop=drop) gpu_outputs = _to_device(ops, cpu_outputs) From 8dbf1e9037aecbe31e8d5d3f95798a5bc2c0841c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Sun, 10 Mar 2019 23:36:28 +0100 Subject: [PATCH 2/7] Also fix #3387 on develop --- website/docs/usage/visualizers.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md index b1a6b8c7c..6d53fc150 100644 --- a/website/docs/usage/visualizers.md +++ b/website/docs/usage/visualizers.md @@ -283,7 +283,7 @@ from pathlib import Path nlp = spacy.load("en_core_web_sm") sentences = [u"This is an example.", u"This is another one."] for sent in sentences: - doc = nlp(sentence) + doc = nlp(sent) svg = displacy.render(doc, style="dep") file_name = '-'.join([w.text for w in doc if not w.is_punct]) + ".svg" output_path = Path("/images/" + file_name) From 98acf5ffe408d3ec58fcfba0e0deb742891815d5 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 23:36:47 +0100 Subject: [PATCH 3/7] =?UTF-8?q?=F0=9F=92=AB=20Allow=20passing=20of=20confi?= =?UTF-8?q?g=20parameters=20to=20specific=20pipeline=20components=20(#3386?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add component_cfg kwarg to begin_training * Document component_cfg arg to begin_training * Update docs and auto-format * Support component_cfg across Language * Format * Update docs and docstrings [ci skip] * Fix begin_training --- spacy/language.py | 69 +++++++++++++++++++++++++++--------- website/docs/api/language.md | 41 +++++++++++---------- 2 files changed, 75 insertions(+), 35 deletions(-) diff --git a/spacy/language.py b/spacy/language.py index 6fb30e46d..44a819132 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -106,6 +106,7 @@ class Language(object): DOCS: https://spacy.io/api/language """ + Defaults = BaseDefaults lang = None @@ -344,13 +345,15 @@ class Language(object): raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) return self.pipeline.pop(self.pipe_names.index(name)) - def __call__(self, text, disable=[]): + def __call__(self, text, disable=[], component_cfg=None): """Apply the pipeline to some text. The text can span multiple sentences, and can contain arbtrary whitespace. Alignment into the original string is preserved. text (unicode): The text to be processed. disable (list): Names of the pipeline components to disable. + component_cfg (dict): An optional dictionary with extra keyword arguments + for specific components. RETURNS (Doc): A container for accessing the annotations. 
EXAMPLE: @@ -363,12 +366,14 @@ class Language(object): Errors.E088.format(length=len(text), max_length=self.max_length) ) doc = self.make_doc(text) + if component_cfg is None: + component_cfg = {} for name, proc in self.pipeline: if name in disable: continue if not hasattr(proc, "__call__"): raise ValueError(Errors.E003.format(component=type(proc), name=name)) - doc = proc(doc) + doc = proc(doc, **component_cfg.get(name, {})) if doc is None: raise ValueError(Errors.E005.format(name=name)) return doc @@ -396,7 +401,7 @@ class Language(object): def make_doc(self, text): return self.tokenizer(text) - def update(self, docs, golds, drop=0.0, sgd=None, losses=None): + def update(self, docs, golds, drop=0.0, sgd=None, losses=None, component_cfg=None): """Update the models in the pipeline. docs (iterable): A batch of `Doc` objects. @@ -443,11 +448,15 @@ class Language(object): pipes = list(self.pipeline) random.shuffle(pipes) + if component_cfg is None: + component_cfg = {} for name, proc in pipes: if not hasattr(proc, "update"): continue grads = {} - proc.update(docs, golds, drop=drop, sgd=get_grads, losses=losses) + kwargs = component_cfg.get(name, {}) + kwargs.setdefault("drop", drop) + proc.update(docs, golds, sgd=get_grads, losses=losses, **kwargs) for key, (W, dW) in grads.items(): sgd(W, dW, key=key) @@ -517,11 +526,12 @@ class Language(object): for doc, gold in docs_golds: yield doc, gold - def begin_training(self, get_gold_tuples=None, sgd=None, **cfg): + def begin_training(self, get_gold_tuples=None, sgd=None, component_cfg=None, **cfg): """Allocate models, pre-process training data and acquire a trainer and optimizer. Used as a contextmanager. get_gold_tuples (function): Function returning gold data + component_cfg (dict): Config parameters for specific components. **cfg: Config parameters. RETURNS: An optimizer """ @@ -543,10 +553,17 @@ class Language(object): if sgd is None: sgd = create_default_optimizer(Model.ops) self._optimizer = sgd + if component_cfg is None: + component_cfg = {} for name, proc in self.pipeline: if hasattr(proc, "begin_training"): + kwargs = component_cfg.get(name, {}) + kwargs.update(cfg) proc.begin_training( - get_gold_tuples, pipeline=self.pipeline, sgd=self._optimizer, **cfg + get_gold_tuples, + pipeline=self.pipeline, + sgd=self._optimizer, + **kwargs ) return self._optimizer @@ -574,20 +591,27 @@ class Language(object): proc._rehearsal_model = deepcopy(proc.model) return self._optimizer - def evaluate(self, docs_golds, verbose=False, batch_size=256): - scorer = Scorer() + def evaluate( + self, docs_golds, verbose=False, batch_size=256, scorer=None, component_cfg=None + ): + if scorer is None: + scorer = Scorer() docs, golds = zip(*docs_golds) docs = list(docs) golds = list(golds) for name, pipe in self.pipeline: + kwargs = component_cfg.get(name, {}) + kwargs.setdefault("batch_size", batch_size) if not hasattr(pipe, "pipe"): - docs = (pipe(doc) for doc in docs) + docs = (pipe(doc, **kwargs) for doc in docs) else: - docs = pipe.pipe(docs, batch_size=batch_size) + docs = pipe.pipe(docs, **kwargs) for doc, gold in zip(docs, golds): if verbose: print(doc) - scorer.score(doc, gold, verbose=verbose) + kwargs = component_cfg.get("scorer", {}) + kwargs.setdefault("verbose", verbose) + scorer.score(doc, gold, **kwargs) return scorer @contextmanager @@ -630,6 +654,7 @@ class Language(object): batch_size=1000, disable=[], cleanup=False, + component_cfg=None, ): """Process texts as a stream, and yield `Doc` objects in order. 
@@ -643,6 +668,8 @@ class Language(object): disable (list): Names of the pipeline components to disable. cleanup (bool): If True, unneeded strings are freed, to control memory use. Experimental. + component_cfg (dict): An optional dictionary with extra keyword arguments + for specific components. YIELDS (Doc): Documents in the order of the original text. EXAMPLE: @@ -655,20 +682,30 @@ class Language(object): texts = (tc[0] for tc in text_context1) contexts = (tc[1] for tc in text_context2) docs = self.pipe( - texts, n_threads=n_threads, batch_size=batch_size, disable=disable + texts, + n_threads=n_threads, + batch_size=batch_size, + disable=disable, + component_cfg=component_cfg, ) for doc, context in izip(docs, contexts): yield (doc, context) return docs = (self.make_doc(text) for text in texts) + if component_cfg is None: + component_cfg = {} for name, proc in self.pipeline: if name in disable: continue + kwargs = component_cfg.get(name, {}) + # Allow component_cfg to overwrite the top-level kwargs. + kwargs.setdefault("batch_size", batch_size) + kwargs.setdefault("n_threads", n_threads) if hasattr(proc, "pipe"): - docs = proc.pipe(docs, n_threads=n_threads, batch_size=batch_size) + docs = proc.pipe(docs, **kwargs) else: # Apply the function, but yield the doc - docs = _pipe(proc, docs) + docs = _pipe(proc, docs, kwargs) # Track weakrefs of "recent" documents, so that we can see when they # expire from memory. When they do, we know we don't need old strings. # This way, we avoid maintaining an unbounded growth in string entries @@ -861,7 +898,7 @@ class DisabledPipes(list): self[:] = [] -def _pipe(func, docs): +def _pipe(func, docs, kwargs): for doc in docs: - doc = func(doc) + doc = func(doc, **kwargs) yield doc diff --git a/website/docs/api/language.md b/website/docs/api/language.md index 34d14ec01..a8598815b 100644 --- a/website/docs/api/language.md +++ b/website/docs/api/language.md @@ -91,13 +91,14 @@ multiprocessing. > assert doc.is_parsed > ``` -| Name | Type | Description | -| ------------ | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `texts` | - | A sequence of unicode objects. | -| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | -| `batch_size` | int | The number of texts to buffer. | -| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | -| **YIELDS** | `Doc` | Documents in the order of the original text. | +| Name | Type | Description | +| -------------------------------------------- | ----- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `texts` | - | A sequence of unicode objects. | +| `as_tuples` | bool | If set to `True`, inputs should be a sequence of `(text, context)` tuples. Output will then be a sequence of `(doc, context)` tuples. Defaults to `False`. | +| `batch_size` | int | The number of texts to buffer. | +| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | +| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| **YIELDS** | `Doc` | Documents in the order of the original text. 
| ## Language.update {#update tag="method"} @@ -112,13 +113,14 @@ Update the models in the pipeline. > nlp.update([doc], [gold], drop=0.5, sgd=optimizer) > ``` -| Name | Type | Description | -| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `docs` | iterable | A batch of `Doc` objects or unicode. If unicode, a `Doc` object will be created from the text. | -| `golds` | iterable | A batch of `GoldParse` objects or dictionaries. Dictionaries will be used to create [`GoldParse`](/api/goldparse) objects. For the available keys and their usage, see [`GoldParse.__init__`](/api/goldparse#init). | -| `drop` | float | The dropout rate. | -| `sgd` | callable | An optimizer. | -| **RETURNS** | dict | Results from the update. | +| Name | Type | Description | +| -------------------------------------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `docs` | iterable | A batch of `Doc` objects or unicode. If unicode, a `Doc` object will be created from the text. | +| `golds` | iterable | A batch of `GoldParse` objects or dictionaries. Dictionaries will be used to create [`GoldParse`](/api/goldparse) objects. For the available keys and their usage, see [`GoldParse.__init__`](/api/goldparse#init). | +| `drop` | float | The dropout rate. | +| `sgd` | callable | An optimizer. | +| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| **RETURNS** | dict | Results from the update. | ## Language.begin_training {#begin_training tag="method"} @@ -130,11 +132,12 @@ Allocate models, pre-process training data and acquire an optimizer. > optimizer = nlp.begin_training(gold_tuples) > ``` -| Name | Type | Description | -| ------------- | -------- | ---------------------------- | -| `gold_tuples` | iterable | Gold-standard training data. | -| `**cfg` | - | Config parameters. | -| **RETURNS** | callable | An optimizer. | +| Name | Type | Description | +| -------------------------------------------- | -------- | ---------------------------------------------------------------------------- | +| `gold_tuples` | iterable | Gold-standard training data. | +| `component_cfg` 2.1 | dict | Config parameters for specific pipeline components, keyed by component name. | +| `**cfg` | - | Config parameters (sent to all components). | +| **RETURNS** | callable | An optimizer. | ## Language.use_params {#use_params tag="contextmanager, method"} From 7503e1e505dc70c93713d8848df3cbe1d5a6f44c Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 10 Mar 2019 23:50:00 +0100 Subject: [PATCH 4/7] Improve English tag map. 
Re #593, #3311 --- spacy/lang/en/tag_map.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index 7747c928d..449c3cd7a 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -16,7 +16,7 @@ TAG_MAP = { ":": {POS: PUNCT}, "$": {POS: SYM, "Other": {"SymType": "currency"}}, "#": {POS: SYM, "Other": {"SymType": "numbersign"}}, - "AFX": {POS: ADJ, "Hyph": "yes"}, + "AFX": {POS: X, "Hyph": "yes"}, "CC": {POS: CCONJ, "ConjType": "coor"}, "CD": {POS: NUM, "NumType": "card"}, "DT": {POS: DET}, @@ -34,10 +34,10 @@ TAG_MAP = { "NNP": {POS: PROPN, "NounType": "prop", "Number": "sing"}, "NNPS": {POS: PROPN, "NounType": "prop", "Number": "plur"}, "NNS": {POS: NOUN, "Number": "plur"}, - "PDT": {POS: ADJ, "AdjType": "pdt", "PronType": "prn"}, + "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"}, "POS": {POS: PART, "Poss": "yes"}, - "PRP": {POS: PRON, "PronType": "prs"}, - "PRP$": {POS: ADJ, "PronType": "prs", "Poss": "yes"}, + "PRP": {POS: DET, "PronType": "prs"}, + "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"}, "RB": {POS: ADV, "Degree": "pos"}, "RBR": {POS: ADV, "Degree": "comp"}, "RBS": {POS: ADV, "Degree": "sup"}, @@ -58,9 +58,9 @@ TAG_MAP = { "Number": "sing", "Person": 3, }, - "WDT": {POS: ADJ, "PronType": "int|rel"}, - "WP": {POS: NOUN, "PronType": "int|rel"}, - "WP$": {POS: ADJ, "Poss": "yes", "PronType": "int|rel"}, + "WDT": {POS: DET, "PronType": "int|rel"}, + "WP": {POS: PRON, "PronType": "int|rel"}, + "WP$": {POS: DET, "Poss": "yes", "PronType": "int|rel"}, "WRB": {POS: ADV, "PronType": "int|rel"}, "ADD": {POS: X}, "NFP": {POS: PUNCT}, From 8f45ff3dc2e3e4ac51abe72dfe67dc7b1aac9e7c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Mon, 11 Mar 2019 00:47:41 +0100 Subject: [PATCH 5/7] Adjust formatting [ci skip] --- spacy/tests/regression/test_issue2001-2500.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index 81b7afa87..ed1c89671 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -12,12 +12,14 @@ from spacy.lang.en import English from ..util import add_vecs_to_vocab, get_doc -@pytest.mark.xfail( - reason="The dot is now properly split off, but the prefix/suffix rules are not applied again afterwards." - "This means that the quote will still be attached to the remaining token." -) +@pytest.mark.xfail def test_issue2070(): - """Test that checks that a dot followed by a quote is handled appropriately.""" + """Test that checks that a dot followed by a quote is handled + appropriately. + """ + # Problem: The dot is now properly split off, but the prefix/suffix rules + # are not applied again afterwards. This means that the quote will still be + # attached to the remaining token. 
nlp = English() doc = nlp('First sentence."A quoted sentence" he said ...') assert len(doc) == 11 From 5d25ee52fb514720746a8f285b0d48b016d3d8d3 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 01:06:02 +0100 Subject: [PATCH 6/7] Fix English tag map --- spacy/lang/en/tag_map.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py index 449c3cd7a..67f43c53c 100644 --- a/spacy/lang/en/tag_map.py +++ b/spacy/lang/en/tag_map.py @@ -36,7 +36,7 @@ TAG_MAP = { "NNS": {POS: NOUN, "Number": "plur"}, "PDT": {POS: DET, "AdjType": "pdt", "PronType": "prn"}, "POS": {POS: PART, "Poss": "yes"}, - "PRP": {POS: DET, "PronType": "prs"}, + "PRP": {POS: PRON, "PronType": "prs"}, "PRP$": {POS: DET, "PronType": "prs", "Poss": "yes"}, "RB": {POS: ADV, "Degree": "pos"}, "RBR": {POS: ADV, "Degree": "comp"}, From 80b94313b6bf71516fe68e4ffdd02c1015f4436b Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Mon, 11 Mar 2019 01:31:21 +0100 Subject: [PATCH 7/7] =?UTF-8?q?=F0=9F=92=AB=20Fix=20interaction=20of=20lem?= =?UTF-8?q?matizer=20and=20tokenizer=20exceptions=20(#3388)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes #2203. Closes #3268. Lemmas set from outside the `Morphology` class were being overwritten. The result was especially confusing when deserialising, as it meant some lemmas could change when storing and retrieving a `Doc` object. This PR applies two fixes: 1) When we go to set the lemma in the `Morphology` class, first check whether a lemma is already set. If so, don't overwrite. 2) When we load with `doc.from_array()`, take care to apply the `TAG` field first. This allows other fields to overwrite the `TAG` implied properties, if they're provided explicitly (e.g. the `LEMMA`). ## Checklist - [x] I have submitted the spaCy Contributor Agreement. - [x] I ran the tests, and all new and existing tests passed. - [x] My changes don't require a change to the documentation, or if they do, I've added all required information. 
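A minimal sketch of the behaviour fix (1) is meant to guarantee (illustrative only; the token, tag and expected lemma below come from the English tokenizer exceptions, and a blank `English()` pipeline is assumed):

```python
from spacy.lang.en import English

nlp = English()
doc = nlp.make_doc("I'll survive")
# The tokenizer exception for "I'll" sets the lemma of "I" to "-PRON-".
# Assigning a tag afterwards used to overwrite that lemma; with this change,
# an already-set lemma is left alone.
doc[0].tag_ = "PRP"
assert doc[0].lemma_ == "-PRON-"
```

Fix (2) is exercised by the regression test added below (`test_issue2203`).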
--- spacy/morphology.pyx | 3 ++- spacy/tests/regression/test_issue2001-2500.py | 21 +++++++++++++++++++ spacy/tokens/doc.pyx | 15 ++++++------- 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx index bd821d76f..ed1ee9a7e 100644 --- a/spacy/morphology.pyx +++ b/spacy/morphology.pyx @@ -110,7 +110,8 @@ cdef class Morphology: analysis.lemma = self.lemmatize(analysis.tag.pos, token.lex.orth, self.tag_map.get(tag_str, {})) self._cache.set(tag_id, token.lex.orth, analysis) - token.lemma = analysis.lemma + if token.lemma == 0: + token.lemma = analysis.lemma token.pos = analysis.tag.pos token.tag = analysis.tag.name token.morph = analysis.tag.morph diff --git a/spacy/tests/regression/test_issue2001-2500.py b/spacy/tests/regression/test_issue2001-2500.py index ed1c89671..df5d76641 100644 --- a/spacy/tests/regression/test_issue2001-2500.py +++ b/spacy/tests/regression/test_issue2001-2500.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import pytest +import numpy from spacy.tokens import Doc from spacy.displacy import render from spacy.gold import iob_to_biluo @@ -39,6 +40,26 @@ def test_issue2179(): assert nlp2.get_pipe("ner").labels == ("CITIZENSHIP",) +def test_issue2203(en_vocab): + """Test that lemmas are set correctly in doc.from_array.""" + words = ["I", "'ll", "survive"] + tags = ["PRP", "MD", "VB"] + lemmas = ["-PRON-", "will", "survive"] + tag_ids = [en_vocab.strings.add(tag) for tag in tags] + lemma_ids = [en_vocab.strings.add(lemma) for lemma in lemmas] + doc = Doc(en_vocab, words=words) + # Work around lemma corrpution problem and set lemmas after tags + doc.from_array("TAG", numpy.array(tag_ids, dtype="uint64")) + doc.from_array("LEMMA", numpy.array(lemma_ids, dtype="uint64")) + assert [t.tag_ for t in doc] == tags + assert [t.lemma_ for t in doc] == lemmas + # We need to serialize both tag and lemma, since this is what causes the bug + doc_array = doc.to_array(["TAG", "LEMMA"]) + new_doc = Doc(doc.vocab, words=words).from_array(["TAG", "LEMMA"], doc_array) + assert [t.tag_ for t in new_doc] == tags + assert [t.lemma_ for t in new_doc] == lemmas + + def test_issue2219(en_vocab): vectors = [("a", [1, 2, 3]), ("letter", [4, 5, 6])] add_vecs_to_vocab(en_vocab, vectors) diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 483fa6a10..4d3ed084a 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -763,17 +763,18 @@ cdef class Doc: attr_ids[i] = attr_id if len(array.shape) == 1: array = array.reshape((array.size, 1)) + # Do TAG first. This lets subsequent loop override stuff like POS, LEMMA + if TAG in attrs: + col = attrs.index(TAG) + for i in range(length): + if array[i, col] != 0: + self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) # Now load the data for i in range(self.length): token = &self.c[i] for j in range(n_attrs): - Token.set_struct_attr(token, attr_ids[j], array[i, j]) - # Auxiliary loading logic - for col, attr_id in enumerate(attrs): - if attr_id == TAG: - for i in range(length): - if array[i, col] != 0: - self.vocab.morphology.assign_tag(&tokens[i], array[i, col]) + if attr_ids[j] != TAG: + Token.set_struct_attr(token, attr_ids[j], array[i, j]) # Set flags self.is_parsed = bool(self.is_parsed or HEAD in attrs or DEP in attrs) self.is_tagged = bool(self.is_tagged or TAG in attrs or POS in attrs)
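A short usage sketch for the `component_cfg` argument added in patch 3/7 (the component name and per-component settings below are illustrative, and a trained pipeline such as `en_core_web_sm` is assumed to be installed):

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
texts = ["This is a text.", "This is another one."]

# Settings are keyed by component name and passed through to that
# component's call, e.g. its pipe() keyword arguments. Per the change in
# Language.pipe, a per-component "batch_size" overrides the top-level one.
docs = list(
    nlp.pipe(
        texts,
        batch_size=1000,
        component_cfg={"parser": {"batch_size": 32}},
    )
)
```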