From 56c17973aa2526966eb93e2f26fc69c351dacf05 Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Wed, 5 Aug 2020 23:53:21 +0200 Subject: [PATCH 1/3] Use "raise ... from" in custom errors for better tracebacks --- spacy/cli/train.py | 6 +++--- spacy/gold/example.pyx | 6 +++--- spacy/lang/ja/__init__.py | 4 ++-- spacy/lang/ko/__init__.py | 2 +- spacy/lang/ru/lemmatizer.py | 2 +- spacy/lang/th/__init__.py | 2 +- spacy/lang/uk/lemmatizer.py | 2 +- spacy/lang/vi/__init__.py | 2 +- spacy/lang/zh/__init__.py | 14 +++++++------- spacy/language.py | 2 +- spacy/matcher/matcher.pyx | 2 +- spacy/pipeline/attributeruler.py | 2 +- spacy/pipeline/entity_linker.py | 6 +++--- spacy/pipeline/morphologizer.pyx | 4 ++-- spacy/pipeline/multitask.pyx | 2 +- spacy/pipeline/pipe.pyx | 4 ++-- spacy/pipeline/senter.pyx | 4 ++-- spacy/pipeline/tagger.pyx | 10 +++++----- spacy/pipeline/textcat.py | 6 +++--- spacy/pipeline/transition_parser.pyx | 4 ++-- spacy/tokens/doc.pyx | 2 +- spacy/util.py | 10 +++++----- 22 files changed, 49 insertions(+), 49 deletions(-) diff --git a/spacy/cli/train.py b/spacy/cli/train.py index c5c6e7252..32d22d1bc 100644 --- a/spacy/cli/train.py +++ b/spacy/cli/train.py @@ -211,7 +211,7 @@ def create_evaluation_callback( except KeyError as e: keys = list(scores.keys()) err = Errors.E983.format(dict="score_weights", key=str(e), keys=keys) - raise KeyError(err) + raise KeyError(err) from None return weighted_score, scores return evaluate @@ -369,7 +369,7 @@ def setup_printer( Errors.E983.format( dict="scores (losses)", key=str(e), keys=list(info["losses"].keys()) ) - ) + ) from None try: scores = [ @@ -382,7 +382,7 @@ def setup_printer( key=str(e), keys=list(info["other_scores"].keys()), ) - ) + ) from None data = ( [info["epoch"], info["step"]] + losses diff --git a/spacy/gold/example.pyx b/spacy/gold/example.pyx index f90d98603..6093d2346 100644 --- a/spacy/gold/example.pyx +++ b/spacy/gold/example.pyx @@ -139,7 +139,7 @@ cdef class Example: def get_aligned_spans_y2x(self, y_spans): return self._get_aligned_spans(self.x, y_spans, self.alignment.y2x) - + def _get_aligned_spans(self, doc, spans, align): seen = set() output = [] @@ -207,7 +207,7 @@ cdef class Example: sent_starts and return a list of the new Examples""" if not self.reference.is_sentenced: return [self] - + align = self.alignment.y2x seen_indices = set() output = [] @@ -267,7 +267,7 @@ def _annot2array(vocab, tok_annot, doc_annot): values.append([vocab.strings.add(v) for v in value]) except TypeError: types= set([type(v) for v in value]) - raise TypeError(Errors.E969.format(field=key, types=types)) + raise TypeError(Errors.E969.format(field=key, types=types)) from None array = numpy.asarray(values, dtype="uint64") return attrs, array.T diff --git a/spacy/lang/ja/__init__.py b/spacy/lang/ja/__init__.py index d435afe12..900db4e4c 100644 --- a/spacy/lang/ja/__init__.py +++ b/spacy/lang/ja/__init__.py @@ -200,7 +200,7 @@ def try_sudachi_import(split_mode="A"): "(https://github.com/WorksApplications/SudachiPy). " "Install with `pip install sudachipy sudachidict_core` or " "install spaCy with `pip install spacy[ja]`." 
- ) + ) from None def resolve_pos(orth, tag, next_tag): @@ -263,7 +263,7 @@ def get_dtokens_and_spaces(dtokens, text, gap_tag="空白"): try: word_start = text[text_pos:].index(word) except ValueError: - raise ValueError(Errors.E194.format(text=text, words=words)) + raise ValueError(Errors.E194.format(text=text, words=words)) from None # space token if word_start > 0: diff --git a/spacy/lang/ko/__init__.py b/spacy/lang/ko/__init__.py index 6197ab927..f2954f461 100644 --- a/spacy/lang/ko/__init__.py +++ b/spacy/lang/ko/__init__.py @@ -85,7 +85,7 @@ def try_mecab_import() -> None: "Korean support requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), " "[mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), " "and [natto-py](https://github.com/buruzaemon/natto-py)" - ) + ) from None def check_spaces(text, tokens): diff --git a/spacy/lang/ru/lemmatizer.py b/spacy/lang/ru/lemmatizer.py index a9a7ad80f..28767348d 100644 --- a/spacy/lang/ru/lemmatizer.py +++ b/spacy/lang/ru/lemmatizer.py @@ -21,7 +21,7 @@ class RussianLemmatizer(Lemmatizer): 'try to fix it with "pip install pymorphy2==0.8" ' 'or "pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' "if you need Ukrainian too" - ) + ) from None if RussianLemmatizer._morph is None: RussianLemmatizer._morph = MorphAnalyzer() diff --git a/spacy/lang/th/__init__.py b/spacy/lang/th/__init__.py index 989c22a42..a35ae987f 100644 --- a/spacy/lang/th/__init__.py +++ b/spacy/lang/th/__init__.py @@ -31,7 +31,7 @@ class ThaiTokenizer(DummyTokenizer): raise ImportError( "The Thai tokenizer requires the PyThaiNLP library: " "https://github.com/PyThaiNLP/pythainlp" - ) + ) from None self.word_tokenize = word_tokenize self.vocab = nlp.vocab diff --git a/spacy/lang/uk/lemmatizer.py b/spacy/lang/uk/lemmatizer.py index de2d0c170..cf89d1a12 100644 --- a/spacy/lang/uk/lemmatizer.py +++ b/spacy/lang/uk/lemmatizer.py @@ -23,7 +23,7 @@ class UkrainianLemmatizer(Lemmatizer): "The Ukrainian lemmatizer requires the pymorphy2 library and " 'dictionaries: try to fix it with "pip uninstall pymorphy2" and' '"pip install git+https://github.com/kmike/pymorphy2.git pymorphy2-dicts-uk"' - ) + ) from None def __call__( self, string: str, univ_pos: str, morphology: Optional[dict] = None diff --git a/spacy/lang/vi/__init__.py b/spacy/lang/vi/__init__.py index 2b06d33f7..1db762adb 100644 --- a/spacy/lang/vi/__init__.py +++ b/spacy/lang/vi/__init__.py @@ -38,7 +38,7 @@ class VietnameseTokenizer(DummyTokenizer): "Pyvi not installed. Either set use_pyvi = False, " "or install it https://pypi.python.org/pypi/pyvi" ) - raise ImportError(msg) + raise ImportError(msg) from None def __call__(self, text: str) -> Doc: if self.use_pyvi: diff --git a/spacy/lang/zh/__init__.py b/spacy/lang/zh/__init__.py index fe0613c80..5d3bd2a96 100644 --- a/spacy/lang/zh/__init__.py +++ b/spacy/lang/zh/__init__.py @@ -129,7 +129,7 @@ class ChineseTokenizer(DummyTokenizer): "pkuseg not installed: unable to reset pkuseg " "user dict. Please " + _PKUSEG_INSTALL_MSG ) - raise ImportError(msg) + raise ImportError(msg) from None for word in words: self.pkuseg_seg.preprocesser.insert(word.strip(), "") else: @@ -208,7 +208,7 @@ class ChineseTokenizer(DummyTokenizer): raise ImportError( "pkuseg not installed. 
To use this model, " + _PKUSEG_INSTALL_MSG - ) + ) from None self.pkuseg_seg = pkuseg.pkuseg(str(tempdir)) if pkuseg_data["processors_data"]: processors_data = pkuseg_data["processors_data"] @@ -258,7 +258,7 @@ class ChineseTokenizer(DummyTokenizer): raise ImportError( "pkuseg not installed. To use this model, " + _PKUSEG_INSTALL_MSG - ) + ) from None if path.exists(): self.pkuseg_seg = pkuseg.pkuseg(path) @@ -267,7 +267,7 @@ class ChineseTokenizer(DummyTokenizer): import pkuseg except ImportError: if self.segmenter == Segmenter.pkuseg: - raise ImportError(self._pkuseg_install_msg) + raise ImportError(self._pkuseg_install_msg) from None if self.segmenter == Segmenter.pkuseg: data = srsly.read_msgpack(path) (user_dict, do_process, common_words, other_words) = data @@ -311,7 +311,7 @@ def try_jieba_import(segmenter: str) -> None: "Jieba not installed. To use jieba, install it with `pip " " install jieba` or from https://github.com/fxsjy/jieba" ) - raise ImportError(msg) + raise ImportError(msg) from None def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) -> None: @@ -332,11 +332,11 @@ def try_pkuseg_import(segmenter: str, pkuseg_model: str, pkuseg_user_dict: str) except ImportError: if segmenter == Segmenter.pkuseg: msg = "pkuseg not installed. To use pkuseg, " + _PKUSEG_INSTALL_MSG - raise ImportError(msg) + raise ImportError(msg) from None except FileNotFoundError: if segmenter == Segmenter.pkuseg: msg = "Unable to load pkuseg model from: " + pkuseg_model - raise FileNotFoundError(msg) + raise FileNotFoundError(msg) from None def _get_pkuseg_trie_data(node, path=""): diff --git a/spacy/language.py b/spacy/language.py index e9d7e9eb6..9018af73c 100644 --- a/spacy/language.py +++ b/spacy/language.py @@ -869,7 +869,7 @@ class Language: try: doc = proc(doc, **component_cfg.get(name, {})) except KeyError: - raise ValueError(Errors.E109.format(name=name)) + raise ValueError(Errors.E109.format(name=name)) from None if doc is None: raise ValueError(Errors.E005.format(name=name)) return doc diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx index 325c81369..a0f3f1655 100644 --- a/spacy/matcher/matcher.pyx +++ b/spacy/matcher/matcher.pyx @@ -131,7 +131,7 @@ cdef class Matcher: for attr, _ in spec[1]: self._seen_attrs.add(attr) except OverflowError, AttributeError: - raise ValueError(Errors.E154.format()) + raise ValueError(Errors.E154.format()) from None self._patterns.setdefault(key, []) self._callbacks[key] = on_match self._filter[key] = greedy diff --git a/spacy/pipeline/attributeruler.py b/spacy/pipeline/attributeruler.py index 1f1e63959..d5abf7863 100644 --- a/spacy/pipeline/attributeruler.py +++ b/spacy/pipeline/attributeruler.py @@ -85,7 +85,7 @@ class AttributeRuler(Pipe): span=[t.text for t in span], index=index, ) - ) + ) from None set_token_attrs(token, attrs) return doc diff --git a/spacy/pipeline/entity_linker.py b/spacy/pipeline/entity_linker.py index 923d925dc..d922db1ad 100644 --- a/spacy/pipeline/entity_linker.py +++ b/spacy/pipeline/entity_linker.py @@ -195,7 +195,7 @@ class EntityLinker(Pipe): types = set([type(eg) for eg in examples]) raise TypeError( Errors.E978.format(name="EntityLinker", method="update", types=types) - ) + ) from None if set_annotations: # This seems simpler than other ways to get that exact output -- but # it does run the model twice :( @@ -213,7 +213,7 @@ class EntityLinker(Pipe): sent_index = sentences.index(ent.sent) except AttributeError: # Catch the exception when ent.sent is None and provide a user-friendly 
warning - raise RuntimeError(Errors.E030) + raise RuntimeError(Errors.E030) from None # get n previous sentences, if there are any start_sentence = max(0, sent_index - self.n_sents) # get n posterior sentences, or as many < n as there are @@ -439,7 +439,7 @@ class EntityLinker(Pipe): try: self.model.from_bytes(p.open("rb").read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None def load_kb(p): self.kb = KnowledgeBase(entity_vector_length=self.cfg["entity_width"]) diff --git a/spacy/pipeline/morphologizer.pyx b/spacy/pipeline/morphologizer.pyx index 18673f85b..06c9f9a25 100644 --- a/spacy/pipeline/morphologizer.pyx +++ b/spacy/pipeline/morphologizer.pyx @@ -262,7 +262,7 @@ class Morphologizer(Tagger): try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), @@ -301,7 +301,7 @@ class Morphologizer(Tagger): try: self.model.from_bytes(file_.read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda p: self.vocab.from_disk(p), diff --git a/spacy/pipeline/multitask.pyx b/spacy/pipeline/multitask.pyx index d85030adb..4b582045d 100644 --- a/spacy/pipeline/multitask.pyx +++ b/spacy/pipeline/multitask.pyx @@ -211,7 +211,7 @@ class ClozeMultitask(Pipe): predictions, bp_predictions = self.model.begin_update([eg.predicted for eg in examples]) except AttributeError: types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="ClozeMultitask", method="rehearse", types=types)) from None loss, d_predictions = self.get_loss(examples, self.vocab.vectors.data, predictions) bp_predictions(d_predictions) if sgd is not None: diff --git a/spacy/pipeline/pipe.pyx b/spacy/pipeline/pipe.pyx index 1a94905a2..bed4cdd16 100644 --- a/spacy/pipeline/pipe.pyx +++ b/spacy/pipeline/pipe.pyx @@ -204,7 +204,7 @@ cdef class Pipe: try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = {} if hasattr(self, "vocab"): @@ -242,7 +242,7 @@ cdef class Pipe: try: self.model.from_bytes(p.open("rb").read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = {} deserialize["vocab"] = lambda p: self.vocab.from_disk(p) diff --git a/spacy/pipeline/senter.pyx b/spacy/pipeline/senter.pyx index 620a8557e..3147cc902 100644 --- a/spacy/pipeline/senter.pyx +++ b/spacy/pipeline/senter.pyx @@ -183,7 +183,7 @@ class SentenceRecognizer(Tagger): try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda b: self.vocab.from_bytes(b), @@ -222,7 +222,7 @@ class SentenceRecognizer(Tagger): try: self.model.from_bytes(file_.read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None deserialize = { "vocab": lambda p: self.vocab.from_disk(p), diff --git a/spacy/pipeline/tagger.pyx b/spacy/pipeline/tagger.pyx index 43f5b02cb..da1b3d3aa 100644 --- a/spacy/pipeline/tagger.pyx +++ b/spacy/pipeline/tagger.pyx @@ -195,7 +195,7 @@ class Tagger(Pipe): return except AttributeError: types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) + raise 
TypeError(Errors.E978.format(name="Tagger", method="update", types=types)) from None set_dropout_rate(self.model, drop) tag_scores, bp_tag_scores = self.model.begin_update( [eg.predicted for eg in examples]) @@ -232,7 +232,7 @@ class Tagger(Pipe): docs = [eg.predicted for eg in examples] except AttributeError: types = set([type(eg) for eg in examples]) - raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) + raise TypeError(Errors.E978.format(name="Tagger", method="rehearse", types=types)) from None if self._rehearsal_model is None: return if not any(len(doc) for doc in docs): @@ -292,7 +292,7 @@ class Tagger(Pipe): try: y = example.y except AttributeError: - raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) + raise TypeError(Errors.E978.format(name="Tagger", method="begin_training", types=type(example))) from None for token in y: tag = token.tag_ if tag in orig_tag_map: @@ -400,7 +400,7 @@ class Tagger(Pipe): try: self.model.from_bytes(b) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None def load_tag_map(b): tag_map = srsly.msgpack_loads(b) @@ -456,7 +456,7 @@ class Tagger(Pipe): try: self.model.from_bytes(file_.read()) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None def load_tag_map(p): tag_map = srsly.read_msgpack(p) diff --git a/spacy/pipeline/textcat.py b/spacy/pipeline/textcat.py index bc16e790f..8b46082cb 100644 --- a/spacy/pipeline/textcat.py +++ b/spacy/pipeline/textcat.py @@ -203,7 +203,7 @@ class TextCategorizer(Pipe): types = set([type(eg) for eg in examples]) raise TypeError( Errors.E978.format(name="TextCategorizer", method="update", types=types) - ) + ) from None set_dropout_rate(self.model, drop) scores, bp_scores = self.model.begin_update([eg.predicted for eg in examples]) loss, d_scores = self.get_loss(examples, scores) @@ -250,7 +250,7 @@ class TextCategorizer(Pipe): err = Errors.E978.format( name="TextCategorizer", method="rehearse", types=types ) - raise TypeError(err) + raise TypeError(err) from None if not any(len(doc) for doc in docs): # Handle cases where there are no tokens in any docs. 
return losses @@ -351,7 +351,7 @@ class TextCategorizer(Pipe): err = Errors.E978.format( name="TextCategorizer", method="update", types=type(example) ) - raise TypeError(err) + raise TypeError(err) from None for cat in y.cats: self.add_label(cat) self.require_labels() diff --git a/spacy/pipeline/transition_parser.pyx b/spacy/pipeline/transition_parser.pyx index b14a55cb4..9829e764d 100644 --- a/spacy/pipeline/transition_parser.pyx +++ b/spacy/pipeline/transition_parser.pyx @@ -473,7 +473,7 @@ cdef class Parser(Pipe): self._resize() self.model.from_bytes(bytes_data) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None return self def to_bytes(self, exclude=tuple()): @@ -498,7 +498,7 @@ cdef class Parser(Pipe): try: self.model.from_bytes(msg['model']) except AttributeError: - raise ValueError(Errors.E149) + raise ValueError(Errors.E149) from None return self def _init_gold_batch(self, examples, min_length=5, max_length=500): diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx index 2fcc0983b..935af88d1 100644 --- a/spacy/tokens/doc.pyx +++ b/spacy/tokens/doc.pyx @@ -699,7 +699,7 @@ cdef class Doc: for id_ in py_attr_ids] except KeyError as msg: keys = [k for k in IDS.keys() if not k.startswith("FLAG")] - raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) + raise KeyError(Errors.E983.format(dict="IDS", key=msg, keys=keys)) from None # Make an array from the attributes --- otherwise our inner loop is # Python dict iteration. cdef np.ndarray attr_ids = numpy.asarray(py_attr_ids, dtype="i") diff --git a/spacy/util.py b/spacy/util.py index 52073097e..05f8ef017 100644 --- a/spacy/util.py +++ b/spacy/util.py @@ -138,7 +138,7 @@ def get_lang_class(lang: str) -> "Language": try: module = importlib.import_module(f".lang.{lang}", "spacy") except ImportError as err: - raise ImportError(Errors.E048.format(lang=lang, err=err)) + raise ImportError(Errors.E048.format(lang=lang, err=err)) from err set_lang_class(lang, getattr(module, module.__all__[0])) return registry.languages.get(lang) @@ -502,7 +502,7 @@ def run_command(command: Union[str, List[str]]) -> None: except FileNotFoundError: raise FileNotFoundError( Errors.E970.format(str_command=" ".join(command), tool=command[0]) - ) + ) from None if status != 0: sys.exit(status) @@ -891,7 +891,7 @@ def get_words_and_spaces( try: word_start = text[text_pos:].index(word) except ValueError: - raise ValueError(Errors.E194.format(text=text, words=words)) + raise ValueError(Errors.E194.format(text=text, words=words)) from None if word_start > 0: text_words.append(text[text_pos : text_pos + word_start]) text_spaces.append(False) @@ -918,7 +918,7 @@ def copy_config(config: Union[Dict[str, Any], Config]) -> Config: try: return Config(config).copy() except ValueError: - raise ValueError(Errors.E961.format(config=config)) + raise ValueError(Errors.E961.format(config=config)) from None def deep_merge_configs( @@ -1002,7 +1002,7 @@ def dot_to_object(config: Config, section: str): try: component = component[item] except (KeyError, TypeError): - raise KeyError(Errors.E952.format(name=section)) + raise KeyError(Errors.E952.format(name=section)) from None return component From 06e80d95cdf6ae2fbfcf8f6ef20c882559190c7f Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 6 Aug 2020 00:28:14 +0200 Subject: [PATCH 2/3] Sync develop with nightly docs state (#5883) Co-authored-by: svlandeg --- spacy/cli/debug_model.py | 2 +- website/docs/api/cli.md | 170 ++++++++++++++++++++-- website/docs/api/data-formats.md | 
119 ++++++++++++++- website/docs/api/entitylinker.md | 45 ++---- website/docs/api/example.md | 58 +++++++- website/docs/api/top-level.md | 53 ------- website/docs/usage/linguistic-features.md | 41 +++--- 7 files changed, 359 insertions(+), 129 deletions(-) diff --git a/spacy/cli/debug_model.py b/spacy/cli/debug_model.py index 7c6c76a34..cc6cb98ea 100644 --- a/spacy/cli/debug_model.py +++ b/spacy/cli/debug_model.py @@ -54,7 +54,7 @@ def debug_model_cli( nlp, config = util.load_model_from_config(cfg) except ValueError as e: msg.fail(str(e), exits=1) - seed = config.get("training", {}).get("seed", None) + seed = config["pretraining"]["seed"] if seed is not None: msg.info(f"Fixing random seed: {seed}") fix_random_seed(seed) diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md index 0ea67747e..abe050661 100644 --- a/website/docs/api/cli.md +++ b/website/docs/api/cli.md @@ -221,17 +221,21 @@ config from being resolved. This means that you may not see all validation errors at once and some issues are only shown once previous errors have been fixed. +Instead of specifying all required settings in the config file, you can rely on +an auto-fill functionality that uses spaCy's built-in defaults. The resulting +full config can be written to file and used in downstream training tasks. + ```bash -$ python -m spacy debug config [config_path] [--code] [overrides] +$ python -m spacy debug config [config_path] [--code_path] [--output] [--auto_fill] [--diff] [overrides] ``` -> #### Example +> #### Example 1 > > ```bash > $ python -m spacy debug config ./config.cfg > ``` - + ``` ✘ Config validation error @@ -250,12 +254,30 @@ training -> width extra fields not permitted -| Argument | Type | Description | -| -------------- | ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `config_path` | positional | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | -| `--code`, `-c` | option | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | -| `--help`, `-h` | flag | Show help message and available arguments. | -| overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | +> #### Example 2 +> +> ```bash +> $ python -m spacy debug config ./minimal_config.cfg -F -o ./filled_config.cfg +> ``` + + + +``` +✔ Auto-filled config is valid +✔ Saved updated config to ./filled_config.cfg +``` + + + +| Argument | Type | Default | Description | +| --------------------- | ---------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `config_path` | positional | - | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `--code_path`, `-c` | option | `None` | Path to Python file with additional code to be imported. Allows [registering custom functions](/usage/training#custom-models) for new architectures. | +| `--auto_fill`, `-F` | option | `False` | Whether or not to auto-fill the config with built-in defaults if possible. If `False`, the provided config needs to be complete. 
| +| `--output_path`, `-o` | option | `None` | Output path where the filled config can be stored. Use '-' for standard output. | +| `--diff`, `-D` | option | `False` | Show a visual diff if config was auto-filled. | +| `--help`, `-h` | flag | `False` | Show help message and available arguments. | +| overrides | | `None` | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | ### debug data {#debug-data} @@ -433,7 +455,135 @@ will not be available. | `--help`, `-h` | flag | Show help message and available arguments. | | overrides | | Config parameters to override. Should be options starting with `--` that correspond to the config section and value to override, e.g. `--training.use_gpu 1`. | - + + +### debug model {#debug-model} + +Debug a Thinc [`Model`](https://thinc.ai/docs/api-model) by running it on a +sample text and checking how it updates its internal weights and parameters. + +```bash +$ python -m spacy debug model [config_path] [component] [--layers] [-DIM] [-PAR] [-GRAD] [-ATTR] [-P0] [-P1] [-P2] [P3] [--gpu_id] +``` + +> #### Example 1 +> +> ```bash +> $ python -m spacy debug model ./config.cfg tagger -P0 +> ``` + + + +``` +ℹ Using CPU +ℹ Fixing random seed: 0 +ℹ Analysing model with ID 62 + +========================== STEP 0 - before training ========================== +ℹ Layer 0: model ID 62: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list>>with_array-residual>>residual>>residual>>residual>>with_array-softmax' +ℹ Layer 1: model ID 59: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list>>with_array-residual>>residual>>residual>>residual' +ℹ Layer 2: model ID 61: 'with_array-softmax' +ℹ Layer 3: model ID 24: +'extract_features>>list2ragged>>with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed>>with_array-maxout>>layernorm>>dropout>>ragged2list' +ℹ Layer 4: model ID 58: 'with_array-residual>>residual>>residual>>residual' +ℹ Layer 5: model ID 60: 'softmax' +ℹ Layer 6: model ID 13: 'extract_features' +ℹ Layer 7: model ID 14: 'list2ragged' +ℹ Layer 8: model ID 16: +'with_array-ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed' +ℹ Layer 9: model ID 22: 'with_array-maxout>>layernorm>>dropout' +ℹ Layer 10: model ID 23: 'ragged2list' +ℹ Layer 11: model ID 57: 'residual>>residual>>residual>>residual' +ℹ Layer 12: model ID 15: +'ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed|ints-getitem>>hashembed' +ℹ Layer 13: model ID 21: 'maxout>>layernorm>>dropout' +ℹ Layer 14: model ID 32: 'residual' +ℹ Layer 15: model ID 40: 'residual' +ℹ Layer 16: model ID 48: 'residual' +ℹ Layer 17: model ID 56: 'residual' +ℹ Layer 18: model ID 3: 'ints-getitem>>hashembed' +ℹ Layer 19: model ID 6: 'ints-getitem>>hashembed' +ℹ Layer 20: model ID 9: 'ints-getitem>>hashembed' +... +``` + + + +In this example log, we just print the name of each layer after creation of the +model ("Step 0"), which helps us to understand the internal structure of the +Neural Network, and to focus on specific layers that we want to inspect further +(see next example). 
+ +> #### Example 2 +> +> ```bash +> $ python -m spacy debug model ./config.cfg tagger -l "5,15" -DIM -PAR -P0 -P1 -P2 +> ``` + + + +``` +ℹ Using CPU +ℹ Fixing random seed: 0 +ℹ Analysing model with ID 62 + +========================= STEP 0 - before training ========================= +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: None +ℹ - dim nI: 96 +ℹ - param W: None +ℹ - param b: None +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: None +ℹ - dim nI: None + +======================= STEP 1 - after initialization ======================= +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: 4 +ℹ - dim nI: 96 +ℹ - param W: (4, 96) - sample: [0. 0. 0. 0. 0.] +ℹ - param b: (4,) - sample: [0. 0. 0. 0.] +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: 96 +ℹ - dim nI: None + +========================== STEP 2 - after training ========================== +ℹ Layer 5: model ID 60: 'softmax' +ℹ - dim nO: 4 +ℹ - dim nI: 96 +ℹ - param W: (4, 96) - sample: [ 0.00283958 -0.00294119 0.00268396 -0.00296219 +-0.00297141] +ℹ - param b: (4,) - sample: [0.00300002 0.00300002 0.00300002 0.00300002] +ℹ Layer 15: model ID 40: 'residual' +ℹ - dim nO: 96 +ℹ - dim nI: None +``` + + + +In this example log, we see how initialization of the model (Step 1) propagates +the correct values for the `nI` (input) and `nO` (output) dimensions of the +various layers. In the `softmax` layer, this step also defines the `W` matrix as +an all-zero matrix determined by the `nO` and `nI` dimensions. After a first +training step (Step 2), this matrix has clearly updated its values through the +training feedback loop. + +| Argument | Type | Default | Description | +| ----------------------- | ---------- | ------- | ---------------------------------------------------------------------------------------------------- | +| `config_path` | positional | | Path to [training config](/api/data-formats#config) file containing all settings and hyperparameters. | +| `component` | positional | | Name of the pipeline component of which the model should be analysed. | +| `--layers`, `-l` | option | | Comma-separated names of layer IDs to print. | +| `--dimensions`, `-DIM` | option | `False` | Show dimensions of each layer. | +| `--parameters`, `-PAR` | option | `False` | Show parameters of each layer. | +| `--gradients`, `-GRAD` | option | `False` | Show gradients of each layer. | +| `--attributes`, `-ATTR` | option | `False` | Show attributes of each layer. | +| `--print-step0`, `-P0` | option | `False` | Print model before training. | +| `--print-step1`, `-P1` | option | `False` | Print model after initialization. | +| `--print-step2`, `-P2` | option | `False` | Print model after training. | +| `--print-step3`, `-P3` | option | `False` | Print final predictions. | +| `--help`, `-h` | flag | | Show help message and available arguments. | ## Train {#train} diff --git a/website/docs/api/data-formats.md b/website/docs/api/data-formats.md index 10fef6ba6..210e5d47d 100644 --- a/website/docs/api/data-formats.md +++ b/website/docs/api/data-formats.md @@ -28,7 +28,7 @@ spaCy's training format. To convert one or more existing `Doc` objects to spaCy's JSON format, you can use the [`gold.docs_to_json`](/api/top-level#docs_to_json) helper. -> #### Annotating entities +> #### Annotating entities {#biluo} > > Named entities are provided in the > [BILUO](/usage/linguistic-features#accessing-ner) notation. 
Tokens outside an @@ -75,6 +75,123 @@ from the English Wall Street Journal portion of the Penn Treebank: https://github.com/explosion/spaCy/tree/master/examples/training/training-data.json ``` +### Annotations in dictionary format {#dict-input} + +To create [`Example`](/api/example) objects, you can create a dictionary of the +gold-standard annotations `gold_dict`, and then call + +```python +example = Example.from_dict(doc, gold_dict) +``` + +There are currently two formats supported for this dictionary of annotations: +one with a simple, flat structure of keywords, and one with a more hierarchical +structure. + +#### Flat structure {#dict-flat} + +Here is the full overview of potential entries in a flat dictionary of +annotations. You need to only specify those keys corresponding to the task you +want to train. + +```python +### Flat dictionary +{ + "text": string, # Raw text. + "words": List[string], # List of gold tokens. + "lemmas": List[string], # List of lemmas. + "spaces": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not. + "tags": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). + "pos": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). + "morphs": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology). + "sent_starts": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not. + "deps": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. + "heads": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. + "entities": List[string], # Option 1: List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. + "entities": List[(int, int, string)], # Option 2: List of `"(start, end, label)"` tuples defining all entities in. + "cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text. + "links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The charachter offsets are linked to a dictionary of relevant knowledge base IDs. +} +``` + +There are a few caveats to take into account: + +- Multiple formats are possible for the "entities" entry, but you have to pick + one. +- Any values for sentence starts will be ignored if there are annotations for + dependency relations. +- If the dictionary contains values for "text" and "words", but not "spaces", + the latter are inferred automatically. If "words" is not provided either, the + values are inferred from the `doc` argument. 
+ +##### Examples + +```python +# Training data for a part-of-speech tagger +doc = Doc(vocab, words=["I", "like", "stuff"]) +example = Example.from_dict(doc, {"tags": ["NOUN", "VERB", "NOUN"]}) + +# Training data for an entity recognizer (option 1) +doc = nlp("Laura flew to Silicon Valley.") +biluo_tags = ["U-PERS", "O", "O", "B-LOC", "L-LOC"] +example = Example.from_dict(doc, {"entities": biluo_tags}) + +# Training data for an entity recognizer (option 2) +doc = nlp("Laura flew to Silicon Valley.") +entity_tuples = [ + (0, 5, "PERSON"), + (14, 28, "LOC"), + ] +example = Example.from_dict(doc, {"entities": entity_tuples}) + +# Training data for text categorization +doc = nlp("I'm pretty happy about that!") +example = Example.from_dict(doc, {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}) + +# Training data for an Entity Linking component +doc = nlp("Russ Cochran his reprints include EC Comics.") +example = Example.from_dict(doc, {"links": {(0, 12): {"Q7381115": 1.0, "Q2146908": 0.0}}}) +``` + +#### Hierachical structure {#dict-hierarch} + +Internally, a more hierarchical dictionary structure is used to store +gold-standard annotations. Its format is similar to the structure described in +the previous section, but there are two main sections `token_annotation` and +`doc_annotation`, and the keys for token annotations should be uppercase +[`Token` attributes](/api/token#attributes) such as "ORTH" and "TAG". + +```python +### Hierarchical dictionary +{ + "text": string, # Raw text. + "token_annotation": { + "ORTH": List[string], # List of gold tokens. + "LEMMA": List[string], # List of lemmas. + "SPACY": List[bool], # List of boolean values indicating whether the corresponding tokens is followed by a space or not. + "TAG": List[string], # List of fine-grained [POS tags](/usage/linguistic-features#pos-tagging). + "POS": List[string], # List of coarse-grained [POS tags](/usage/linguistic-features#pos-tagging). + "MORPH": List[string], # List of [morphological features](/usage/linguistic-features#rule-based-morphology). + "SENT_START": List[bool], # List of boolean values indicating whether each token is the first of a sentence or not. + "DEP": List[string], # List of string values indicating the [dependency relation](/usage/linguistic-features#dependency-parse) of a token to its head. + "HEAD": List[int], # List of integer values indicating the dependency head of each token, referring to the absolute index of each token in the text. + }, + "doc_annotation": { + "entities": List[(int, int, string)], # List of [BILUO tags](#biluo) per token of the format `"{action}-{label}"`, or `None` for unannotated tokens. + "cats": Dict[str, float], # Dictionary of `label:value` pairs indicating how relevant a certain [category](/api/textcategorizer) is for the text. + "links": Dict[(int, int), Dict], # Dictionary of `offset:dict` pairs defining [named entity links](/usage/linguistic-features#entity-linking). The charachter offsets are linked to a dictionary of relevant knowledge base IDs. + } +} +``` + +There are a few caveats to take into account: + +- Any values for sentence starts will be ignored if there are annotations for + dependency relations. +- If the dictionary contains values for "text" and "ORTH", but not "SPACY", the + latter are inferred automatically. If "ORTH" is not provided either, the + values are inferred from the `doc` argument. 
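+
+For illustration, here is a sketch of how such a dictionary might be passed to
+[`Example.from_dict`](/api/example#from_dict). The annotation values below are
+made up, and it is assumed that the hierarchical layout is accepted in the same
+way as the flat one shown above:
+
+```python
+### Hierarchical dictionary (hypothetical values)
+doc = nlp("Jan visited Berlin")
+gold_dict = {
+    "text": "Jan visited Berlin",
+    "token_annotation": {
+        "ORTH": ["Jan", "visited", "Berlin"],   # gold tokens
+        "TAG": ["NNP", "VBD", "NNP"],           # fine-grained tags
+    },
+    "doc_annotation": {
+        "cats": {"TRAVEL": 1.0, "BAKING": 0.0}, # document-level categories
+    },
+}
+example = Example.from_dict(doc, gold_dict)
+```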
+ ## Training config {#config new="3"} Config files define the training process and model pipeline and can be passed to diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md index 1e9beaf82..18d9c5edd 100644 --- a/website/docs/api/entitylinker.md +++ b/website/docs/api/entitylinker.md @@ -32,15 +32,13 @@ architectures and their arguments and hyperparameters. > nlp.add_pipe("entity_linker", config=config) > ``` - - -| Setting | Type | Description | Default | -| ---------------- | ------------------------------------------ | ----------------- | ----------------------------------------------- | -| `kb` | `KnowledgeBase` | | `None` | -| `labels_discard` | `Iterable[str]` | | `[]` | -| `incl_prior` | bool | |  `True` | -| `incl_context` | bool | | `True` | -| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | +| Setting | Type | Description | Default | +| ---------------- | ------------------------------------------ | ----------------------------------------------------------------------- | ----------------------------------------------- | +| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | `None` | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | `[]` | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | `True` | +| `incl_context` | bool | Whether or not to include the local context in the model. | `True` | +| `model` | [`Model`](https://thinc.ai/docs/api-model) | The model to use. | [EntityLinker](/api/architectures#EntityLinker) | ```python https://github.com/explosion/spaCy/blob/develop/spacy/pipeline/entity_linker.py @@ -75,10 +73,10 @@ shortcut for this and instantiate the component using its string name and | `model` | `Model` | The [`Model`](https://thinc.ai/docs/api-model) powering the pipeline component. | | `name` | str | String name of the component instance. Used to add entries to the `losses` during training. | | _keyword-only_ | | | -| `kb` | `KnowlegeBase` | | -| `labels_discard` | `Iterable[str]` | | -| `incl_prior` | bool | | -| `incl_context` | bool | | +| `kb` | `KnowlegeBase` | The [`KnowledgeBase`](/api/kb) holding all entities and their aliases. | +| `labels_discard` | `Iterable[str]` | NER labels that will automatically get a "NIL" prediction. | +| `incl_prior` | bool | Whether or not to include prior probabilities from the KB in the model. | +| `incl_context` | bool | Whether or not to include the local context in the model. | ## EntityLinker.\_\_call\_\_ {#call tag="method"} @@ -130,15 +128,12 @@ applied to the `Doc` in order. Both [`__call__`](/api/entitylinker#call) and ## EntityLinker.begin_training {#begin_training tag="method"} Initialize the pipe for training, using data examples if available. Returns an -[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. Before calling this -method, a knowledge base should have been defined with -[`set_kb`](/api/entitylinker#set_kb). +[`Optimizer`](https://thinc.ai/docs/api-optimizers) object. > #### Example > > ```python > entity_linker = nlp.add_pipe("entity_linker", last=True) -> entity_linker.set_kb(kb) > optimizer = entity_linker.begin_training(pipeline=nlp.pipeline) > ``` @@ -210,22 +205,6 @@ pipe's entity linking model and context encoder. Delegates to | `losses` | `Dict[str, float]` | Optional record of the loss during training. 
Updated using the component name as the key. | | **RETURNS** | `Dict[str, float]` | The updated `losses` dictionary. | -## EntityLinker.set_kb {#set_kb tag="method"} - -Define the knowledge base (KB) used for disambiguating named entities to KB -identifiers. - -> #### Example -> -> ```python -> entity_linker = nlp.add_pipe("entity_linker") -> entity_linker.set_kb(kb) -> ``` - -| Name | Type | Description | -| ---- | --------------- | ------------------------------- | -| `kb` | `KnowledgeBase` | The [`KnowledgeBase`](/api/kb). | - ## EntityLinker.create_optimizer {#create_optimizer tag="method"} Create an optimizer for the pipeline component. diff --git a/website/docs/api/example.md b/website/docs/api/example.md index 1257fdc1e..8c117aec7 100644 --- a/website/docs/api/example.md +++ b/website/docs/api/example.md @@ -8,8 +8,9 @@ new: 3.0 An `Example` holds the information for one training instance. It stores two `Doc` objects: one for holding the gold-standard reference data, and one for -holding the predictions of the pipeline. An `Alignment` object stores the -alignment between these two documents, as they can differ in tokenization. +holding the predictions of the pipeline. An [`Alignment`](#alignment-object) +object stores the alignment between these two documents, as they can differ in +tokenization. ## Example.\_\_init\_\_ {#init tag="method"} @@ -40,9 +41,8 @@ both documents. ## Example.from_dict {#from_dict tag="classmethod"} Construct an `Example` object from the `predicted` document and the reference -annotations provided as a dictionary. - - +annotations provided as a dictionary. For more details on the required format, +see the [training format documentation](/api/data-formats#dict-input). > #### Example > @@ -244,8 +244,9 @@ accuracy of predicted entities against the original gold-standard annotation. ## Example.to_dict {#to_dict tag="method"} -Return a dictionary representation of the reference annotation contained in this -`Example`. +Return a +[hierarchical dictionary representation](/api/data-formats#dict-hierarch) of the +reference annotation contained in this `Example`. > #### Example > @@ -276,3 +277,46 @@ Split one `Example` into multiple `Example` objects, one for each sentence. | Name | Type | Description | | ----------- | --------------- | ---------------------------------------------------------- | | **RETURNS** | `List[Example]` | List of `Example` objects, one for each original sentence. | + +## Alignment {#alignment-object new="3"} + +Calculate alignment tables between two tokenizations. + +### Alignment attributes {#alignment-attributes"} + +| Name | Type | Description | +| ----- | -------------------------------------------------- | ---------------------------------------------------------- | +| `x2y` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `x` to `y`. | +| `y2x` | [`Ragged`](https://thinc.ai/docs/api-types#ragged) | The `Ragged` object holding the alignment from `y` to `x`. | + + + +The current implementation of the alignment algorithm assumes that both +tokenizations add up to the same string. For example, you'll be able to align +`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not +`["I", "'m"]` and `["I", "am"]`. 
+ + + +> #### Example +> +> ```python +> from spacy.gold import Alignment +> +> bert_tokens = ["obama", "'", "s", "podcast"] +> spacy_tokens = ["obama", "'s", "podcast"] +> alignment = Alignment.from_strings(bert_tokens, spacy_tokens) +> a2b = alignment.x2y +> assert list(a2b.dataXd) == [0, 1, 1, 2] +> ``` +> +> If `a2b.dataXd[1] == a2b.dataXd[2] == 1`, that means that `A[1]` (`"'"`) and +> `A[2]` (`"s"`) both align to `B[1]` (`"'s"`). + +### Alignment.from_strings {#classmethod tag="function"} + +| Name | Type | Description | +| ----------- | ----------- | ----------------------------------------------- | +| `A` | list | String values of candidate tokens to align. | +| `B` | list | String values of reference tokens to align. | +| **RETURNS** | `Alignment` | An `Alignment` object describing the alignment. | diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md index 71b53f844..0954fb577 100644 --- a/website/docs/api/top-level.md +++ b/website/docs/api/top-level.md @@ -468,59 +468,6 @@ Convert a list of Doc objects into the | `id` | int | ID to assign to the JSON. Defaults to `0`. | | **RETURNS** | dict | The data in spaCy's JSON format. | -### gold.align {#align tag="function"} - -Calculate alignment tables between two tokenizations, using the Levenshtein -algorithm. The alignment is case-insensitive. - - - -The current implementation of the alignment algorithm assumes that both -tokenizations add up to the same string. For example, you'll be able to align -`["I", "'", "m"]` and `["I", "'m"]`, which both add up to `"I'm"`, but not -`["I", "'m"]` and `["I", "am"]`. - - - -> #### Example -> -> ```python -> from spacy.gold import align -> -> bert_tokens = ["obama", "'", "s", "podcast"] -> spacy_tokens = ["obama", "'s", "podcast"] -> alignment = align(bert_tokens, spacy_tokens) -> cost, a2b, b2a, a2b_multi, b2a_multi = alignment -> ``` - -| Name | Type | Description | -| ----------- | ----- | -------------------------------------------------------------------------- | -| `tokens_a` | list | String values of candidate tokens to align. | -| `tokens_b` | list | String values of reference tokens to align. | -| **RETURNS** | tuple | A `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the alignment. | - -The returned tuple contains the following alignment information: - -> #### Example -> -> ```python -> a2b = array([0, -1, -1, 2]) -> b2a = array([0, 2, 3]) -> a2b_multi = {1: 1, 2: 1} -> b2a_multi = {} -> ``` -> -> If `a2b[3] == 2`, that means that `tokens_a[3]` aligns to `tokens_b[2]`. If -> there's no one-to-one alignment for a token, it has the value `-1`. - -| Name | Type | Description | -| ----------- | -------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| `cost` | int | The number of misaligned tokens. | -| `a2b` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_a` to indices in `tokens_b`. | -| `b2a` | `numpy.ndarray[ndim=1, dtype='int32']` | One-to-one mappings of indices in `tokens_b` to indices in `tokens_a`. | -| `a2b_multi` | dict | A dictionary mapping indices in `tokens_a` to indices in `tokens_b`, where multiple tokens of `tokens_a` align to the same token of `tokens_b`. | -| `b2a_multi` | dict | A dictionary mapping indices in `tokens_b` to indices in `tokens_a`, where multiple tokens of `tokens_b` align to the same token of `tokens_a`. 
| - ### gold.biluo_tags_from_offsets {#biluo_tags_from_offsets tag="function"} Encode labelled spans into per-token tags, using the diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md index 881a0e333..8d3c7e1b6 100644 --- a/website/docs/usage/linguistic-features.md +++ b/website/docs/usage/linguistic-features.md @@ -1089,51 +1089,44 @@ In situations like that, you often want to align the tokenization so that you can merge annotations from different sources together, or take vectors predicted by a [pretrained BERT model](https://github.com/huggingface/pytorch-transformers) and -apply them to spaCy tokens. spaCy's [`gold.align`](/api/top-level#align) helper -returns a `(cost, a2b, b2a, a2b_multi, b2a_multi)` tuple describing the number -of misaligned tokens, the one-to-one mappings of token indices in both -directions and the indices where multiple tokens align to one single token. +apply them to spaCy tokens. spaCy's [`Alignment`](/api/example#alignment-object) object +allows the one-to-one mappings of token indices in both directions as well as +taking into account indices where multiple tokens align to one single token. > #### ✏️ Things to try > > 1. Change the capitalization in one of the token lists – for example, > `"obama"` to `"Obama"`. You'll see that the alignment is case-insensitive. > 2. Change `"podcasts"` in `other_tokens` to `"pod", "casts"`. You should see -> that there are now 4 misaligned tokens and that the new many-to-one mapping -> is reflected in `a2b_multi`. -> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that the -> `cost` is `0` and all corresponding mappings are also identical. +> that there are now two tokens of length 2 in `y2x`, one corresponding to +> "'s", and one to "podcasts". +> 3. Make `other_tokens` and `spacy_tokens` identical. You'll see that all +> tokens now correspond 1-to-1. ```python ### {executable="true"} -from spacy.gold import align +from spacy.gold import Alignment other_tokens = ["i", "listened", "to", "obama", "'", "s", "podcasts", "."] spacy_tokens = ["i", "listened", "to", "obama", "'s", "podcasts", "."] -cost, a2b, b2a, a2b_multi, b2a_multi = align(other_tokens, spacy_tokens) -print("Edit distance:", cost) # 3 -print("One-to-one mappings a -> b", a2b) # array([0, 1, 2, 3, -1, -1, 5, 6]) -print("One-to-one mappings b -> a", b2a) # array([0, 1, 2, 3, -1, 6, 7]) -print("Many-to-one mappings a -> b", a2b_multi) # {4: 4, 5: 4} -print("Many-to-one mappings b-> a", b2a_multi) # {} +align = Alignment.from_strings(other_tokens, spacy_tokens) +print(f"a -> b, lengths: {align.x2y.lengths}") # array([1, 1, 1, 1, 1, 1, 1, 1]) +print(f"a -> b, mapping: {align.x2y.dataXd}") # array([0, 1, 2, 3, 4, 4, 5, 6]) : two tokens both refer to "'s" +print(f"b -> a, lengths: {align.y2x.lengths}") # array([1, 1, 1, 1, 2, 1, 1]) : the token "'s" refers to two tokens +print(f"b -> a, mappings: {align.y2x.dataXd}") # array([0, 1, 2, 3, 4, 5, 6, 7]) ``` Here are some insights from the alignment information generated in the example above: -- The edit distance (cost) is `3`: two deletions and one insertion. - The one-to-one mappings for the first four tokens are identical, which means they map to each other. This makes sense because they're also identical in the input: `"i"`, `"listened"`, `"to"` and `"obama"`. 
-- The index mapped to `a2b[6]` is `5`, which means that `other_tokens[6]` +- The value of `x2y.dataXd[6]` is `5`, which means that `other_tokens[6]` (`"podcasts"`) aligns to `spacy_tokens[5]` (also `"podcasts"`). -- `a2b[4]` is `-1`, which means that there is no one-to-one alignment for the - token at `other_tokens[4]`. The token `"'"` doesn't exist on its own in - `spacy_tokens`. The same goes for `a2b[5]` and `other_tokens[5]`, i.e. `"s"`. -- The dictionary `a2b_multi` shows that both tokens 4 and 5 of `other_tokens` - (`"'"` and `"s"`) align to token 4 of `spacy_tokens` (`"'s"`). -- The dictionary `b2a_multi` shows that there are no tokens in `spacy_tokens` - that map to multiple tokens in `other_tokens`. +- `x2y.dataXd[4]` and `x2y.dataXd[5]` are both `4`, which means that both tokens + 4 and 5 of `other_tokens` (`"'"` and `"s"`) align to token 4 of `spacy_tokens` + (`"'s"`). From 30f316c688953959a42c79d8e0cec6b891348c0c Mon Sep 17 00:00:00 2001 From: Ines Montani Date: Thu, 6 Aug 2020 00:51:55 +0200 Subject: [PATCH 3/3] Fix server-side rendering [ci skip] --- website/src/components/quickstart.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/src/components/quickstart.js b/website/src/components/quickstart.js index f1d3616a5..f886ee7b3 100644 --- a/website/src/components/quickstart.js +++ b/website/src/components/quickstart.js @@ -1,7 +1,7 @@ import React, { Fragment, useState, useEffect, useRef } from 'react' import PropTypes from 'prop-types' import classNames from 'classnames' -import { window } from 'browser-monads' +import { window, document } from 'browser-monads' import Section from './section' import Icon from './icon'