Merge branch 'develop' into nightly.spacy.io

Commit 949d4a0a0b by Ines Montani, 2020-07-03 15:15:58 +02:00
25 changed files with 401 additions and 313 deletions

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy-nightly" __title__ = "spacy-nightly"
__version__ = "3.0.0a0" __version__ = "3.0.0a1"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -9,7 +9,7 @@ import sys
from ._app import app, Arg, Opt from ._app import app, Arg, Opt
from ..gold import docs_to_json from ..gold import docs_to_json
from ..tokens import DocBin from ..tokens import DocBin
from ..gold.converters import iob2docs, conll_ner2docs, json2docs from ..gold.converters import iob2docs, conll_ner2docs, json2docs, conllu2docs
# Converters are matched by file extension except for ner/iob, which are # Converters are matched by file extension except for ner/iob, which are
@ -18,9 +18,9 @@ from ..gold.converters import iob2docs, conll_ner2docs, json2docs
# imported from /converters. # imported from /converters.
CONVERTERS = { CONVERTERS = {
# "conllubio": conllu2docs, TODO "conllubio": conllu2docs,
# "conllu": conllu2docs, TODO "conllu": conllu2docs,
# "conll": conllu2docs, TODO "conll": conllu2docs,
"ner": conll_ner2docs, "ner": conll_ner2docs,
"iob": iob2docs, "iob": iob2docs,
"json": json2docs, "json": json2docs,
@ -137,7 +137,7 @@ def _print_docs_to_stdout(docs, output_type):
if output_type == "json": if output_type == "json":
srsly.write_json("-", docs_to_json(docs)) srsly.write_json("-", docs_to_json(docs))
else: else:
sys.stdout.buffer.write(DocBin(docs=docs).to_bytes()) sys.stdout.buffer.write(DocBin(docs=docs, store_user_data=True).to_bytes())
def _write_docs_to_file(docs, output_file, output_type): def _write_docs_to_file(docs, output_file, output_type):
@ -146,7 +146,7 @@ def _write_docs_to_file(docs, output_file, output_type):
if output_type == "json": if output_type == "json":
srsly.write_json(output_file, docs_to_json(docs)) srsly.write_json(output_file, docs_to_json(docs))
else: else:
data = DocBin(docs=docs).to_bytes() data = DocBin(docs=docs, store_user_data=True).to_bytes()
with output_file.open("wb") as file_: with output_file.open("wb") as file_:
file_.write(data) file_.write(data)
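
The converter's binary output is now serialized with `store_user_data=True`. A minimal sketch of the round trip this enables, assuming a blank English pipeline:

```python
# Minimal sketch: serialize converted docs together with their user data and
# read them back from the DocBin bytes.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
docs = [nlp("Serialize me."), nlp("And me too.")]
data = DocBin(docs=docs, store_user_data=True).to_bytes()
restored = list(DocBin(store_user_data=True).from_bytes(data).get_docs(nlp.vocab))
assert [d.text for d in restored] == [d.text for d in docs]
```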

View File

@ -37,7 +37,7 @@ def init_model_cli(
clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True), clusters_loc: Optional[Path] = Opt(None, "--clusters-loc", "-c", help="Optional location of brown clusters data", exists=True),
jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True), jsonl_loc: Optional[Path] = Opt(None, "--jsonl-loc", "-j", help="Location of JSONL-formatted attributes file", exists=True),
vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True), vectors_loc: Optional[Path] = Opt(None, "--vectors-loc", "-v", help="Optional vectors file in Word2Vec format", exists=True),
prune_vectors: int = Opt(-1 , "--prune-vectors", "-V", help="Optional number of vectors to prune to"), prune_vectors: int = Opt(-1, "--prune-vectors", "-V", help="Optional number of vectors to prune to"),
truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"), truncate_vectors: int = Opt(0, "--truncate-vectors", "-t", help="Optional number of vectors to truncate to when reading in vectors file"),
vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"), vectors_name: Optional[str] = Opt(None, "--vectors-name", "-vn", help="Optional name for the word vectors, e.g. en_core_web_lg.vectors"),
model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"), model_name: Optional[str] = Opt(None, "--model-name", "-mn", help="Optional name for the model meta"),
@ -56,6 +56,7 @@ def init_model_cli(
freqs_loc=freqs_loc, freqs_loc=freqs_loc,
clusters_loc=clusters_loc, clusters_loc=clusters_loc,
jsonl_loc=jsonl_loc, jsonl_loc=jsonl_loc,
vectors_loc=vectors_loc,
prune_vectors=prune_vectors, prune_vectors=prune_vectors,
truncate_vectors=truncate_vectors, truncate_vectors=truncate_vectors,
vectors_name=vectors_name, vectors_name=vectors_name,
@ -228,7 +229,7 @@ def add_vectors(
else: else:
if vectors_loc: if vectors_loc:
with msg.loading(f"Reading vectors from {vectors_loc}"): with msg.loading(f"Reading vectors from {vectors_loc}"):
vectors_data, vector_keys = read_vectors(msg, vectors_loc) vectors_data, vector_keys = read_vectors(msg, vectors_loc, truncate_vectors)
msg.good(f"Loaded vectors from {vectors_loc}") msg.good(f"Loaded vectors from {vectors_loc}")
else: else:
vectors_data, vector_keys = (None, None) vectors_data, vector_keys = (None, None)
@ -247,7 +248,7 @@ def add_vectors(
nlp.vocab.prune_vectors(prune_vectors) nlp.vocab.prune_vectors(prune_vectors)
def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int = 0): def read_vectors(msg: Printer, vectors_loc: Path, truncate_vectors: int):
f = open_file(vectors_loc) f = open_file(vectors_loc)
shape = tuple(int(size) for size in next(f).split()) shape = tuple(int(size) for size in next(f).split())
if truncate_vectors >= 1: if truncate_vectors >= 1:

View File

@ -15,7 +15,6 @@ from ..ml.models.multi_task import build_masked_language_model
from ..tokens import Doc from ..tokens import Doc
from ..attrs import ID, HEAD from ..attrs import ID, HEAD
from .. import util from .. import util
from ..gold import Example
@app.command("pretrain") @app.command("pretrain")
@ -183,7 +182,7 @@ def pretrain(
for batch_id, batch in enumerate(batches): for batch_id, batch in enumerate(batches):
docs, count = make_docs( docs, count = make_docs(
nlp, nlp,
[ex.doc for ex in batch], batch,
max_length=pretrain_config["max_length"], max_length=pretrain_config["max_length"],
min_length=pretrain_config["min_length"], min_length=pretrain_config["min_length"],
) )

View File

@ -159,6 +159,8 @@ class Warnings(object):
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. " W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".") "string \"Field1=Value1,Value2|Field2=Value3\".")
W101 = ("Skipping `Doc` custom extension '{name}' while merging docs.")
W102 = ("Skipping unsupported user data '{key}: {value}' while merging docs.")
@add_codes @add_codes
@ -556,8 +558,8 @@ class Errors(object):
E979 = ("Cannot convert {type} to an Example object.") E979 = ("Cannot convert {type} to an Example object.")
E980 = ("Each link annotation should refer to a dictionary with at most one " E980 = ("Each link annotation should refer to a dictionary with at most one "
"identifier mapping to 1.0, and all others to 0.0.") "identifier mapping to 1.0, and all others to 0.0.")
E981 = ("The offsets of the annotations for 'links' need to refer exactly " E981 = ("The offsets of the annotations for 'links' could not be aligned "
"to the offsets of the 'entities' annotations.") "to token boundaries.")
E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing " E982 = ("The 'ent_iob' attribute of a Token should be an integer indexing "
"into {values}, but found {value}.") "into {values}, but found {value}.")
E983 = ("Invalid key for '{dict}': {key}. Available keys: " E983 = ("Invalid key for '{dict}': {key}. Available keys: "
@ -593,7 +595,9 @@ class Errors(object):
E997 = ("Tokenizer special cases are not allowed to modify the text. " E997 = ("Tokenizer special cases are not allowed to modify the text. "
"This would map '{chunk}' to '{orth}' given token attributes " "This would map '{chunk}' to '{orth}' given token attributes "
"'{token_attrs}'.") "'{token_attrs}'.")
E999 = ("Unable to merge the `Doc` objects because they do not all share "
"the same `Vocab`.")
@add_codes @add_codes
class TempErrors(object): class TempErrors(object):

View File

@ -1,6 +1,4 @@
from .iob2docs import iob2docs # noqa: F401 from .iob2docs import iob2docs # noqa: F401
from .conll_ner2docs import conll_ner2docs # noqa: F401 from .conll_ner2docs import conll_ner2docs # noqa: F401
from .json2docs import json2docs from .json2docs import json2docs
from .conllu2docs import conllu2docs # noqa: F401
# TODO: Update this one
# from .conllu2docs import conllu2docs # noqa: F401

View File

@ -4,11 +4,11 @@ from .conll_ner2docs import n_sents_info
from ...gold import Example from ...gold import Example
from ...gold import iob_to_biluo, spans_from_biluo_tags from ...gold import iob_to_biluo, spans_from_biluo_tags
from ...language import Language from ...language import Language
from ...tokens import Doc, Token from ...tokens import Doc, Token, Span
from wasabi import Printer from wasabi import Printer
def conllu2json( def conllu2docs(
input_data, input_data,
n_sents=10, n_sents=10,
append_morphology=False, append_morphology=False,
@ -28,34 +28,22 @@ def conllu2json(
MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$" MISC_NER_PATTERN = "^((?:name|NE)=)?([BILU])-([A-Z_]+)|O$"
msg = Printer(no_print=no_print) msg = Printer(no_print=no_print)
n_sents_info(msg, n_sents) n_sents_info(msg, n_sents)
docs = [] sent_docs = read_conllx(
raw = ""
sentences = []
conll_data = read_conllx(
input_data, input_data,
append_morphology=append_morphology, append_morphology=append_morphology,
ner_tag_pattern=MISC_NER_PATTERN, ner_tag_pattern=MISC_NER_PATTERN,
ner_map=ner_map, ner_map=ner_map,
merge_subtokens=merge_subtokens, merge_subtokens=merge_subtokens,
) )
has_ner_tags = has_ner(input_data, MISC_NER_PATTERN) docs = []
for i, example in enumerate(conll_data): sent_docs_to_merge = []
raw += example.text for sent_doc in sent_docs:
sentences.append( sent_docs_to_merge.append(sent_doc)
generate_sentence( if len(sent_docs_to_merge) % n_sents == 0:
example.to_dict(), has_ner_tags, MISC_NER_PATTERN, ner_map=ner_map, docs.append(Doc.from_docs(sent_docs_to_merge))
) sent_docs_to_merge = []
) if sent_docs_to_merge:
# Real-sized documents could be extracted using the comments on the docs.append(Doc.from_docs(sent_docs_to_merge))
# conllu document
if len(sentences) % n_sents == 0:
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
raw = ""
sentences = []
if sentences:
doc = create_json_doc(raw, sentences, i)
docs.append(doc)
return docs return docs
@ -84,14 +72,14 @@ def read_conllx(
ner_tag_pattern="", ner_tag_pattern="",
ner_map=None, ner_map=None,
): ):
""" Yield examples, one for each sentence """ """ Yield docs, one for each sentence """
vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc vocab = Language.Defaults.create_vocab() # need vocab to make a minimal Doc
for sent in input_data.strip().split("\n\n"): for sent in input_data.strip().split("\n\n"):
lines = sent.strip().split("\n") lines = sent.strip().split("\n")
if lines: if lines:
while lines[0].startswith("#"): while lines[0].startswith("#"):
lines.pop(0) lines.pop(0)
example = example_from_conllu_sentence( doc = doc_from_conllu_sentence(
vocab, vocab,
lines, lines,
ner_tag_pattern, ner_tag_pattern,
@ -99,7 +87,7 @@ def read_conllx(
append_morphology=append_morphology, append_morphology=append_morphology,
ner_map=ner_map, ner_map=ner_map,
) )
yield example yield doc
def get_entities(lines, tag_pattern, ner_map=None): def get_entities(lines, tag_pattern, ner_map=None):
@ -141,39 +129,7 @@ def get_entities(lines, tag_pattern, ner_map=None):
return iob_to_biluo(iob) return iob_to_biluo(iob)
def generate_sentence(example_dict, has_ner_tags, tag_pattern, ner_map=None): def doc_from_conllu_sentence(
sentence = {}
tokens = []
token_annotation = example_dict["token_annotation"]
for i, id_ in enumerate(token_annotation["ids"]):
token = {}
token["id"] = id_
token["orth"] = token_annotation["words"][i]
token["tag"] = token_annotation["tags"][i]
token["pos"] = token_annotation["pos"][i]
token["lemma"] = token_annotation["lemmas"][i]
token["morph"] = token_annotation["morphs"][i]
token["head"] = token_annotation["heads"][i] - i
token["dep"] = token_annotation["deps"][i]
if has_ner_tags:
token["ner"] = example_dict["doc_annotation"]["entities"][i]
tokens.append(token)
sentence["tokens"] = tokens
return sentence
def create_json_doc(raw, sentences, id_):
doc = {}
paragraph = {}
doc["id"] = id_
doc["paragraphs"] = []
paragraph["raw"] = raw.strip()
paragraph["sentences"] = sentences
doc["paragraphs"].append(paragraph)
return doc
def example_from_conllu_sentence(
vocab, vocab,
lines, lines,
ner_tag_pattern, ner_tag_pattern,
@ -263,8 +219,9 @@ def example_from_conllu_sentence(
if merge_subtokens: if merge_subtokens:
doc = merge_conllu_subtokens(lines, doc) doc = merge_conllu_subtokens(lines, doc)
# create Example from custom Doc annotation # create final Doc from custom Doc annotation
words, spaces, tags, morphs, lemmas = [], [], [], [], [] words, spaces, tags, morphs, lemmas, poses = [], [], [], [], [], []
heads, deps = [], []
for i, t in enumerate(doc): for i, t in enumerate(doc):
words.append(t._.merged_orth) words.append(t._.merged_orth)
lemmas.append(t._.merged_lemma) lemmas.append(t._.merged_lemma)
@ -274,16 +231,23 @@ def example_from_conllu_sentence(
tags.append(t.tag_ + "__" + t._.merged_morph) tags.append(t.tag_ + "__" + t._.merged_morph)
else: else:
tags.append(t.tag_) tags.append(t.tag_)
poses.append(t.pos_)
heads.append(t.head.i)
deps.append(t.dep_)
doc_x = Doc(vocab, words=words, spaces=spaces) doc_x = Doc(vocab, words=words, spaces=spaces)
ref_dict = Example(doc_x, reference=doc).to_dict() for i in range(len(doc)):
ref_dict["words"] = words doc_x[i].tag_ = tags[i]
ref_dict["lemmas"] = lemmas doc_x[i].morph_ = morphs[i]
ref_dict["spaces"] = spaces doc_x[i].lemma_ = lemmas[i]
ref_dict["tags"] = tags doc_x[i].pos_ = poses[i]
ref_dict["morphs"] = morphs doc_x[i].dep_ = deps[i]
example = Example.from_dict(doc_x, ref_dict) doc_x[i].head = doc_x[heads[i]]
return example doc_x.ents = [Span(doc_x, ent.start, ent.end, label=ent.label) for ent in doc.ents]
doc_x.is_parsed = True
doc_x.is_tagged = True
return doc_x
def merge_conllu_subtokens(lines, doc): def merge_conllu_subtokens(lines, doc):
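
The rewritten converter no longer assembles JSON paragraphs; it buffers per-sentence `Doc`s and merges every `n_sents` of them with `Doc.from_docs`. Roughly, the grouping reduces to the following helper (a sketch; `group_sentences` is an illustrative name, not a function in the codebase):

```python
# Sketch of the grouping pattern: merge per-sentence Docs into one Doc per
# n_sents, plus a final partial batch.
from spacy.tokens import Doc

def group_sentences(sent_docs, n_sents=10):
    docs, buffer = [], []
    for sent_doc in sent_docs:
        buffer.append(sent_doc)
        if len(buffer) == n_sents:
            docs.append(Doc.from_docs(buffer))
            buffer = []
    if buffer:
        docs.append(Doc.from_docs(buffer))
    return docs
```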

View File

@ -17,8 +17,6 @@ def json2docs(input_data, model=None, **kwargs):
for json_para in json_to_annotations(json_doc): for json_para in json_to_annotations(json_doc):
example_dict = _fix_legacy_dict_data(json_para) example_dict = _fix_legacy_dict_data(json_para)
tok_dict, doc_dict = _parse_example_dict_data(example_dict) tok_dict, doc_dict = _parse_example_dict_data(example_dict)
if json_para.get("raw"):
assert tok_dict.get("SPACY")
doc = annotations2doc(nlp.vocab, tok_dict, doc_dict) doc = annotations2doc(nlp.vocab, tok_dict, doc_dict)
docs.append(doc) docs.append(doc)
return docs return docs

View File

@ -43,25 +43,36 @@ class Corpus:
locs.append(path) locs.append(path)
return locs return locs
def _make_example(self, nlp, reference, gold_preproc):
if gold_preproc or reference.has_unknown_spaces:
return Example(
Doc(
nlp.vocab,
words=[word.text for word in reference],
spaces=[bool(word.whitespace_) for word in reference]
),
reference
)
else:
return Example(
nlp.make_doc(reference.text),
reference
)
def make_examples(self, nlp, reference_docs, max_length=0): def make_examples(self, nlp, reference_docs, max_length=0):
for reference in reference_docs: for reference in reference_docs:
if len(reference) == 0: if len(reference) == 0:
continue continue
elif max_length == 0 or len(reference) < max_length: elif max_length == 0 or len(reference) < max_length:
yield Example( yield self._make_example(nlp, reference, False)
nlp.make_doc(reference.text),
reference
)
elif reference.is_sentenced: elif reference.is_sentenced:
for ref_sent in reference.sents: for ref_sent in reference.sents:
if len(ref_sent) == 0: if len(ref_sent) == 0:
continue continue
elif max_length == 0 or len(ref_sent) < max_length: elif max_length == 0 or len(ref_sent) < max_length:
yield Example( yield self._make_example(nlp, ref_sent.as_doc(), False)
nlp.make_doc(ref_sent.text),
ref_sent.as_doc()
)
def make_examples_gold_preproc(self, nlp, reference_docs): def make_examples_gold_preproc(self, nlp, reference_docs):
for reference in reference_docs: for reference in reference_docs:
if reference.is_sentenced: if reference.is_sentenced:
@ -69,14 +80,7 @@ class Corpus:
else: else:
ref_sents = [reference] ref_sents = [reference]
for ref_sent in ref_sents: for ref_sent in ref_sents:
eg = Example( eg = self._make_example(nlp, ref_sent, True)
Doc(
nlp.vocab,
words=[w.text for w in ref_sent],
spaces=[bool(w.whitespace_) for w in ref_sent]
),
ref_sent
)
if len(eg.x): if len(eg.x):
yield eg yield eg
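
The new `_make_example` helper decides how the predicted side of each `Example` is built: when gold preprocessing is requested or the reference `Doc` has unknown spaces, the tokens are copied directly instead of re-tokenizing `reference.text`. A small sketch of that case, assuming a blank English pipeline:

```python
# Sketch: a reference Doc created without spaces has has_unknown_spaces set,
# so the predicted Doc is built from its tokens rather than from raw text.
import spacy
from spacy.gold import Example
from spacy.tokens import Doc

nlp = spacy.blank("en")
reference = Doc(nlp.vocab, words=["that", "'s", "fine"])
assert reference.has_unknown_spaces
predicted = Doc(
    nlp.vocab,
    words=[w.text for w in reference],
    spaces=[bool(w.whitespace_) for w in reference],
)
example = Example(predicted, reference)
```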

View File

@ -15,7 +15,7 @@ from ..syntax import nonproj
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot): cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
""" Create a Doc from dictionaries with token and doc annotations. Assumes ORTH & SPACY are set. """ """ Create a Doc from dictionaries with token and doc annotations. """
attrs, array = _annot2array(vocab, tok_annot, doc_annot) attrs, array = _annot2array(vocab, tok_annot, doc_annot)
output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"]) output = Doc(vocab, words=tok_annot["ORTH"], spaces=tok_annot["SPACY"])
if "entities" in doc_annot: if "entities" in doc_annot:
@ -235,10 +235,7 @@ def _annot2array(vocab, tok_annot, doc_annot):
if key == "entities": if key == "entities":
pass pass
elif key == "links": elif key == "links":
entities = doc_annot.get("entities", {}) ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], tok_annot["SPACY"], value)
if not entities:
raise ValueError(Errors.E981)
ent_kb_ids = _parse_links(vocab, tok_annot["ORTH"], value, entities)
tok_annot["ENT_KB_ID"] = ent_kb_ids tok_annot["ENT_KB_ID"] = ent_kb_ids
elif key == "cats": elif key == "cats":
pass pass
@ -381,18 +378,11 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
ent_types.append("") ent_types.append("")
return ent_iobs, ent_types return ent_iobs, ent_types
def _parse_links(vocab, words, links, entities): def _parse_links(vocab, words, spaces, links):
reference = Doc(vocab, words=words) reference = Doc(vocab, words=words, spaces=spaces)
starts = {token.idx: token.i for token in reference} starts = {token.idx: token.i for token in reference}
ends = {token.idx + len(token): token.i for token in reference} ends = {token.idx + len(token): token.i for token in reference}
ent_kb_ids = ["" for _ in reference] ent_kb_ids = ["" for _ in reference]
entity_map = [(ent[0], ent[1]) for ent in entities]
# links annotations need to refer 1-1 to entity annotations - throw error otherwise
for index, annot_dict in links.items():
start_char, end_char = index
if (start_char, end_char) not in entity_map:
raise ValueError(Errors.E981)
for index, annot_dict in links.items(): for index, annot_dict in links.items():
true_kb_ids = [] true_kb_ids = []
@ -406,6 +396,8 @@ def _parse_links(vocab, words, links, entities):
start_char, end_char = index start_char, end_char = index
start_token = starts.get(start_char) start_token = starts.get(start_char)
end_token = ends.get(end_char) end_token = ends.get(end_char)
if start_token is None or end_token is None:
raise ValueError(Errors.E981)
for i in range(start_token, end_token+1): for i in range(start_token, end_token+1):
ent_kb_ids[i] = true_kb_ids[0] ent_kb_ids[i] = true_kb_ids[0]
@ -414,7 +406,7 @@ def _parse_links(vocab, words, links, entities):
def _guess_spaces(text, words): def _guess_spaces(text, words):
if text is None: if text is None:
return [True] * len(words) return None
spaces = [] spaces = []
text_pos = 0 text_pos = 0
# align words with text # align words with text
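
With `_parse_links` now receiving the token spacing directly, link annotations only need character offsets that align to token boundaries; a parallel "entities" list is no longer required, and E981 is raised only when the offsets cannot be aligned. A sketch under these assumptions ("Q60" is an arbitrary knowledge-base identifier):

```python
# Sketch: provide entity links by character offsets alone; the offsets
# (7, 15) cover the tokens "New York" in the words below.
from spacy.gold import Example
from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
words = ["I", "like", "New", "York"]
predicted = Doc(vocab, words=words)
annotations = {
    "words": words,
    "links": {(7, 15): {"Q60": 1.0}},
}
example = Example.from_dict(predicted, annotations)
```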

View File

@ -303,6 +303,60 @@ def test_doc_from_array_sent_starts(en_vocab):
assert new_doc.is_parsed assert new_doc.is_parsed
def test_doc_api_from_docs(en_tokenizer, de_tokenizer):
en_texts = ["Merging the docs is fun.", "They don't think alike."]
de_text = "Wie war die Frage?"
en_docs = [en_tokenizer(text) for text in en_texts]
docs_idx = en_texts[0].index('docs')
de_doc = de_tokenizer(de_text)
en_docs[0].user_data[("._.", "is_ambiguous", docs_idx, None)] = (True, None, None, None)
assert Doc.from_docs([]) is None
assert de_doc is not Doc.from_docs([de_doc])
assert str(de_doc) == str(Doc.from_docs([de_doc]))
with pytest.raises(ValueError):
Doc.from_docs(en_docs + [de_doc])
m_doc = Doc.from_docs(en_docs)
assert len(en_docs) == len(list(m_doc.sents))
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts)
p_token = m_doc[len(en_docs[0])-1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
with pytest.raises(AttributeError):
not_available = m_doc[2]._.is_ambiguous # raises because the extension was not registered via set_extension
assert len(m_doc.user_data) == len(en_docs[0].user_data) # but it's there
m_doc = Doc.from_docs(en_docs, ensure_whitespace=False)
assert len(en_docs) == len(list(m_doc.sents))
assert len(str(m_doc)) == len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == "".join(en_texts)
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and not bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 0 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
m_doc = Doc.from_docs(en_docs, attrs=['lemma', 'length', 'pos'])
with pytest.raises(ValueError): # important attributes from sentencizer or parser are missing
assert list(m_doc.sents)
assert len(str(m_doc)) > len(en_texts[0]) + len(en_texts[1])
assert str(m_doc) == " ".join(en_texts) # space delimiter considered, although spacy attribute was missing
p_token = m_doc[len(en_docs[0]) - 1]
assert p_token.text == "." and bool(p_token.whitespace_)
en_docs_tokens = [t for doc in en_docs for t in doc]
assert len(m_doc) == len(en_docs_tokens)
think_idx = len(en_texts[0]) + 1 + en_texts[1].index('think')
assert m_doc[9].idx == think_idx
def test_doc_lang(en_vocab): def test_doc_lang(en_vocab):
doc = Doc(en_vocab, words=["Hello", "world"]) doc = Doc(en_vocab, words=["Hello", "world"])
assert doc.lang_ == "en" assert doc.lang_ == "en"

View File

@ -75,3 +75,19 @@ def test_serialize_doc_bin():
for i, doc in enumerate(reloaded_docs): for i, doc in enumerate(reloaded_docs):
assert doc.text == texts[i] assert doc.text == texts[i]
assert doc.cats == cats assert doc.cats == cats
def test_serialize_doc_bin_unknown_spaces(en_vocab):
doc1 = Doc(en_vocab, words=["that", "'s"])
assert doc1.has_unknown_spaces
assert doc1.text == "that 's "
doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
assert not doc2.has_unknown_spaces
assert doc2.text == "that's"
doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
assert re_doc1.has_unknown_spaces
assert re_doc1.text == "that 's "
assert not re_doc2.has_unknown_spaces
assert re_doc2.text == "that's"

View File

@ -1,14 +1,10 @@
import pytest import pytest
from spacy.gold import docs_to_json from spacy.gold import docs_to_json, biluo_tags_from_offsets
from spacy.gold.converters import iob2docs, conll_ner2docs from spacy.gold.converters import iob2docs, conll_ner2docs, conllu2docs
from spacy.gold.converters.conllu2json import conllu2json
from spacy.lang.en import English from spacy.lang.en import English
from spacy.cli.pretrain import make_docs from spacy.cli.pretrain import make_docs
# TODO
# from spacy.gold.converters import conllu2docs
def test_cli_converters_conllu2json(): def test_cli_converters_conllu2json():
# from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu # from NorNE: https://github.com/ltgoslo/norne/blob/3d23274965f513f23aa48455b28b1878dad23c05/ud/nob/no_bokmaal-ud-dev.conllu
@ -19,8 +15,9 @@ def test_cli_converters_conllu2json():
"4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO", "4\tavstår\tavstå\tVERB\t_\tMood=Ind|Tense=Pres|VerbForm=Fin\t0\troot\t_\tO",
] ]
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1) converted_docs = conllu2docs(input_data, n_sents=1)
assert len(converted) == 1 assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
@ -31,7 +28,9 @@ def test_cli_converters_conllu2json():
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0] assert [t["head"] for t in tokens] == [1, 2, -1, 0]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT"]
assert [t["ner"] for t in tokens] == ["O", "B-PER", "L-PER", "O"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PER", "L-PER", "O"]
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -55,11 +54,12 @@ def test_cli_converters_conllu2json():
) )
def test_cli_converters_conllu2json_name_ner_map(lines): def test_cli_converters_conllu2json_name_ner_map(lines):
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""}) converted_docs = conllu2docs(input_data, n_sents=1, ner_map={"PER": "PERSON", "BAD": ""})
assert len(converted) == 1 assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår." assert converted[0]["paragraphs"][0]["raw"] == "Dommer FinnEilertsen avstår. "
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
sent = converted[0]["paragraphs"][0]["sentences"][0] sent = converted[0]["paragraphs"][0]["sentences"][0]
assert len(sent["tokens"]) == 5 assert len(sent["tokens"]) == 5
@ -68,7 +68,9 @@ def test_cli_converters_conllu2json_name_ner_map(lines):
assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"] assert [t["tag"] for t in tokens] == ["NOUN", "PROPN", "PROPN", "VERB", "PUNCT"]
assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1] assert [t["head"] for t in tokens] == [1, 2, -1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "name", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "B-PERSON", "L-PERSON", "O", "O"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "B-PERSON", "L-PERSON", "O", "O"]
def test_cli_converters_conllu2json_subtokens(): def test_cli_converters_conllu2json_subtokens():
@ -82,13 +84,15 @@ def test_cli_converters_conllu2json_subtokens():
"5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O", "5\t.\t$.\tPUNCT\t_\t_\t4\tpunct\t_\tname=O",
] ]
input_data = "\n".join(lines) input_data = "\n".join(lines)
converted = conllu2json( converted_docs = conllu2docs(
input_data, n_sents=1, merge_subtokens=True, append_morphology=True input_data, n_sents=1, merge_subtokens=True, append_morphology=True
) )
assert len(converted) == 1 assert len(converted_docs) == 1
converted = [docs_to_json(converted_docs)]
assert converted[0]["id"] == 0 assert converted[0]["id"] == 0
assert len(converted[0]["paragraphs"]) == 1 assert len(converted[0]["paragraphs"]) == 1
assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår." assert converted[0]["paragraphs"][0]["raw"] == "Dommer FE avstår. "
assert len(converted[0]["paragraphs"][0]["sentences"]) == 1 assert len(converted[0]["paragraphs"][0]["sentences"]) == 1
sent = converted[0]["paragraphs"][0]["sentences"][0] sent = converted[0]["paragraphs"][0]["sentences"][0]
assert len(sent["tokens"]) == 4 assert len(sent["tokens"]) == 4
@ -111,7 +115,9 @@ def test_cli_converters_conllu2json_subtokens():
assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."] assert [t["lemma"] for t in tokens] == ["dommer", "Finn Eilertsen", "avstå", "$."]
assert [t["head"] for t in tokens] == [1, 1, 0, -1] assert [t["head"] for t in tokens] == [1, 1, 0, -1]
assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"] assert [t["dep"] for t in tokens] == ["appos", "nsubj", "ROOT", "punct"]
assert [t["ner"] for t in tokens] == ["O", "U-PER", "O", "O"] ent_offsets = [(e[0], e[1], e[2]) for e in converted[0]["paragraphs"][0]["entities"]]
biluo_tags = biluo_tags_from_offsets(converted_docs[0], ent_offsets, missing="O")
assert biluo_tags == ["O", "U-PER", "O", "O"]
def test_cli_converters_iob2json(en_vocab): def test_cli_converters_iob2json(en_vocab):

View File

@ -230,8 +230,7 @@ def test_Example_from_dict_with_links(annots):
[ [
{ {
"words": ["I", "like", "New", "York", "and", "Berlin", "."], "words": ["I", "like", "New", "York", "and", "Berlin", "."],
"entities": [(7, 15, "LOC"), (20, 26, "LOC")], "links": {(7, 14): {"Q7381115": 1.0, "Q2146908": 0.0}},
"links": {(0, 1): {"Q7381115": 1.0, "Q2146908": 0.0}},
} }
], ],
) )

View File

@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
from ..errors import Errors from ..errors import Errors
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH") ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")
class DocBin(object): class DocBin(object):
@ -31,6 +31,7 @@ class DocBin(object):
"spaces": bytes, # Serialized numpy boolean array with spaces data "spaces": bytes, # Serialized numpy boolean array with spaces data
"lengths": bytes, # Serialized numpy int32 array with the doc lengths "lengths": bytes, # Serialized numpy int32 array with the doc lengths
"strings": List[unicode] # List of unique strings in the token data "strings": List[unicode] # List of unique strings in the token data
"version": str, # DocBin version number
} }
Strings for the words, tags, labels etc are represented by 64-bit hashes in Strings for the words, tags, labels etc are represented by 64-bit hashes in
@ -53,12 +54,14 @@ class DocBin(object):
DOCS: https://spacy.io/api/docbin#init DOCS: https://spacy.io/api/docbin#init
""" """
attrs = sorted([intify_attr(attr) for attr in attrs]) attrs = sorted([intify_attr(attr) for attr in attrs])
self.version = "0.1"
self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY] self.attrs = [attr for attr in attrs if attr != ORTH and attr != SPACY]
self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0] self.attrs.insert(0, ORTH) # Ensure ORTH is always attrs[0]
self.tokens = [] self.tokens = []
self.spaces = [] self.spaces = []
self.cats = [] self.cats = []
self.user_data = [] self.user_data = []
self.flags = []
self.strings = set() self.strings = set()
self.store_user_data = store_user_data self.store_user_data = store_user_data
for doc in docs: for doc in docs:
@ -83,12 +86,17 @@ class DocBin(object):
assert array.shape[0] == spaces.shape[0] # this should never happen assert array.shape[0] == spaces.shape[0] # this should never happen
spaces = spaces.reshape((spaces.shape[0], 1)) spaces = spaces.reshape((spaces.shape[0], 1))
self.spaces.append(numpy.asarray(spaces, dtype=bool)) self.spaces.append(numpy.asarray(spaces, dtype=bool))
self.flags.append({
"has_unknown_spaces": doc.has_unknown_spaces
})
for token in doc: for token in doc:
self.strings.add(token.text) self.strings.add(token.text)
self.strings.add(token.tag_) self.strings.add(token.tag_)
self.strings.add(token.lemma_) self.strings.add(token.lemma_)
self.strings.add(token.morph_)
self.strings.add(token.dep_) self.strings.add(token.dep_)
self.strings.add(token.ent_type_) self.strings.add(token.ent_type_)
self.strings.add(token.ent_kb_id_)
self.cats.append(doc.cats) self.cats.append(doc.cats)
if self.store_user_data: if self.store_user_data:
self.user_data.append(srsly.msgpack_dumps(doc.user_data)) self.user_data.append(srsly.msgpack_dumps(doc.user_data))
@ -105,8 +113,11 @@ class DocBin(object):
vocab[string] vocab[string]
orth_col = self.attrs.index(ORTH) orth_col = self.attrs.index(ORTH)
for i in range(len(self.tokens)): for i in range(len(self.tokens)):
flags = self.flags[i]
tokens = self.tokens[i] tokens = self.tokens[i]
spaces = self.spaces[i] spaces = self.spaces[i]
if flags.get("has_unknown_spaces"):
spaces = None
doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces) doc = Doc(vocab, words=tokens[:, orth_col], spaces=spaces)
doc = doc.from_array(self.attrs, tokens) doc = doc.from_array(self.attrs, tokens)
doc.cats = self.cats[i] doc.cats = self.cats[i]
@ -130,6 +141,7 @@ class DocBin(object):
self.spaces.extend(other.spaces) self.spaces.extend(other.spaces)
self.strings.update(other.strings) self.strings.update(other.strings)
self.cats.extend(other.cats) self.cats.extend(other.cats)
self.flags.extend(other.flags)
if self.store_user_data: if self.store_user_data:
self.user_data.extend(other.user_data) self.user_data.extend(other.user_data)
@ -147,12 +159,14 @@ class DocBin(object):
spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([]) spaces = numpy.vstack(self.spaces) if self.spaces else numpy.asarray([])
msg = { msg = {
"version": self.version,
"attrs": self.attrs, "attrs": self.attrs,
"tokens": tokens.tobytes("C"), "tokens": tokens.tobytes("C"),
"spaces": spaces.tobytes("C"), "spaces": spaces.tobytes("C"),
"lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"), "lengths": numpy.asarray(lengths, dtype="int32").tobytes("C"),
"strings": list(self.strings), "strings": list(self.strings),
"cats": self.cats, "cats": self.cats,
"flags": self.flags,
} }
if self.store_user_data: if self.store_user_data:
msg["user_data"] = self.user_data msg["user_data"] = self.user_data
@ -178,6 +192,7 @@ class DocBin(object):
self.tokens = NumpyOps().unflatten(flat_tokens, lengths) self.tokens = NumpyOps().unflatten(flat_tokens, lengths)
self.spaces = NumpyOps().unflatten(flat_spaces, lengths) self.spaces = NumpyOps().unflatten(flat_spaces, lengths)
self.cats = msg["cats"] self.cats = msg["cats"]
self.flags = msg.get("flags", [{} for _ in lengths])
if self.store_user_data and "user_data" in msg: if self.store_user_data and "user_data" in msg:
self.user_data = list(msg["user_data"]) self.user_data = list(msg["user_data"])
for tokens in self.tokens: for tokens in self.tokens:

View File

@ -59,11 +59,14 @@ cdef class Doc:
cdef public dict user_token_hooks cdef public dict user_token_hooks
cdef public dict user_span_hooks cdef public dict user_span_hooks
cdef public bint has_unknown_spaces
cdef public list _py_tokens cdef public list _py_tokens
cdef int length cdef int length
cdef int max_length cdef int max_length
cdef public object noun_chunks_iterator cdef public object noun_chunks_iterator
cdef object __weakref__ cdef object __weakref__

View File

@ -5,6 +5,7 @@ from libc.string cimport memcpy, memset
from libc.math cimport sqrt from libc.math cimport sqrt
from libc.stdint cimport int32_t, uint64_t from libc.stdint cimport int32_t, uint64_t
import copy
from collections import Counter from collections import Counter
import numpy import numpy
import numpy.linalg import numpy.linalg
@ -24,7 +25,7 @@ from ..attrs cimport LENGTH, POS, LEMMA, TAG, MORPH, DEP, HEAD, SPACY, ENT_IOB
from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t from ..attrs cimport ENT_TYPE, ENT_ID, ENT_KB_ID, SENT_START, IDX, attr_id_t
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..attrs import intify_attrs, IDS from ..attrs import intify_attr, intify_attrs, IDS
from ..util import normalize_slice from ..util import normalize_slice
from ..compat import copy_reg, pickle from ..compat import copy_reg, pickle
from ..errors import Errors, Warnings from ..errors import Errors, Warnings
@ -171,8 +172,7 @@ cdef class Doc:
raise ValueError(Errors.E046.format(name=name)) raise ValueError(Errors.E046.format(name=name))
return Underscore.doc_extensions.pop(name) return Underscore.doc_extensions.pop(name)
def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None, def __init__(self, Vocab vocab, words=None, spaces=None, user_data=None):
orths_and_spaces=None):
"""Create a Doc object. """Create a Doc object.
vocab (Vocab): A vocabulary object, which must match any models you vocab (Vocab): A vocabulary object, which must match any models you
@ -214,28 +214,25 @@ cdef class Doc:
self._vector = None self._vector = None
self.noun_chunks_iterator = _get_chunker(self.vocab.lang) self.noun_chunks_iterator = _get_chunker(self.vocab.lang)
cdef bint has_space cdef bint has_space
if orths_and_spaces is None and words is not None: if words is None and spaces is not None:
if spaces is None: raise ValueError("words must be set if spaces is set")
spaces = [True] * len(words) elif spaces is None and words is not None:
elif len(spaces) != len(words): self.has_unknown_spaces = True
raise ValueError(Errors.E027) else:
orths_and_spaces = zip(words, spaces) self.has_unknown_spaces = False
words = words if words is not None else []
spaces = spaces if spaces is not None else ([True] * len(words))
if len(spaces) != len(words):
raise ValueError(Errors.E027)
cdef const LexemeC* lexeme cdef const LexemeC* lexeme
if orths_and_spaces is not None: for word, has_space in zip(words, spaces):
orths_and_spaces = list(orths_and_spaces) if isinstance(word, unicode):
for orth_space in orths_and_spaces: lexeme = self.vocab.get(self.mem, word)
if isinstance(orth_space, unicode): elif isinstance(word, bytes):
lexeme = self.vocab.get(self.mem, orth_space) raise ValueError(Errors.E028.format(value=word))
has_space = True else:
elif isinstance(orth_space, bytes): lexeme = self.vocab.get_by_orth(self.mem, word)
raise ValueError(Errors.E028.format(value=orth_space)) self.push_back(lexeme, has_space)
elif isinstance(orth_space[0], unicode):
lexeme = self.vocab.get(self.mem, orth_space[0])
has_space = orth_space[1]
else:
lexeme = self.vocab.get_by_orth(self.mem, orth_space[0])
has_space = orth_space[1]
self.push_back(lexeme, has_space)
# Tough to decide on policy for this. Is an empty doc tagged and parsed? # Tough to decide on policy for this. Is an empty doc tagged and parsed?
# There's no information we'd like to add to it, so I guess so? # There's no information we'd like to add to it, so I guess so?
if self.length == 0: if self.length == 0:
@ -806,7 +803,7 @@ cdef class Doc:
attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_) attrs = [(IDS[id_.upper()] if hasattr(id_, "upper") else id_)
for id_ in attrs] for id_ in attrs]
if array.dtype != numpy.uint64: if array.dtype != numpy.uint64:
warnings.warn(Warnings.W028.format(type=array.dtype)) warnings.warn(Warnings.W101.format(type=array.dtype))
if SENT_START in attrs and HEAD in attrs: if SENT_START in attrs and HEAD in attrs:
raise ValueError(Errors.E032) raise ValueError(Errors.E032)
@ -882,6 +879,87 @@ cdef class Doc:
set_children_from_heads(self.c, length) set_children_from_heads(self.c, length)
return self return self
@staticmethod
def from_docs(docs, ensure_whitespace=True, attrs=None):
"""Concatenate multiple Doc objects to form a new one. Raises an error if the `Doc` objects do not all share
the same `Vocab`.
docs (list): A list of Doc objects.
ensure_whitespace (bool): Insert a space between two adjacent docs whenever the first doc does not end in whitespace.
attrs (list): Optional list of attribute ID ints or attribute name strings.
RETURNS (Doc): A doc that contains the concatenated docs, or None if no docs were given.
DOCS: https://spacy.io/api/doc#from_docs
"""
if not docs:
return None
vocab = {doc.vocab for doc in docs}
if len(vocab) > 1:
raise ValueError(Errors.E999)
(vocab,) = vocab
if attrs is None:
attrs = [LEMMA, NORM]
if all(doc.is_nered for doc in docs):
attrs.extend([ENT_IOB, ENT_KB_ID, ENT_TYPE])
# TODO: separate for is_morphed?
if all(doc.is_tagged for doc in docs):
attrs.extend([TAG, POS, MORPH])
if all(doc.is_parsed for doc in docs):
attrs.extend([HEAD, DEP])
else:
attrs.append(SENT_START)
else:
if any(isinstance(attr, str) for attr in attrs): # resolve attribute names
attrs = [intify_attr(attr) for attr in attrs] # intify_attr returns None for invalid attrs
attrs = list(attr for attr in set(attrs) if attr) # filter duplicates, remove None if present
if SPACY not in attrs:
attrs.append(SPACY)
concat_words = []
concat_spaces = []
concat_user_data = {}
char_offset = 0
for doc in docs:
concat_words.extend(t.text for t in doc)
concat_spaces.extend(bool(t.whitespace_) for t in doc)
for key, value in doc.user_data.items():
if isinstance(key, tuple) and len(key) == 4:
data_type, name, start, end = key
if start is not None or end is not None:
start += char_offset
if end is not None:
end += char_offset
concat_user_data[(data_type, name, start, end)] = copy.copy(value)
else:
warnings.warn(Warnings.W101.format(name=name))
else:
warnings.warn(Warnings.W102.format(key=key, value=value))
char_offset += len(doc.text) if not ensure_whitespace or doc[-1].is_space else len(doc.text) + 1
arrays = [doc.to_array(attrs) for doc in docs]
if ensure_whitespace:
spacy_index = attrs.index(SPACY)
for i, array in enumerate(arrays[:-1]):
if len(array) > 0 and not docs[i][-1].is_space:
array[-1][spacy_index] = 1
token_offset = -1
for doc in docs[:-1]:
token_offset += len(doc)
if not doc[-1].is_space:
concat_spaces[token_offset] = True
concat_array = numpy.concatenate(arrays)
concat_doc = Doc(vocab, words=concat_words, spaces=concat_spaces, user_data=concat_user_data)
concat_doc.from_array(attrs, concat_array)
return concat_doc
def get_lca_matrix(self): def get_lca_matrix(self):
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
`Doc`, where LCA[i, j] is the index of the lowest common ancestor among `Doc`, where LCA[i, j] is the index of the lowest common ancestor among
@ -1000,6 +1078,7 @@ cdef class Doc:
"sentiment": lambda: self.sentiment, "sentiment": lambda: self.sentiment,
"tensor": lambda: self.tensor, "tensor": lambda: self.tensor,
"cats": lambda: self.cats, "cats": lambda: self.cats,
"has_unknown_spaces": lambda: self.has_unknown_spaces
} }
for key in kwargs: for key in kwargs:
if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"): if key in serializers or key in ("user_data", "user_data_keys", "user_data_values"):
@ -1032,6 +1111,7 @@ cdef class Doc:
"cats": lambda b: None, "cats": lambda b: None,
"user_data_keys": lambda b: None, "user_data_keys": lambda b: None,
"user_data_values": lambda b: None, "user_data_values": lambda b: None,
"has_unknown_spaces": lambda b: None
} }
for key in kwargs: for key in kwargs:
if key in deserializers or key in ("user_data",): if key in deserializers or key in ("user_data",):
@ -1052,6 +1132,8 @@ cdef class Doc:
self.tensor = msg["tensor"] self.tensor = msg["tensor"]
if "cats" not in exclude and "cats" in msg: if "cats" not in exclude and "cats" in msg:
self.cats = msg["cats"] self.cats = msg["cats"]
if "has_unknown_spaces" not in exclude and "has_unknown_spaces" in msg:
self.has_unknown_spaces = msg["has_unknown_spaces"]
start = 0 start = 0
cdef const LexemeC* lex cdef const LexemeC* lex
cdef unicode orth_ cdef unicode orth_
@ -1123,50 +1205,6 @@ cdef class Doc:
remove_label_if_necessary(attributes[i]) remove_label_if_necessary(attributes[i])
retokenizer.merge(span, attributes[i]) retokenizer.merge(span, attributes[i])
def merge(self, int start_idx, int end_idx, *args, **attributes):
"""Retokenize the document, such that the span at
`doc.text[start_idx : end_idx]` is merged into a single token. If
`start_idx` and `end_idx `do not mark start and end token boundaries,
the document remains unchanged.
start_idx (int): Character index of the start of the slice to merge.
end_idx (int): Character index after the end of the slice to merge.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root of the span.
RETURNS (Token): The newly merged token, or `None` if the start and end
indices did not fall at token boundaries.
"""
cdef unicode tag, lemma, ent_type
warnings.warn(Warnings.W013.format(obj="Doc"), DeprecationWarning)
# TODO: ENT_KB_ID ?
if len(args) == 3:
warnings.warn(Warnings.W003, DeprecationWarning)
tag, lemma, ent_type = args
attributes[TAG] = tag
attributes[LEMMA] = lemma
attributes[ENT_TYPE] = ent_type
elif not args:
fix_attributes(self, attributes)
elif args:
raise ValueError(Errors.E034.format(n_args=len(args), args=repr(args),
kwargs=repr(attributes)))
remove_label_if_necessary(attributes)
attributes = intify_attrs(attributes, strings_map=self.vocab.strings)
cdef int start = token_by_start(self.c, self.length, start_idx)
if start == -1:
return None
cdef int end = token_by_end(self.c, self.length, end_idx)
if end == -1:
return None
# Currently we have the token index, we want the range-end index
end += 1
with self.retokenize() as retokenizer:
retokenizer.merge(self[start:end], attrs=attributes)
return self[start]
def print_tree(self, light=False, flat=False):
raise ValueError(Errors.E105)
def to_json(self, underscore=None): def to_json(self, underscore=None):
"""Convert a Doc to JSON. The format it produces will be the new format """Convert a Doc to JSON. The format it produces will be the new format
for the `spacy train` command (not implemented yet). for the `spacy train` command (not implemented yet).

View File

@ -280,18 +280,6 @@ cdef class Span:
return array return array
def merge(self, *args, **attributes):
"""Retokenize the document, such that the span is merged into a single
token.
**attributes: Attributes to assign to the merged token. By default,
attributes are inherited from the syntactic root token of the span.
RETURNS (Token): The newly merged token.
"""
warnings.warn(Warnings.W013.format(obj="Span"), DeprecationWarning)
return self.doc.merge(self.start_char, self.end_char, *args,
**attributes)
def get_lca_matrix(self): def get_lca_matrix(self):
"""Calculates a matrix of Lowest Common Ancestors (LCA) for a given """Calculates a matrix of Lowest Common Ancestors (LCA) for a given
`Span`, where LCA[i, j] is the index of the lowest common ancestor among `Span`, where LCA[i, j] is the index of the lowest common ancestor among
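
Both deprecated merge methods removed in this commit (`Doc.merge` and `Span.merge`) map onto the retokenizer context manager. A minimal sketch of the equivalent call, assuming a blank English pipeline:

```python
# Sketch: merge a span in place with the retokenizer context manager,
# which replaces the removed Doc.merge / Span.merge helpers.
import spacy

nlp = spacy.blank("en")
doc = nlp("Hello New York")
with doc.retokenize() as retokenizer:
    retokenizer.merge(doc[1:3], attrs={"LEMMA": "New York"})
assert len(doc) == 2
```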

View File

@ -349,6 +349,33 @@ array of attributes.
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | Itself. | | **RETURNS** | `Doc` | Itself. |
## Doc.from_docs {#from_docs tag="staticmethod"}
Concatenate multiple `Doc` objects to form a new one. Raises an error if the `Doc` objects do not all share the same `Vocab`.
> #### Example
>
> ```python
> from spacy.tokens import Doc
> texts = ["London is the capital of the United Kingdom.",
> "The River Thames flows through London.",
> "The famous Tower Bridge crosses the River Thames."]
> docs = list(nlp.pipe(texts))
> c_doc = Doc.from_docs(docs)
> assert str(c_doc) == " ".join(texts)
> assert len(list(c_doc.sents)) == len(docs)
> assert [str(ent) for ent in c_doc.ents] == \
> [str(ent) for doc in docs for ent in doc.ents]
> ```
| Name | Type | Description |
| ------------------- | ----- | ----------------------------------------------------------------------------------------------- |
| `docs` | list | A list of `Doc` objects. |
| `ensure_whitespace` | bool | Insert a space between two adjacent docs whenever the first doc does not end in whitespace. |
| `attrs` | list | Optional list of attribute ID ints or attribute name strings. |
| **RETURNS** | `Doc` | The new `Doc` object containing the concatenated docs, or `None` if `docs` is empty or `None`. |
## Doc.to_disk {#to_disk tag="method" new="2"} ## Doc.to_disk {#to_disk tag="method" new="2"}
Save the current state to a directory. Save the current state to a directory.

View File

@ -16,8 +16,9 @@ document from the `DocBin`. The serialization format is gzipped msgpack, where
the msgpack object has the following structure: the msgpack object has the following structure:
```python ```python
### msgpack object strcutrue ### msgpack object structure
{ {
"version": str, # DocBin version number
"attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE] "attrs": List[uint64], # e.g. [TAG, HEAD, ENT_IOB, ENT_TYPE]
"tokens": bytes, # Serialized numpy uint64 array with the token data "tokens": bytes, # Serialized numpy uint64 array with the token data
"spaces": bytes, # Serialized numpy boolean array with spaces data "spaces": bytes, # Serialized numpy boolean array with spaces data
@ -45,7 +46,7 @@ Create a `DocBin` object to hold serialized annotations.
| Argument | Type | Description | | Argument | Type | Description |
| ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `attrs` | list | List of attributes to serialize. `orth` (hash of token text) and `spacy` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `None`. | | `attrs` | list | List of attributes to serialize. `ORTH` (hash of token text) and `SPACY` (whether the token is followed by whitespace) are always serialized, so they're not required. Defaults to `("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH", "POS")`. |
| `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. | | `store_user_data` | bool | Whether to include the `Doc.user_data` and the values of custom extension attributes. Defaults to `False`. |
| **RETURNS** | `DocBin` | The newly constructed object. | | **RETURNS** | `DocBin` | The newly constructed object. |

View File

@ -27,8 +27,7 @@ string where an integer is expected) or unexpected property names.
## Matcher.\_\_call\_\_ {#call tag="method"} ## Matcher.\_\_call\_\_ {#call tag="method"}
Find all token sequences matching the supplied patterns on the `Doc`. As of Find all token sequences matching the supplied patterns on the `Doc` or `Span`.
spaCy v2.3, the `Matcher` can also be called on `Span` objects.
> #### Example > #### Example
> >
@ -37,29 +36,16 @@ spaCy v2.3, the `Matcher` can also be called on `Span` objects.
> >
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> pattern = [{"LOWER": "hello"}, {"LOWER": "world"}] > pattern = [{"LOWER": "hello"}, {"LOWER": "world"}]
> matcher.add("HelloWorld", None, pattern) > matcher.add("HelloWorld", [pattern])
> doc = nlp("hello world!") > doc = nlp("hello world!")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doclike` | `Doc`/`Span` | The document to match over or a `Span` (as of v2.3). | | `doclike` | `Doc`/`Span` | The `Doc` or `Span` to match over. |
| **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. | | **RETURNS** | list | A list of `(match_id, start, end)` tuples, describing the matches. A match tuple describes a span `doc[start:end`]. The `match_id` is the ID of the added match pattern. |
<Infobox title="Important note" variant="warning">
By default, the matcher **does not perform any action** on matches, like tagging
matched phrases with entity types. Instead, actions need to be specified when
**adding patterns or entities**, by passing in a callback function as the
`on_match` argument on [`add`](/api/matcher#add). This allows you to define
custom actions per pattern within the same matcher. For example, you might only
want to merge some entity types, and set custom flags for other matched
patterns. For more details and examples, see the usage guide on
[rule-based matching](/usage/rule-based-matching).
</Infobox>
## Matcher.pipe {#pipe tag="method"} ## Matcher.pipe {#pipe tag="method"}
Match a stream of documents, yielding them in turn. Match a stream of documents, yielding them in turn.
@ -92,7 +78,7 @@ patterns.
> ```python > ```python
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> assert len(matcher) == 0 > assert len(matcher) == 0
> matcher.add("Rule", None, [{"ORTH": "test"}]) > matcher.add("Rule", [[{"ORTH": "test"}]])
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
@ -108,9 +94,9 @@ Check whether the matcher contains rules for a match ID.
> >
> ```python > ```python
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> assert 'Rule' not in matcher > assert "Rule" not in matcher
> matcher.add('Rule', None, [{'ORTH': 'test'}]) > matcher.add("Rule", [[{'ORTH': 'test'}]])
> assert 'Rule' in matcher > assert "Rule" in matcher
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
@ -129,39 +115,39 @@ overwritten.
> #### Example > #### Example
> >
> ```python > ```python
> def on_match(matcher, doc, id, matches): > def on_match(matcher, doc, id, matches):
> print('Matched!', matches) > print('Matched!', matches)
> >
> matcher = Matcher(nlp.vocab) > matcher = Matcher(nlp.vocab)
> matcher.add("HelloWorld", on_match, [{"LOWER": "hello"}, {"LOWER": "world"}]) > patterns = [
> matcher.add("GoogleMaps", on_match, [{"ORTH": "Google"}, {"ORTH": "Maps"}]) > [{"LOWER": "hello"}, {"LOWER": "world"}],
> doc = nlp("HELLO WORLD on Google Maps.") > [{"ORTH": "Google"}, {"ORTH": "Maps"}]
> matches = matcher(doc) > ]
> matcher.add("TEST_PATTERNS", patterns)
> doc = nlp("HELLO WORLD on Google Maps.")
> matches = matcher(doc)
> ``` > ```
| Name | Type | Description | <Infobox title="Changed in v3.0" variant="warning">
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
<Infobox title="Changed in v2.2.2" variant="warning"> As of spaCy v3.0, `Matcher.add` takes a list of patterns as the second argument
As of spaCy 2.2.2, `Matcher.add` also supports the new API, which will become
the default in the future. The patterns are now the second argument and a list
(instead of a variable number of arguments). The `on_match` callback becomes an (instead of a variable number of arguments). The `on_match` callback becomes an
optional keyword argument. optional keyword argument.
```diff ```diff
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]] patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
- matcher.add("GoogleNow", None, *patterns)
+ matcher.add("GoogleNow", patterns)
- matcher.add("GoogleNow", on_match, *patterns) - matcher.add("GoogleNow", on_match, *patterns)
+ matcher.add("GoogleNow", patterns, on_match=on_match) + matcher.add("GoogleNow", patterns, on_match=on_match)
``` ```
</Infobox> </Infobox>
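
Spelled out in full rather than as a diff, and with the setup around it supplied by us, the v3-style call looks like this:

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    print("Matched!", matches[i])

# Patterns are passed as a list; the callback is an optional keyword argument
patterns = [[{"TEXT": "Google"}, {"TEXT": "Now"}], [{"TEXT": "GoogleNow"}]]
matcher.add("GoogleNow", patterns, on_match=on_match)

doc = nlp("GoogleNow was later rebranded as Google Now.")
matches = matcher(doc)
```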
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
## Matcher.remove {#remove tag="method" new="2"} ## Matcher.remove {#remove tag="method" new="2"}
Remove a rule from the matcher. A `KeyError` is raised if the match ID does not Remove a rule from the matcher. A `KeyError` is raised if the match ID does not
@ -170,7 +156,7 @@ exist.
> #### Example > #### Example
> >
> ```python > ```python
> matcher.add("Rule", None, [{"ORTH": "test"}]) > matcher.add("Rule", [[{"ORTH": "test"}]])
> assert "Rule" in matcher > assert "Rule" in matcher
> matcher.remove("Rule") > matcher.remove("Rule")
> assert "Rule" not in matcher > assert "Rule" not in matcher
@ -188,7 +174,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> #### Example > #### Example
> >
> ```python > ```python
> matcher.add("Rule", None, [{"ORTH": "test"}]) > matcher.add("Rule", [[{"ORTH": "test"}]])
> on_match, patterns = matcher.get("Rule") > on_match, patterns = matcher.get("Rule")
> ``` > ```
View File
@ -52,7 +52,7 @@ Find all token sequences matching the supplied patterns on the `Doc`.
> from spacy.matcher import PhraseMatcher > from spacy.matcher import PhraseMatcher
> >
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> doc = nlp("Barack Obama lifts America one last time in emotional farewell") > doc = nlp("Barack Obama lifts America one last time in emotional farewell")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
@ -104,7 +104,7 @@ patterns.
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> assert len(matcher) == 0 > assert len(matcher) == 0
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> assert len(matcher) == 1 > assert len(matcher) == 1
> ``` > ```
@ -121,7 +121,7 @@ Check whether the matcher contains rules for a match ID.
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> assert "OBAMA" not in matcher > assert "OBAMA" not in matcher
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher > assert "OBAMA" in matcher
> ``` > ```
@ -145,36 +145,32 @@ overwritten.
> print('Matched!', matches) > print('Matched!', matches)
> >
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", on_match, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")], on_match=on_match)
> matcher.add("HEALTH", on_match, nlp("health care reform"), > matcher.add("HEALTH", [nlp("health care reform"), nlp("healthcare reform")], on_match=on_match)
> nlp("healthcare reform"))
> doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms") > doc = nlp("Barack Obama urges Congress to find courage to defend his healthcare reforms")
> matches = matcher(doc) > matches = matcher(doc)
> ``` > ```
| Name | Type | Description | <Infobox title="Changed in v3.0" variant="warning">
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
<Infobox title="Changed in v2.2.2" variant="warning"> As of spaCy v3.0, `PhraseMatcher.add` takes a list of patterns as the second
argument (instead of a variable number of arguments). The `on_match` callback
As of spaCy 2.2.2, `PhraseMatcher.add` also supports the new API, which will
become the default in the future. The `Doc` patterns are now the second argument
and a list (instead of a variable number of arguments). The `on_match` callback
becomes an optional keyword argument. becomes an optional keyword argument.
```diff ```diff
patterns = [nlp("health care reform"), nlp("healthcare reform")] patterns = [nlp("health care reform"), nlp("healthcare reform")]
- matcher.add("HEALTH", None, *patterns)
+ matcher.add("HEALTH", patterns)
- matcher.add("HEALTH", on_match, *patterns) - matcher.add("HEALTH", on_match, *patterns)
+ matcher.add("HEALTH", patterns, on_match=on_match) + matcher.add("HEALTH", patterns, on_match=on_match)
``` ```
</Infobox> </Infobox>
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | str | An ID for the thing you're matching. |
| `docs` | list | `Doc` objects of the phrases to match. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
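
A compact sketch of the v3-style call (the `HEALTH` rule and the example sentence mirror the diff above but are otherwise ours):

```python
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)

def on_match(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    print("Matched:", doc[start:end].text)

# Doc patterns are passed as a list; the callback is an optional keyword argument
patterns = [nlp.make_doc("health care reform"), nlp.make_doc("healthcare reform")]
matcher.add("HEALTH", patterns, on_match=on_match)

doc = nlp("He urged Congress to pass healthcare reform this year.")
matches = matcher(doc)
```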
## PhraseMatcher.remove {#remove tag="method" new="2.2"} ## PhraseMatcher.remove {#remove tag="method" new="2.2"}
Remove a rule from the matcher by match ID. A `KeyError` is raised if the key Remove a rule from the matcher by match ID. A `KeyError` is raised if the key
@ -184,7 +180,7 @@ does not exist.
> >
> ```python > ```python
> matcher = PhraseMatcher(nlp.vocab) > matcher = PhraseMatcher(nlp.vocab)
> matcher.add("OBAMA", None, nlp("Barack Obama")) > matcher.add("OBAMA", [nlp("Barack Obama")])
> assert "OBAMA" in matcher > assert "OBAMA" in matcher
> matcher.remove("OBAMA") > matcher.remove("OBAMA")
> assert "OBAMA" not in matcher > assert "OBAMA" not in matcher
View File
@ -407,7 +407,7 @@ class EntityMatcher(object):
def __init__(self, nlp, terms, label): def __init__(self, nlp, terms, label):
patterns = [nlp.make_doc(text) for text in terms] patterns = [nlp.make_doc(text) for text in terms]
self.matcher = PhraseMatcher(nlp.vocab) self.matcher = PhraseMatcher(nlp.vocab)
self.matcher.add(label, None, *patterns) self.matcher.add(label, patterns)
def __call__(self, doc): def __call__(self, doc):
matches = self.matcher(doc) matches = self.matcher(doc)
View File
@ -98,9 +98,7 @@ print([token.text for token in doc])
First, we initialize the `Matcher` with a vocab. The matcher must always share First, we initialize the `Matcher` with a vocab. The matcher must always share
the same vocab with the documents it will operate on. We can now call the same vocab with the documents it will operate on. We can now call
[`matcher.add()`](/api/matcher#add) with an ID and our custom pattern. The [`matcher.add()`](/api/matcher#add) with an ID and a list of patterns.
second argument lets you pass in an optional callback function to invoke on a
successful match. For now, we set it to `None`.
```python ```python
### {executable="true"} ### {executable="true"}
@ -111,7 +109,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
# Add match ID "HelloWorld" with no callback and one pattern # Add match ID "HelloWorld" with no callback and one pattern
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}] pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern) matcher.add("HelloWorld", [pattern])
doc = nlp("Hello, world! Hello world!") doc = nlp("Hello, world! Hello world!")
matches = matcher(doc) matches = matcher(doc)
@ -137,9 +135,11 @@ Optionally, we could also choose to add more than one pattern, for example to
also match sequences without punctuation between "hello" and "world": also match sequences without punctuation between "hello" and "world":
```python ```python
matcher.add("HelloWorld", None, patterns = [
[{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}], [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}],
[{"LOWER": "hello"}, {"LOWER": "world"}]) [{"LOWER": "hello"}, {"LOWER": "world"}]
]
matcher.add("HelloWorld", patterns)
``` ```
By default, the matcher will only return the matches and **not do anything By default, the matcher will only return the matches and **not do anything
@ -413,7 +413,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab, validate=True) matcher = Matcher(nlp.vocab, validate=True)
# Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE # Add match ID "HelloWorld" with unsupported attribute CASEINSENSITIVE
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}] pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"CASEINSENSITIVE": "world"}]
matcher.add("HelloWorld", None, pattern) matcher.add("HelloWorld", [pattern])
# 🚨 Raises an error: # 🚨 Raises an error:
# MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld' # MatchPatternError: Invalid token patterns for matcher rule 'HelloWorld'
# Pattern 0: # Pattern 0:
@ -446,7 +446,7 @@ def add_event_ent(matcher, doc, i, matches):
print(entity.text) print(entity.text)
pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] pattern = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]
matcher.add("GoogleIO", add_event_ent, pattern) matcher.add("GoogleIO", [pattern], on_match=add_event_ent)
doc = nlp("This is a text about Google I/O") doc = nlp("This is a text about Google I/O")
matches = matcher(doc) matches = matcher(doc)
``` ```
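
The hunk above only shows the tail of `add_event_ent`; as a hypothetical reconstruction (not this page's original code), a callback that turns the match into an `EVENT` entity could look like:

```python
from spacy.tokens import Span

def add_event_ent(matcher, doc, i, matches):
    # Get the matched span and append it to the doc's entities
    # (assumes the new span doesn't overlap an existing entity)
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EVENT")
    doc.ents = list(doc.ents) + [entity]
    print(entity.text)
```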
@ -509,19 +509,18 @@ import spacy
from spacy.matcher import Matcher from spacy.matcher import Matcher
from spacy.tokens import Token from spacy.tokens import Token
# We're using a class because the component needs to be initialised with # We're using a class because the component needs to be initialized with
# the shared vocab via the nlp object # the shared vocab via the nlp object
class BadHTMLMerger(object): class BadHTMLMerger(object):
def __init__(self, nlp): def __init__(self, nlp):
patterns = [
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
]
# Register a new token extension to flag bad HTML # Register a new token extension to flag bad HTML
Token.set_extension("bad_html", default=False) Token.set_extension("bad_html", default=False)
self.matcher = Matcher(nlp.vocab) self.matcher = Matcher(nlp.vocab)
self.matcher.add( self.matcher.add("BAD_HTML", patterns)
"BAD_HTML",
None,
[{"ORTH": "<"}, {"LOWER": "br"}, {"ORTH": ">"}],
[{"ORTH": "<"}, {"LOWER": "br/"}, {"ORTH": ">"}],
)
def __call__(self, doc): def __call__(self, doc):
# This method is invoked when the component is called on a Doc # This method is invoked when the component is called on a Doc
@ -616,7 +615,7 @@ def collect_sents(matcher, doc, i, matches):
pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"}, pattern = [{"LOWER": "facebook"}, {"LEMMA": "be"}, {"POS": "ADV", "OP": "*"},
{"POS": "ADJ"}] {"POS": "ADJ"}]
matcher.add("FacebookIs", collect_sents, pattern) # add pattern matcher.add("FacebookIs", [pattern], on_match=collect_sents) # add pattern
doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?") doc = nlp("I'd say that Facebook is evil. Facebook is pretty cool, right?")
matches = matcher(doc) matches = matcher(doc)
@ -671,7 +670,7 @@ nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab) matcher = Matcher(nlp.vocab)
pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"}, pattern = [{"ORTH": "("}, {"SHAPE": "ddd"}, {"ORTH": ")"}, {"SHAPE": "ddd"},
{"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}] {"ORTH": "-", "OP": "?"}, {"SHAPE": "ddd"}]
matcher.add("PHONE_NUMBER", None, pattern) matcher.add("PHONE_NUMBER", [pattern])
doc = nlp("Call me at (123) 456 789 or (123) 456 789!") doc = nlp("Call me at (123) 456 789 or (123) 456 789!")
print([t.text for t in doc]) print([t.text for t in doc])
@ -734,11 +733,11 @@ def label_sentiment(matcher, doc, i, matches):
elif doc.vocab.strings[match_id] == "SAD": elif doc.vocab.strings[match_id] == "SAD":
doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment doc.sentiment -= 0.1 # Subtract 0.1 for negative sentiment
matcher.add("HAPPY", label_sentiment, *pos_patterns) # Add positive pattern matcher.add("HAPPY", pos_patterns, on_match=label_sentiment) # Add positive pattern
matcher.add("SAD", label_sentiment, *neg_patterns) # Add negative pattern matcher.add("SAD", neg_patterns, on_match=label_sentiment) # Add negative pattern
# Add pattern for valid hashtag, i.e. '#' plus any ASCII token # Add pattern for valid hashtag, i.e. '#' plus any ASCII token
matcher.add("HASHTAG", None, [{"ORTH": "#"}, {"IS_ASCII": True}]) matcher.add("HASHTAG", [[{"ORTH": "#"}, {"IS_ASCII": True}]])
doc = nlp("Hello world 😀 #MondayMotivation") doc = nlp("Hello world 😀 #MondayMotivation")
matches = matcher(doc) matches = matcher(doc)
@ -841,7 +840,7 @@ matcher = PhraseMatcher(nlp.vocab)
terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."] terms = ["Barack Obama", "Angela Merkel", "Washington, D.C."]
# Only run nlp.make_doc to speed things up # Only run nlp.make_doc to speed things up
patterns = [nlp.make_doc(text) for text in terms] patterns = [nlp.make_doc(text) for text in terms]
matcher.add("TerminologyList", None, *patterns) matcher.add("TerminologyList", patterns)
doc = nlp("German Chancellor Angela Merkel and US President Barack Obama " doc = nlp("German Chancellor Angela Merkel and US President Barack Obama "
"converse in the Oval Office inside the White House in Washington, D.C.") "converse in the Oval Office inside the White House in Washington, D.C.")
@ -890,7 +889,7 @@ from spacy.matcher import PhraseMatcher
nlp = English() nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="LOWER") matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]] patterns = [nlp.make_doc(name) for name in ["Angela Merkel", "Barack Obama"]]
matcher.add("Names", None, *patterns) matcher.add("Names", patterns)
doc = nlp("angela merkel and us president barack Obama") doc = nlp("angela merkel and us president barack Obama")
for match_id, start, end in matcher(doc): for match_id, start, end in matcher(doc):
@ -924,7 +923,7 @@ from spacy.matcher import PhraseMatcher
nlp = English() nlp = English()
matcher = PhraseMatcher(nlp.vocab, attr="SHAPE") matcher = PhraseMatcher(nlp.vocab, attr="SHAPE")
matcher.add("IP", None, nlp("127.0.0.1"), nlp("127.127.0.0")) matcher.add("IP", [nlp("127.0.0.1"), nlp("127.127.0.0")])
doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.") doc = nlp("Often the router will have an IP address such as 192.168.1.1 or 192.168.2.1.")
for match_id, start, end in matcher(doc): for match_id, start, end in matcher(doc):
View File
@ -751,10 +751,10 @@ matcher = Matcher(nlp.vocab)
def set_sentiment(matcher, doc, i, matches): def set_sentiment(matcher, doc, i, matches):
doc.sentiment += 0.1 doc.sentiment += 0.1
pattern1 = [{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}] pattern1 = [[{"ORTH": "Google"}, {"ORTH": "I"}, {"ORTH": "/"}, {"ORTH": "O"}]]
pattern2 = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]] patterns = [[{"ORTH": emoji, "OP": "+"}] for emoji in ["😀", "😂", "🤣", "😍"]]
matcher.add("GoogleIO", None, pattern1) # Match "Google I/O" or "Google i/o" matcher.add("GoogleIO", patterns1) # Match "Google I/O" or "Google i/o"
matcher.add("HAPPY", set_sentiment, *pattern2) # Match one or more happy emoji matcher.add("HAPPY", patterns2, on_match=set_sentiment) # Match one or more happy emoji
doc = nlp("A text about Google I/O 😀😀") doc = nlp("A text about Google I/O 😀😀")
matches = matcher(doc) matches = matcher(doc)