Merge branch 'develop' into master-tmp

2025-11-04 01:48:04 +03:00 · 2020-06-03 14:36:59 +02:00 · 2020-06-03 14:36:59 +02:00 · 810fce3bb1
commit 810fce3bb1
parent b0ee76264b f74784575c
77 changed files with 1211 additions and 808 deletions
--- a/4
+++ b/4
@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")

 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
 	chmod a+rx $@

 dist/pytest.pex : wheelhouse/pytest-*.whl
@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl

 wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
 	$(VENV)/bin/pip wheel . -w ./wheelhouse
-	$(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse
+	$(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
 	touch $@

 wheelhouse/pytest-%.whl : $(VENV)/bin/pex
--- a/examples/experiments/onto-joint/defaults.cfg
+++ b/examples/experiments/onto-joint/defaults.cfg
@ -0,0 +1,115 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 0
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 400
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+#[optimizer.learn_rate]
+#@schedules = "warmup_linear.v1"
+#warmup_steps = 250
+#total_steps = 20000
+#initial_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.senter]
+factory = "senter"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.senter.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.senter.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 256
+depth = 6
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+subword_features = true
--- a/requirements.txt
+++ b/requirements.txt
@ -13,9 +13,11 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
 tqdm>=4.38.0,<5.0.0
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
 pydantic>=1.3.0,<2.0.0
+# Official Python utilities
+setuptools
+packaging
+importlib_metadata>=0.20; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
--- a/setup.cfg
+++ b/setup.cfg
@ -47,15 +47,17 @@ install_requires =
    wasabi>=0.4.0,<1.1.0
    srsly>=2.0.0,<3.0.0
    catalogue>=0.0.7,<1.1.0
-    ml_datasets
+    ml_datasets>=0.1.1
    # Third-party dependencies
    tqdm>=4.38.0,<5.0.0
-    setuptools
    numpy>=1.15.0
    plac>=0.9.6,<1.2.0
    requests>=2.13.0,<3.0.0
    pydantic>=1.3.0,<2.0.0
-    tqdm>=4.38.0,<5.0.0
+    # Official Python utilities
+    setuptools
+    packaging
+    importlib_metadata>=0.20; python_version < "3.8"

 [options.extras_require]
 lookups =
--- a/spacy/about.py
+++ b/spacy/about.py
@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev8"
+__version__ = "3.0.0.dev9"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
    final entity type with `ner_map` if mapping present. Entity tag is 'O' if
    the pattern is not matched.

-    lines (unicode): CONLL-U lines for one sentences
-    tag_pattern (unicode): Regex pattern for entity tag
+    lines (str): CoNLL-U lines for one sentence
+    tag_pattern (str): Regex pattern for entity tag
    ner_map (dict): Map old NER tag names to new ones, '' maps to O.
    RETURNS (list): List of BILUO entity tags
    """
@ -187,8 +187,8 @@ def example_from_conllu_sentence(
    """Create an Example from the lines for one CoNLL-U sentence, merging
    subtokens and appending morphology to tags if required.

-    lines (unicode): The non-comment lines for a CoNLL-U sentence
-    ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+    lines (str): The non-comment lines for a CoNLL-U sentence
+    ner_tag_pattern (str): The regex pattern for matching NER in MISC col
    RETURNS (Example): An example containing the annotation
    """
    # create a Doc with each subtoken as its own token
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@ -5,6 +5,7 @@ import sys
 from wasabi import msg

 from .. import about
+from ..util import is_package, get_base_version


 def download(
@ -17,7 +18,7 @@ def download(
    flag is set, the command expects the full model name with version.
    For direct downloads, the compatibility check will be skipped.
    """
-    if not require_package("spacy") and "--no-deps" not in pip_args:
+    if not is_package("spacy") and "--no-deps" not in pip_args:
        msg.warn(
            "Skipping model package dependencies and setting `--no-deps`. "
            "You don't seem to have the spaCy package itself installed "
@ -45,21 +46,6 @@ def download(
            "Download and installation successful",
            f"You can now load the model via spacy.load('{model_name}')",
        )
-        # If a model is downloaded and then loaded within the same process, our
-        # is_package check currently fails, because pkg_resources.working_set
-        # is not refreshed automatically (see #3923). We're trying to work
-        # around this here be requiring the package explicitly.
-        require_package(model_name)
-
-
-def require_package(name):
-    try:
-        import pkg_resources
-
-        pkg_resources.working_set.require(name)
-        return True
-    except:  # noqa: E722
-        return False


 def get_json(url, desc):
@ -77,8 +63,7 @@ def get_json(url, desc):


 def get_compatibility():
-    version = about.__version__
-    version = version.rsplit(".dev", 1)[0]
+    version = get_base_version(about.__version__)
    comp_table = get_json(about.__compatibility__, "compatibility table")
    comp = comp_table["spacy"]
    if version not in comp:
@ -87,7 +72,7 @@ def get_compatibility():


 def get_version(model, comp):
-    model = model.rsplit(".dev", 1)[0]
+    model = get_base_version(model)
    if model not in comp:
        msg.fail(
            f"No compatible model found for '{model}' (spaCy v{about.__version__})",
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@ -48,7 +48,9 @@ def info(
        "Location": str(Path(__file__).parent.parent),
        "Platform": platform.platform(),
        "Python version": platform.python_version(),
-        "Models": ", ".join(model["name"] for model in all_models.values()),
+        "Models": ", ".join(
+            f"{m['name']} ({m['version']})" for m in all_models.values()
+        ),
    }
    if not silent:
        title = "Info about spaCy"
@ -63,7 +65,7 @@ def print_markdown(data, title=None):
    """Print data in GitHub-flavoured Markdown format for issues etc.

    data (dict or list of tuples): Label/value pairs.
-    title (unicode or None): Title, will be rendered as headline 2.
+    title (str / None): Title, will be rendered as headline 2.
    """
    markdown = []
    for key, value in data.items():
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
        ("lang", "Model language", meta.get("lang", "en")),
        ("name", "Model name", meta.get("name", "model")),
        ("version", "Model version", meta.get("version", "0.0.0")),
-        ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
        ("description", "Model description", meta.get("description", False)),
        ("author", "Author", meta.get("author", False)),
        ("email", "Author email", meta.get("email", False)),
        ("url", "Author website", meta.get("url", False)),
-        ("license", "License", meta.get("license", "CC BY-SA 3.0")),
+        ("license", "License", meta.get("license", "MIT")),
    ]
    nlp = util.load_model_from_path(Path(model_path))
+    meta["spacy_version"] = util.get_model_version_range(about.__version__)
    meta["pipeline"] = nlp.pipe_names
    meta["vectors"] = {
        "width": nlp.vocab.vectors_length,
@ -168,6 +168,7 @@ def setup_package():
        package_data={model_name: list_files(model_dir)},
        install_requires=list_requirements(meta),
        zip_safe=False,
+        entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
    )


--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@ -483,7 +483,6 @@ def train(
                    # Update model meta.json
                    meta["lang"] = nlp.lang
                    meta["pipeline"] = nlp.pipe_names
-                    meta["spacy_version"] = f">={about.__version__}"
                    if beam_width == 1:
                        meta["speed"] = {
                            "nwords": nwords,
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@ -7,7 +7,7 @@ from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules
-from thinc.api import Model
+from thinc.api import Model, use_pytorch_for_gpu_memory
 import random

 from ..gold import GoldCorpus
@ -171,6 +171,8 @@ def train_from_config(
    msg.info(f"Loading config from: {config_path}")
    config = util.load_config(config_path, create_objects=False)
    util.fix_random_seed(config["training"]["seed"])
+    if config["training"]["use_pytorch_for_gpu_memory"]:
+        use_pytorch_for_gpu_memory()
    nlp_config = config["nlp"]
    config = util.load_config(config_path, create_objects=True)
    msg.info("Creating nlp from config")
@ -213,6 +215,12 @@ def train_from_config(
                if is_best_checkpoint and output_path is not None:
                    nlp.to_disk(output_path)
                progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
+            # Clean up the objects to facilitate garbage collection.
+            for eg in batch:
+                eg.doc = None
+                eg.goldparse = None
+                eg.doc_annotation = None
+                eg.token_annotation = None
    finally:
        if output_path is not None:
            final_model_path = output_path / "model-final"
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@ -4,6 +4,8 @@ import requests
 from wasabi import msg

 from .. import about
+from ..util import get_package_version, get_installed_models, get_base_version
+from ..util import get_package_path, get_model_meta, is_compatible_version


 def validate():
@ -12,7 +14,7 @@ def validate():
    with the installed models. Should be run after `pip install -U spacy`.
    """
    model_pkgs, compat = get_model_pkgs()
-    spacy_version = about.__version__.rsplit(".dev", 1)[0]
+    spacy_version = get_base_version(about.__version__)
    current_compat = compat.get(spacy_version, {})
    if not current_compat:
        msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
@ -25,7 +27,7 @@ def validate():
    msg.info(f"spaCy installation: {spacy_dir}")

    if model_pkgs:
-        header = ("NAME", "VERSION", "")
+        header = ("NAME", "SPACY", "VERSION", "")
        rows = []
        for name, data in model_pkgs.items():
            if data["compat"]:
@ -34,7 +36,7 @@ def validate():
            else:
                version = msg.text(data["version"], color="red", no_print=True)
                comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
-            rows.append((data["name"], version, comp))
+            rows.append((data["name"], data["spacy"], version, comp))
        msg.table(rows, header=header)
    else:
        msg.text("No models found in your current environment.", exits=0)
@ -44,8 +46,9 @@ def validate():
        cmd = "python -m spacy download {}"
        print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
    if na_models:
-        msg.warn(
-            f"The following models are not available for spaCy v{about.__version__}:",
+        msg.info(
+            f"The following models are custom spaCy models or not "
+            f"available for spaCy v{about.__version__}:",
            ", ".join(na_models),
        )
    if incompat_models:
@ -53,8 +56,6 @@ def validate():


 def get_model_pkgs():
-    import pkg_resources
-
    with msg.loading("Loading compatibility table..."):
        r = requests.get(about.__compatibility__)
        if r.status_code != 200:
@ -66,19 +67,28 @@ def get_model_pkgs():
    msg.good("Loaded compatibility table")
    compat = r.json()["spacy"]
    all_models = set()
+    installed_models = get_installed_models()
    for spacy_v, models in dict(compat).items():
        all_models.update(models.keys())
        for model, model_vs in models.items():
            compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
    pkgs = {}
-    for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
+    for pkg_name in installed_models:
        package = pkg_name.replace("-", "_")
-        if package in all_models:
-            version = pkg_data.version
+        version = get_package_version(pkg_name)
+        if package in compat:
+            is_compat = version in compat[package]
+            spacy_version = about.__version__
+        else:
+            model_path = get_package_path(package)
+            model_meta = get_model_meta(model_path)
+            spacy_version = model_meta.get("spacy_version", "n/a")
+            is_compat = is_compatible_version(about.__version__, spacy_version)
        pkgs[pkg_name] = {
            "name": package,
            "version": version,
-                "compat": package in compat and version in compat[package],
+            "spacy": spacy_version,
+            "compat": is_compat,
        }
    return pkgs, compat

--- a/spacy/displacy/init.py
+++ b/spacy/displacy/init.py
@ -22,13 +22,13 @@ def render(
    """Render displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
-    style (unicode): Visualisation style, 'dep' or 'ent'.
+    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    jupyter (bool): Override Jupyter auto-detection.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
-    RETURNS (unicode): Rendered HTML markup.
+    RETURNS (str): Rendered HTML markup.

    DOCS: https://spacy.io/api/top-level#displacy.render
    USAGE: https://spacy.io/usage/visualizers
@ -73,13 +73,13 @@ def serve(
    """Serve displaCy visualisation.

    docs (list or Doc): Document(s) to visualise.
-    style (unicode): Visualisation style, 'dep' or 'ent'.
+    style (str): Visualisation style, 'dep' or 'ent'.
    page (bool): Render markup as full HTML page.
    minify (bool): Minify HTML markup.
    options (dict): Visualiser-specific options, e.g. colors.
    manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
    port (int): Port to serve visualisation.
-    host (unicode): Host to serve visualisation.
+    host (str): Host to serve visualisation.

    DOCS: https://spacy.io/api/top-level#displacy.serve
    USAGE: https://spacy.io/usage/visualizers
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@ -47,7 +47,7 @@ class DependencyRenderer(object):
        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
-        RETURNS (unicode): Rendered SVG or HTML markup.
+        RETURNS (str): Rendered SVG or HTML markup.
        """
        # Create a random ID prefix to make sure parses don't receive the
        # same ID, even if they're identical
@ -78,7 +78,7 @@ class DependencyRenderer(object):
        render_id (int): Unique ID, typically index of document.
        words (list): Individual words and their tags.
        arcs (list): Individual arcs and their start, end, direction and label.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
        """
        self.levels = self.get_levels(arcs)
        self.highest_level = len(self.levels)
@ -112,10 +112,10 @@ class DependencyRenderer(object):
    ):
        """Render individual word.

-        text (unicode): Word text.
-        tag (unicode): Part-of-speech tag.
+        text (str): Word text.
+        tag (str): Part-of-speech tag.
        i (int): Unique ID, typically word index.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
        """
        y = self.offset_y + self.word_spacing
        x = self.offset_x + i * self.distance
@ -131,12 +131,12 @@ class DependencyRenderer(object):
    def render_arrow(self, label, start, end, direction, i):
        """Render individual arrow.

-        label (unicode): Dependency label.
+        label (str): Dependency label.
        start (int): Index of start word.
        end (int): Index of end word.
-        direction (unicode): Arrow direction, 'left' or 'right'.
+        direction (str): Arrow direction, 'left' or 'right'.
        i (int): Unique ID, typically arrow index.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
        """
        if start < 0 or end < 0:
            error_args = dict(start=start, end=end, label=label, dir=direction)
@ -179,7 +179,7 @@ class DependencyRenderer(object):
        y (int): Y-coordinate of arrow start and end point.
        y_curve (int): Y-corrdinate of Cubic Bézier y_curve point.
        x_end (int): X-coordinate of arrow end point.
-        RETURNS (unicode): Definition of the arc path ('d' attribute).
+        RETURNS (str): Definition of the arc path ('d' attribute).
        """
        template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
        if self.compact:
@ -189,11 +189,11 @@ class DependencyRenderer(object):
    def get_arrowhead(self, direction, x, y, end):
        """Render individual arrow head.

-        direction (unicode): Arrow direction, 'left' or 'right'.
+        direction (str): Arrow direction, 'left' or 'right'.
        x (int): X-coordinate of arrow start point.
        y (int): Y-coordinate of arrow start and end point.
        end (int): X-coordinate of arrow end point.
-        RETURNS (unicode): Definition of the arrow head path ('d' attribute).
+        RETURNS (str): Definition of the arrow head path ('d' attribute).
        """
        if direction == "left":
            pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@ -279,7 +279,7 @@ class EntityRenderer(object):
        parsed (list): Dependency parses to render.
        page (bool): Render parses wrapped as full HTML page.
        minify (bool): Minify HTML markup.
-        RETURNS (unicode): Rendered HTML markup.
+        RETURNS (str): Rendered HTML markup.
        """
        rendered = []
        for i, p in enumerate(parsed):
@ -300,9 +300,9 @@ class EntityRenderer(object):
    def render_ents(self, text, spans, title):
        """Render entities in text.

-        text (unicode): Original text.
+        text (str): Original text.
        spans (list): Individual entity spans and their start, end and label.
-        title (unicode or None): Document title set in Doc.user_data['title'].
+        title (str / None): Document title set in Doc.user_data['title'].
        """
        markup = ""
        offset = 0
--- a/spacy/errors.py
+++ b/spacy/errors.py
@ -113,9 +113,12 @@ class Warnings(object):
            "ignored during training.")

    # TODO: fix numbering after merging develop into master
-    W095 = ("Skipping unsupported morphological feature(s): {feature}. "
-            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
-            "string \"Field1=Value1,Value2|Field2=Value3\".")
+    W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
+            "incompatible with the current version ({current}). This may lead "
+            "to unexpected results or runtime errors. To resolve this, "
+            "download a newer compatible model or retrain your custom model "
+            "with the current spaCy version. For more details and available "
+            "updates, run: python -m spacy validate")
    W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
            "instead.")
    W097 = ("No Model config was provided to create the '{name}' component, "
@ -124,6 +127,9 @@ class Warnings(object):
            "so a default configuration was used.")
    W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
            "but got '{type}' instead, so ignoring it.")
+    W100 = ("Skipping unsupported morphological feature(s): {feature}. "
+            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
+            "string \"Field1=Value1,Value2|Field2=Value3\".")


@add_codes
@ -621,7 +627,7 @@ class MatchPatternError(ValueError):
    def __init__(self, key, errors):
        """Custom error for validating match patterns.

-        key (unicode): The name of the matcher rule.
+        key (str): The name of the matcher rule.
        errors (dict): Validation errors (sequence of strings) mapped to pattern
            ID, i.e. the index of the added pattern.
        """
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@ -1,8 +1,8 @@
 def explain(term):
    """Get a description for a given POS tag, dependency label or entity type.

-    term (unicode): The term to explain.
-    RETURNS (unicode): The explanation, or `None` if not found in the glossary.
+    term (str): The term to explain.
+    RETURNS (str / None): The explanation, or `None` if not found in the glossary.

    EXAMPLE:
        >>> spacy.explain(u'NORP')
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@ -154,8 +154,8 @@ class GoldCorpus(object):
    def __init__(self, train, dev, gold_preproc=False, limit=None):
        """Create a GoldCorpus.

-        train (unicode or Path): File or directory of training data.
-        dev (unicode or Path): File or directory of development data.
+        train (str / Path): File or directory of training data.
+        dev (str / Path): File or directory of development data.
        RETURNS (GoldCorpus): The newly created object.
        """
        self.limit = limit
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@ -38,7 +38,7 @@ cdef class Candidate:

    @property
    def entity_(self):
-        """RETURNS (unicode): ID/name of this entity in the KB"""
+        """RETURNS (str): ID/name of this entity in the KB"""
        return self.kb.vocab.strings[self.entity_hash]

    @property
@ -48,7 +48,7 @@ cdef class Candidate:

    @property
    def alias_(self):
-        """RETURNS (unicode): ID of the original alias"""
+        """RETURNS (str): ID of the original alias"""
        return self.kb.vocab.strings[self.alias_hash]

    @property
--- a/spacy/language.py
+++ b/spacy/language.py
@ -17,7 +17,8 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
-from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import count_pipeline_interdependencies
 from .gold import Example
 from .scorer import Scorer
 from .util import link_vectors_to_models, create_default_optimizer, registry
@ -127,7 +128,7 @@ class Language(object):

    Defaults (class): Settings, data and factory methods for creating the `nlp`
        object and processing pipeline.
-    lang (unicode): Two-letter language ID, i.e. ISO code.
+    lang (str): Two-letter language ID, i.e. ISO code.

    DOCS: https://spacy.io/api/language
    """
@ -196,13 +197,14 @@ class Language(object):

    @property
    def meta(self):
+        spacy_version = util.get_model_version_range(about.__version__)
        if self.vocab.lang:
            self._meta.setdefault("lang", self.vocab.lang)
        else:
            self._meta.setdefault("lang", self.lang)
        self._meta.setdefault("name", "model")
        self._meta.setdefault("version", "0.0.0")
-        self._meta.setdefault("spacy_version", f">={about.__version__}")
+        self._meta.setdefault("spacy_version", spacy_version)
        self._meta.setdefault("description", "")
        self._meta.setdefault("author", "")
        self._meta.setdefault("email", "")
@ -292,7 +294,7 @@ class Language(object):
    def get_pipe(self, name):
        """Get a pipeline component for a given component name.

-        name (unicode): Name of pipeline component to get.
+        name (str): Name of pipeline component to get.
        RETURNS (callable): The pipeline component.

        DOCS: https://spacy.io/api/language#get_pipe
@ -305,7 +307,7 @@ class Language(object):
    def create_pipe(self, name, config=dict()):
        """Create a pipeline component from a factory.

-        name (unicode): Factory name to look up in `Language.factories`.
+        name (str): Factory name to look up in `Language.factories`.
        config (dict): Configuration parameters to initialise component.
        RETURNS (callable): Pipeline component.

@ -348,12 +350,12 @@ class Language(object):
        of before/after/first/last can be set. Default behaviour is "last".

        component (callable): The pipeline component.
-        name (unicode): Name of pipeline component. Overwrites existing
+        name (str): Name of pipeline component. Overwrites existing
            component.name attribute if available. If no name is set and
            the component exposes no name attribute, component.__name__ is
            used. An error is raised if a name already exists in the pipeline.
-        before (unicode): Component name to insert component directly before.
-        after (unicode): Component name to insert component directly after.
+        before (str): Component name to insert component directly before.
+        after (str): Component name to insert component directly after.
        first (bool): Insert component first / not first in the pipeline.
        last (bool): Insert component last / not last in the pipeline.

@ -394,7 +396,7 @@ class Language(object):
        """Check if a component name is present in the pipeline. Equivalent to
        `name in nlp.pipe_names`.

-        name (unicode): Name of the component.
+        name (str): Name of the component.
        RETURNS (bool): Whether a component of the name exists in the pipeline.

        DOCS: https://spacy.io/api/language#has_pipe
@ -404,7 +406,7 @@ class Language(object):
    def replace_pipe(self, name, component):
        """Replace a component in the pipeline.

-        name (unicode): Name of the component to replace.
+        name (str): Name of the component to replace.
        component (callable): Pipeline component.

        DOCS: https://spacy.io/api/language#replace_pipe
@ -423,8 +425,8 @@ class Language(object):
    def rename_pipe(self, old_name, new_name):
        """Rename a pipeline component.

-        old_name (unicode): Name of the component to rename.
-        new_name (unicode): New name of the component.
+        old_name (str): Name of the component to rename.
+        new_name (str): New name of the component.

        DOCS: https://spacy.io/api/language#rename_pipe
        """
@ -438,7 +440,7 @@ class Language(object):
    def remove_pipe(self, name):
        """Remove a component from the pipeline.

-        name (unicode): Name of the component to remove.
+        name (str): Name of the component to remove.
        RETURNS (tuple): A `(name, component)` tuple of the removed component.

        DOCS: https://spacy.io/api/language#remove_pipe
@ -455,7 +457,7 @@ class Language(object):
        and can contain arbitrary whitespace. Alignment into the original string
        is preserved.

-        text (unicode): The text to be processed.
+        text (str): The text to be processed.
        disable (list): Names of the pipeline components to disable.
        component_cfg (dict): An optional dictionary with extra keyword arguments
            for specific components.
@ -564,13 +566,14 @@ class Language(object):

        if component_cfg is None:
            component_cfg = {}
+        component_deps = count_pipeline_interdependencies(self.pipeline)
        # Determine whether component should set annotations. In theory I guess
        # we should do this by inspecting the meta? Or we could just always
        # say "yes"
-        for name, proc in self.pipeline:
+        for i, (name, proc) in enumerate(self.pipeline):
            component_cfg.setdefault(name, {})
            component_cfg[name].setdefault("drop", drop)
-            component_cfg[name].setdefault("set_annotations", False)
+            component_cfg[name]["set_annotations"] = bool(component_deps[i])
        for name, proc in self.pipeline:
            if not hasattr(proc, "update"):
                continue
@ -938,7 +941,7 @@ class Language(object):
        """Save the current state to a directory.  If a model is loaded, this
        will include the model.

-        path (unicode or Path): Path to a directory, which will be created if
+        path (str / Path): Path to a directory, which will be created if
            it doesn't exist.
        exclude (list): Names of components or serialization fields to exclude.

@ -972,7 +975,7 @@ class Language(object):
        returns it. If the saved `Language` object contains a model, the
        model will be loaded.

-        path (unicode or Path): A path to a directory.
+        path (str / Path): A path to a directory.
        exclude (list): Names of components or serialization fields to exclude.
        RETURNS (Language): The modified `Language` object.

@ -1090,7 +1093,7 @@ class component(object):
    ):
        """Decorate a pipeline component.

-        name (unicode): Default component and factory name.
+        name (str): Default component and factory name.
        assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
        requires (list): Attributes required by component, e.g. `["token.dep"]`.
        retokenizes (bool): Whether the component changes the tokenization.
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@ -30,8 +30,8 @@ class Lemmatizer(object):
    def __call__(self, string, univ_pos, morphology=None):
        """Lemmatize a string.

-        string (unicode): The string to lemmatize, e.g. the token text.
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        string (str): The string to lemmatize, e.g. the token text.
+        univ_pos (str / int): The token's universal part-of-speech tag.
        morphology (dict): The token's morphological features following the
            Universal Dependencies scheme.
        RETURNS (list): The available lemmas for the string.
@ -69,7 +69,7 @@ class Lemmatizer(object):
        Check whether we're dealing with an uninflected paradigm, so we can
        avoid lemmatization entirely.

-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
        morphology (dict): The token's morphological features following the
            Universal Dependencies scheme.
        """
@ -126,10 +126,10 @@ class Lemmatizer(object):
        """Look up a lemma in the table, if available. If no lemma is found,
        the original string is returned.

-        string (unicode): The original string.
+        string (str): The original string.
        orth (int): Optional hash of the string to look up. If not set, the
            string will be used and hashed.
-        RETURNS (unicode): The lemma if the string was found, otherwise the
+        RETURNS (str): The lemma if the string was found, otherwise the
            original string.
        """
        lookup_table = self.lookups.get_table("lemma_lookup", {})
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@ -164,7 +164,7 @@ cdef class Lexeme:
            self.vocab.set_vector(self.c.orth, vector)

    property rank:
-        """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
+        """RETURNS (str): Sequential ID of the lexemes's lexical type, used
            to index into tables, e.g. for word vectors."""
        def __get__(self):
            return self.c.id
@ -187,18 +187,18 @@ cdef class Lexeme:

    @property
    def orth_(self):
-        """RETURNS (unicode): The original verbatim text of the lexeme
+        """RETURNS (str): The original verbatim text of the lexeme
            (identical to `Lexeme.text`). Exists mostly for consistency with
            the other attributes."""
        return self.vocab.strings[self.c.orth]

    @property
    def text(self):
-        """RETURNS (unicode): The original verbatim text of the lexeme."""
+        """RETURNS (str): The original verbatim text of the lexeme."""
        return self.orth_

    property lower:
-        """RETURNS (unicode): Lowercase form of the lexeme."""
+        """RETURNS (str): Lowercase form of the lexeme."""
        def __get__(self):
            return self.c.lower

@ -281,7 +281,7 @@ cdef class Lexeme:
            prob_table[self.c.orth] = x

    property lower_:
-        """RETURNS (unicode): Lowercase form of the word."""
+        """RETURNS (str): Lowercase form of the word."""
        def __get__(self):
            return self.vocab.strings[self.c.lower]

@ -289,7 +289,7 @@ cdef class Lexeme:
            self.c.lower = self.vocab.strings.add(x)

    property norm_:
-        """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
+        """RETURNS (str): The lexemes's norm, i.e. a normalised form of the
            lexeme text.
        """
        def __get__(self):
@ -299,7 +299,7 @@ cdef class Lexeme:
            self.norm = self.vocab.strings.add(x)

    property shape_:
-        """RETURNS (unicode): Transform of the word's string, to show
+        """RETURNS (str): Transform of the word's string, to show
            orthographic features.
        """
        def __get__(self):
@ -309,7 +309,7 @@ cdef class Lexeme:
            self.c.shape = self.vocab.strings.add(x)

    property prefix_:
-        """RETURNS (unicode): Length-N substring from the start of the word.
+        """RETURNS (str): Length-N substring from the start of the word.
            Defaults to `N=1`.
        """
        def __get__(self):
@ -319,7 +319,7 @@ cdef class Lexeme:
            self.c.prefix = self.vocab.strings.add(x)

    property suffix_:
-        """RETURNS (unicode): Length-N substring from the end of the word.
+        """RETURNS (str): Length-N substring from the end of the word.
            Defaults to `N=3`.
        """
        def __get__(self):
@ -329,7 +329,7 @@ cdef class Lexeme:
            self.c.suffix = self.vocab.strings.add(x)

    property lang_:
-        """RETURNS (unicode): Language of the parent vocabulary."""
+        """RETURNS (str): Language of the parent vocabulary."""
        def __get__(self):
            return self.vocab.strings[self.c.lang]

--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@ -31,7 +31,7 @@ class Lookups(object):
        """Check if the lookups contain a table of a given name. Delegates to
        Lookups.has_table.

-        name (unicode): Name of the table.
+        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name is in the lookups.
        """
        return self.has_table(name)
@ -48,7 +48,7 @@ class Lookups(object):
    def add_table(self, name, data=SimpleFrozenDict()):
        """Add a new table to the lookups. Raises an error if the table exists.

-        name (unicode): Unique name of table.
+        name (str): Unique name of table.
        data (dict): Optional data to add to the table.
        RETURNS (Table): The newly added table.

@ -64,7 +64,7 @@ class Lookups(object):
        """Get a table. Raises an error if the table doesn't exist and no
        default value is provided.

-        name (unicode): Name of the table.
+        name (str): Name of the table.
        default: Optional default value to return if table doesn't exist.
        RETURNS (Table): The table.

@ -79,7 +79,7 @@ class Lookups(object):
    def remove_table(self, name):
        """Remove a table. Raises an error if the table doesn't exist.

-        name (unicode): Name of the table to remove.
+        name (str): Name of the table to remove.
        RETURNS (Table): The removed table.

        DOCS: https://spacy.io/api/lookups#remove_table
@ -91,7 +91,7 @@ class Lookups(object):
    def has_table(self, name):
        """Check if the lookups contain a table of a given name.

-        name (unicode): Name of the table.
+        name (str): Name of the table.
        RETURNS (bool): Whether a table of that name exists.

        DOCS: https://spacy.io/api/lookups#has_table
@ -125,7 +125,7 @@ class Lookups(object):
        """Save the lookups to a directory as lookups.bin. Expects a path to a
        directory, which will be created if it doesn't exist.

-        path (unicode / Path): The file path.
+        path (str / Path): The file path.

        DOCS: https://spacy.io/api/lookups#to_disk
        """
@ -141,7 +141,7 @@ class Lookups(object):
        """Load lookups from a directory containing a lookups.bin. Will skip
        loading if the file doesn't exist.

-        path (unicode / Path): The directory path.
+        path (str / Path): The directory path.
        RETURNS (Lookups): The loaded lookups.

        DOCS: https://spacy.io/api/lookups#from_disk
@ -167,7 +167,7 @@ class Table(OrderedDict):
        """Initialize a new table from a dict.

        data (dict): The dictionary.
-        name (unicode): Optional table name for reference.
+        name (str): Optional table name for reference.
        RETURNS (Table): The newly created object.

        DOCS: https://spacy.io/api/lookups#table.from_dict
@ -179,7 +179,7 @@ class Table(OrderedDict):
    def __init__(self, name=None, data=None):
        """Initialize a new table.

-        name (unicode): Optional table name for reference.
+        name (str): Optional table name for reference.
        data (dict): Initial data, used to hint Bloom Filter.
        RETURNS (Table): The newly created object.

@ -197,7 +197,7 @@ class Table(OrderedDict):
    def __setitem__(self, key, value):
        """Set new key/value pair. String keys will be hashed.

-        key (unicode / int): The key to set.
+        key (str / int): The key to set.
        value: The value to set.
        """
        key = get_string_id(key)
@ -208,7 +208,7 @@ class Table(OrderedDict):
        """Set new key/value pair. String keys will be hashed.
        Same as table[key] = value.

-        key (unicode / int): The key to set.
+        key (str / int): The key to set.
        value: The value to set.
        """
        self[key] = value
@ -216,7 +216,7 @@ class Table(OrderedDict):
    def __getitem__(self, key):
        """Get the value for a given key. String keys will be hashed.

-        key (unicode / int): The key to get.
+        key (str / int): The key to get.
        RETURNS: The value.
        """
        key = get_string_id(key)
@ -225,7 +225,7 @@ class Table(OrderedDict):
    def get(self, key, default=None):
        """Get the value for a given key. String keys will be hashed.

-        key (unicode / int): The key to get.
+        key (str / int): The key to get.
        default: The default value to return.
        RETURNS: The value.
        """
@ -235,7 +235,7 @@ class Table(OrderedDict):
    def __contains__(self, key):
        """Check whether a key is in the table. String keys will be hashed.

-        key (unicode / int): The key to check.
+        key (str / int): The key to check.
        RETURNS (bool): Whether the key is in the table.
        """
        key = get_string_id(key)
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@ -66,7 +66,7 @@ cdef class DependencyMatcher:
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

-        key (unicode): The match ID.
+        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns
@ -194,7 +194,7 @@ cdef class DependencyMatcher:
    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.

-        key (unicode or int): The key to retrieve.
+        key (str / int): The key to retrieve.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@ -64,7 +64,7 @@ cdef class Matcher:
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

-        key (unicode): The match ID.
+        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns
@ -98,7 +98,7 @@ cdef class Matcher:
        number of arguments). The on_match callback becomes an optional keyword
        argument.

-        key (unicode): The match ID.
+        key (str): The match ID.
        patterns (list): The patterns to add for the given key.
        on_match (callable): Optional callback executed on match.
        *_patterns (list): For backwards compatibility: list of patterns to add
@ -139,7 +139,7 @@ cdef class Matcher:
        """Remove a rule from the matcher. A KeyError is raised if the key does
        not exist.

-        key (unicode): The ID of the match rule.
+        key (str): The ID of the match rule.
        """
        norm_key = self._normalize_key(key)
        if not norm_key in self._patterns:
@ -166,7 +166,7 @@ cdef class Matcher:
    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.

-        key (unicode or int): The key to retrieve.
+        key (str / int): The key to retrieve.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@ -30,7 +30,7 @@ cdef class PhraseMatcher:
        """Initialize the PhraseMatcher.

        vocab (Vocab): The shared vocabulary.
-        attr (int / unicode): Token attribute to match on.
+        attr (int / str): Token attribute to match on.
        validate (bool): Perform additional validation when patterns are added.
        RETURNS (PhraseMatcher): The newly constructed object.

@ -70,7 +70,7 @@ cdef class PhraseMatcher:
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

-        key (unicode): The match ID.
+        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.

        DOCS: https://spacy.io/api/phrasematcher#contains
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
        """Remove a rule from the matcher by match ID. A KeyError is raised if
        the key does not exist.

-        key (unicode): The match ID.
+        key (str): The match ID.

        DOCS: https://spacy.io/api/phrasematcher#remove
        """
@ -159,7 +159,7 @@ cdef class PhraseMatcher:
        number of arguments). The on_match callback becomes an optional keyword
        argument.

-        key (unicode): The match ID.
+        key (str): The match ID.
        docs (list): List of `Doc` objects representing match patterns.
        on_match (callable): Callback executed on match.
        *_docs (Doc): For backwards compatibility: list of patterns to add
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@ -15,10 +15,10 @@ def build_tb_parser_model(
    use_upper=True,
    nO=None,
 ):
-    token_vector_width = tok2vec.get_dim("nO")
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec = chain(
        tok2vec,
-        with_array(Linear(hidden_width, token_vector_width)),
+        with_array(Linear(hidden_width, t2v_width)),
        list2array(),
    )
    tok2vec.set_dim("nO", hidden_width)
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@ -6,9 +6,9 @@ from ...util import registry

@registry.architectures.register("spacy.Tagger.v1")
 def build_tagger_model(tok2vec, nO=None) -> Model:
-    token_vector_width = tok2vec.get_dim("nO")
    # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
-    output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init)
+    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+    output_layer = Softmax(nO, t2v_width, init_W=zero_init)
    softmax = with_array(output_layer)
    model = chain(tok2vec, softmax)
    model.set_ref("tok2vec", tok2vec)
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@ -38,8 +38,8 @@ def forward(model, X, is_train):


 def init(model, X=None, Y=None):
-    tok2vec = model.get_ref("tok2vec").initialize()
-    lower = model.get_ref("lower").initialize(X=X)
+    tok2vec = model.get_ref("tok2vec").initialize(X=X)
+    lower = model.get_ref("lower").initialize()
    if model.attrs["has_upper"]:
        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
        model.get_ref("upper").initialize(X=statevecs)
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@ -198,8 +198,8 @@ cdef class Morphology:
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

-        tag (unicode): The part-of-speech tag to key the exception.
-        orth (unicode): The word-form to key the exception.
+        tag (str): The part-of-speech tag to key the exception.
+        orth (str): The word-form to key the exception.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
--- a/spacy/pipe_analysis.py
+++ b/spacy/pipe_analysis.py
@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
    fulfilled (e.g. if previous components assign the attributes).

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
-    name (unicode): The name of the pipeline component to analyze.
+    name (str): The name of the pipeline component to analyze.
    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.
@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
-    attr (unicode): The attribute to check.
+    attr (str): The attribute to check.
    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "assigns")
@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
    """Get all pipeline components that require an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
-    attr (unicode): The attribute to check.
+    attr (str): The attribute to check.
    RETURNS (list): (name, pipeline) tuples of components that require the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "requires")
@ -173,3 +173,22 @@ def print_summary(nlp, pretty=True, no_print=False):
        msg.good("No problems found.")
    if no_print:
        return {"overview": overview, "problems": problems}
+
+
+def count_pipeline_interdependencies(pipeline):
+    """Count, for each component, how many later components depend on it.
+
+    A later component is counted once if any attribute in its `requires`
+    also appears in this component's `assigns`; components without those
+    attributes are treated as assigning / requiring nothing.
+
+    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
+    RETURNS (list): One count per component, in pipeline order.
+    """
+    procs = [proc for _, proc in pipeline]
+    provided = [set(getattr(proc, "assigns", [])) for proc in procs]
+    needed = [set(getattr(proc, "requires", [])) for proc in procs]
+    return [
+        sum(1 for wants in needed[idx + 1 :] if provides & wants)
+        for idx, provides in enumerate(provided)
+    ]
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@ -30,7 +30,7 @@ class EntityRuler(object):

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
-        phrase_matcher_attr (int / unicode): Token attribute to match on, passed
+        phrase_matcher_attr (int / str): Token attribute to match on, passed
            to the internal PhraseMatcher as `attr`
        validate (bool): Whether patterns should be validated, passed to
            Matcher and PhraseMatcher as `validate`
@ -315,7 +315,7 @@ class EntityRuler(object):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

-        path (unicode / Path): The JSONL file to load.
+        path (str / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.

        RETURNS (EntityRuler): The loaded entity ruler.
@ -351,7 +351,7 @@ class EntityRuler(object):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

-        path (unicode / Path): The JSONL file to save.
+        path (str / Path): The JSONL file to save.
        **kwargs: Other config paramters, mostly for consistency.

        DOCS: https://spacy.io/api/entityruler#to_disk
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
    """Merge subtokens into a single token.

    doc (Doc): The Doc object.
-    label (unicode): The subtoken dependency label.
+    label (str): The subtoken dependency label.
    RETURNS (Doc): The Doc object with merged subtokens.

    DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@ -531,7 +531,16 @@ class Tagger(Pipe):
                                          vocab.morphology.lemmatizer,
                                          exc=vocab.morphology.exc)
        self.set_output(len(self.labels))
-        self.model.initialize()
+        doc_sample = [Doc(self.vocab, words=["hello", "world"])]
+        if pipeline is not None:
+            for name, component in pipeline:
+                if component is self:
+                    break
+                if hasattr(component, "pipe"):
+                    doc_sample = list(component.pipe(doc_sample))
+                else:
+                    doc_sample = [component(doc) for doc in doc_sample]
+        self.model.initialize(X=doc_sample)
        # Get batch of example docs, example outputs to call begin_training().
        # This lets the model infer shapes.
        link_vectors_to_models(self.vocab)
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@ -109,7 +109,7 @@ cdef class StringStore:
        """Retrieve a string from a given hash, or vice versa.

        string_or_id (bytes, unicode or uint64): The value to encode.
-        Returns (unicode or uint64): The value to be retrieved.
+        Returns (str / uint64): The value to be retrieved.
        """
        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
            return 0
@ -152,7 +152,7 @@ cdef class StringStore:
    def add(self, string):
        """Add a string to the StringStore.

-        string (unicode): The string to add.
+        string (str): The string to add.
        RETURNS (uint64): The string's hash value.
        """
        if isinstance(string, unicode):
@ -179,7 +179,7 @@ cdef class StringStore:
    def __contains__(self, string not None):
        """Check whether a string is in the store.

-        string (unicode): The string to check.
+        string (str): The string to check.
        RETURNS (bool): Whether the store contains the string.
        """
        cdef hash_t key
@ -205,7 +205,7 @@ cdef class StringStore:
    def __iter__(self):
        """Iterate over the strings in the store, in order.

-        YIELDS (unicode): A string in the store.
+        YIELDS (str): A string in the store.
        """
        cdef int i
        cdef hash_t key
@ -223,7 +223,7 @@ cdef class StringStore:
    def to_disk(self, path):
        """Save the current state to a directory.

-        path (unicode or Path): A path to a directory, which will be created if
+        path (str / Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or Path-like objects.
        """
        path = util.ensure_path(path)
@ -234,7 +234,7 @@ cdef class StringStore:
        """Loads state from a directory. Modifies the object in place and
        returns it.

-        path (unicode or Path): A path to a directory. Paths may be either
+        path (str / Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        RETURNS (StringStore): The modified `StringStore` object.
        """
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@ -624,12 +624,25 @@ cdef class Parser:
            sgd = self.create_optimizer()
        doc_sample = []
        gold_sample = []
-        for example in islice(get_examples(), 1000):
+        for example in islice(get_examples(), 10):
            parses = example.get_gold_parses(merge=False, vocab=self.vocab)
            for doc, gold in parses:
+                if len(doc):
                    doc_sample.append(doc)
                    gold_sample.append(gold)
-        self.model.initialize(doc_sample, gold_sample)
+
+        if pipeline is not None:
+            for name, component in pipeline:
+                if component is self:
+                    break
+                if hasattr(component, "pipe"):
+                    doc_sample = list(component.pipe(doc_sample))
+                else:
+                    doc_sample = [component(doc) for doc in doc_sample]
+        if doc_sample:
+            self.model.initialize(doc_sample)
+        else:
+            self.model.initialize()
        if pipeline is not None:
            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
        link_vectors_to_models(self.vocab)
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@ -9,7 +9,6 @@ def test_build_dependencies():
        "pytest-timeout",
        "mock",
        "flake8",
-        "jsonschema",
    ]
    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]

--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@ -1,7 +1,8 @@
 import spacy.language
 from spacy.language import Language, component
-from spacy.analysis import print_summary, validate_attrs
-from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.pipe_analysis import print_summary, validate_attrs
+from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.pipe_analysis import count_pipeline_interdependencies
 from mock import Mock, ANY
 import pytest

@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe():
    with pytest.warns(None) as record:
        nlp.remove_pipe("c2")
    assert not record.list
+
+
+def test_pipe_interdependencies():
+    """Only the component whose output is required later gets a count."""
+    fancy = ("doc._.fancy",)
+
+    class Fancifier:
+        name, assigns, requires = "fancifier", fancy, ()
+
+    class FancyNeeder:
+        name, assigns, requires = "needer", (), fancy
+
+    producer, consumer = Fancifier(), FancyNeeder()
+    pipeline = [(producer.name, producer), (consumer.name, consumer)]
+    # "fancifier" assigns what "needer" requires; nothing follows "needer".
+    assert count_pipeline_interdependencies(pipeline) == [1, 0]
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@ -2,9 +2,11 @@ import pytest
 import os
 import ctypes
 from pathlib import Path
+from spacy.about import __version__ as spacy_version
 from spacy import util
 from spacy import prefer_gpu, require_gpu
-from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding


@pytest.fixture
@ -24,10 +26,12 @@ def test_util_ensure_path_succeeds(text):
    assert isinstance(path, Path)


-@pytest.mark.parametrize("package", ["numpy"])
-def test_util_is_package(package):
+@pytest.mark.parametrize(
+    "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
+)
+def test_util_is_package(package, result):
    """Test that an installed package via pip is recognised by util.is_package."""
-    assert util.is_package(package)
+    assert util.is_package(package) is result


@pytest.mark.parametrize("package", ["thinc"])
@ -87,3 +91,21 @@ def test_ascii_filenames():
    root = Path(__file__).parent.parent
    for path in root.glob("**/*"):
        assert all(ord(c) < 128 for c in path.name), path.name
+
+
+_COMPAT_CASES = [  # (version, constraint, expected is_compatible_version result)
+    (spacy_version, spacy_version, True),
+    (spacy_version, f">={spacy_version}", True),
+    ("3.0.0", "2.0.0", False),
+    ("3.2.1", ">=2.0.0", True),
+    ("2.2.10a1", ">=1.0.0,<2.1.1", False),
+    ("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
+    ("n/a", ">=1.2.3,<4.5.6", None),  # "n/a" on either side expects None
+    ("1.2.3", "n/a", None),
+    ("n/a", "n/a", None),
+]
+
+
+@pytest.mark.parametrize("version,constraint,compatible", _COMPAT_CASES)
+def test_is_compatible_version(version, constraint, compatible):
+    assert util.is_compatible_version(version, constraint) is compatible
--- a/spacy/tests/test_util.py
+++ b/spacy/tests/test_util.py
@ -0,0 +1,59 @@
+import pytest
+from spacy.gold import Example
+
+from .util import get_random_doc
+
+from spacy.util import minibatch_by_words
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 400, 199], [3]),
+        ([400, 400, 199, 3], [4]),
+        ([400, 400, 199, 3, 200], [3, 2]),
+        ([400, 400, 199, 3, 1], [5]),
+        ([400, 400, 199, 3, 1, 1500], [5]),  # the 1500-word doc is discarded
+        ([400, 400, 199, 3, 1, 200], [3, 3]),
+        ([400, 400, 199, 3, 1, 999], [3, 3]),
+        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+        ([1, 2, 999], [3]),
+        ([1, 2, 999, 1], [4]),
+        ([1, 200, 999, 1], [2, 2]),
+        ([1, 999, 200, 1], [2, 2]),
+    ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+    """Batches stay below size + tolerance when oversize docs are discarded."""
+    tol, batch_size = 0.2, 1000
+    examples = [Example(doc=get_random_doc(n_words)) for n_words in doc_sizes]
+    batcher = minibatch_by_words(
+        examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
+    )
+    batches = list(batcher)
+    assert [len(batch) for batch in batches] == expected_batches
+    max_size = batch_size + batch_size * tol
+    assert all(sum(len(ex.doc) for ex in batch) < max_size for batch in batches)
+
+
+@pytest.mark.parametrize(
+    "doc_sizes, expected_batches",
+    [
+        ([400, 4000, 199], [1, 2]),
+        ([400, 400, 199, 3000, 200], [1, 4]),
+        ([400, 400, 199, 3, 1, 1500], [1, 5]),
+        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+        ([1, 2, 9999], [1, 2]),
+        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+    ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+    """Oversized documents are kept and placed in a batch of their own."""
+    tol, batch_size = 0.2, 1000
+    examples = [Example(doc=get_random_doc(n_words)) for n_words in doc_sizes]
+    batcher = minibatch_by_words(
+        examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
+    )
+    assert [len(batch) for batch in batcher] == expected_batches
+
+
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@ -92,6 +92,13 @@ def get_batch(batch_size):
    return docs


+def get_random_doc(n_words):
+    """Build a Doc on a fresh Vocab whose words are "0" .. str(n_words - 1)."""
+    # Despite the name, the words are sequential numbers: easy to track.
+    words = list(map(str, range(n_words)))
+    return Doc(Vocab(), words=words)
+
+
 def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state."""
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@ -134,7 +134,7 @@ cdef class Tokenizer:
    def __call__(self, unicode string):
        """Tokenize a string.

-        string (unicode): The string to tokenize.
+        string (str): The string to tokenize.
        RETURNS (Doc): A container for linguistic annotations.

        DOCS: https://spacy.io/api/tokenizer#call
@ -147,7 +147,7 @@ cdef class Tokenizer:
    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
        """Tokenize according to affix and token_match settings.

-        string (unicode): The string to tokenize.
+        string (str): The string to tokenize.
        RETURNS (Doc): A container for linguistic annotations.
        """
        if len(string) >= (2 ** 30):
@ -527,7 +527,7 @@ cdef class Tokenizer:
    def find_infix(self, unicode string):
        """Find internal split points of the string, such as hyphens.

-        string (unicode): The string to segment.
+        string (str): The string to segment.
        RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
            and `.end()` methods, denoting the placement of internal segment
            separators, e.g. hyphens.
@ -542,7 +542,7 @@ cdef class Tokenizer:
        """Find the length of a prefix that should be segmented from the
        string, or None if no prefix rules match.

-        string (unicode): The string to segment.
+        string (str): The string to segment.
        RETURNS (int): The length of the prefix if present, otherwise `None`.

        DOCS: https://spacy.io/api/tokenizer#find_prefix
@ -556,7 +556,7 @@ cdef class Tokenizer:
        """Find the length of a suffix that should be segmented from the
        string, or None if no suffix rules match.

-        string (unicode): The string to segment.
+        string (str): The string to segment.
        Returns (int): The length of the suffix if present, otherwise `None`.

        DOCS: https://spacy.io/api/tokenizer#find_suffix
@ -576,7 +576,7 @@ cdef class Tokenizer:
    def _validate_special_case(self, chunk, substrings):
        """Check whether the `ORTH` fields match the string.

-        string (unicode): The string to specially tokenize.
+        string (str): The string to specially tokenize.
        substrings (iterable): A sequence of dicts, where each dict describes
            a token and its attributes.
        """
@ -588,7 +588,7 @@ cdef class Tokenizer:
    def add_special_case(self, unicode string, substrings):
        """Add a special-case tokenization rule.

-        string (unicode): The string to specially tokenize.
+        string (str): The string to specially tokenize.
        substrings (iterable): A sequence of dicts, where each dict describes
            a token and its attributes. The `ORTH` fields of the attributes
            must exactly match the string when they are concatenated.
@ -629,7 +629,7 @@ cdef class Tokenizer:
        produced are identical to `nlp.tokenizer()` except for whitespace
        tokens.

-        string (unicode): The string to tokenize.
+        string (str): The string to tokenize.
        RETURNS (list): A list of (pattern_string, token_string) tuples

        DOCS: https://spacy.io/api/tokenizer#explain
@ -693,7 +693,7 @@ cdef class Tokenizer:
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

-        path (unicode or Path): A path to a directory, which will be created if
+        path (str / Path): A path to a directory, which will be created if
            it doesn't exist.
        exclude (list): String names of serialization fields to exclude.

@ -707,7 +707,7 @@ cdef class Tokenizer:
        """Loads state from a directory. Modifies the object in place and
        returns it.

-        path (unicode or Path): A path to a directory.
+        path (str / Path): A path to a directory.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Tokenizer): The modified `Tokenizer` object.

--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@ -117,7 +117,7 @@ cdef class Doc:
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Doc._`.

-        name (unicode): Name of the attribute to set.
+        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.
@ -135,7 +135,7 @@ cdef class Doc:
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/doc#get_extension
@ -146,7 +146,7 @@ cdef class Doc:
    def has_extension(cls, name):
        """Check whether an extension has been registered.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/doc#has_extension
@ -157,7 +157,7 @@ cdef class Doc:
    def remove_extension(cls, name):
        """Remove a previously registered extension.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

@ -483,7 +483,7 @@ cdef class Doc:
    def text(self):
        """A unicode representation of the document text.

-        RETURNS (unicode): The original verbatim text of the document.
+        RETURNS (str): The original verbatim text of the document.
        """
        return "".join(t.text_with_ws for t in self)

@ -492,7 +492,7 @@ cdef class Doc:
        """An alias of `Doc.text`, provided for duck-type compatibility with
        `Span` and `Token`.

-        RETURNS (unicode): The original verbatim text of the document.
+        RETURNS (str): The original verbatim text of the document.
        """
        return self.text

@ -637,7 +637,7 @@ cdef class Doc:

    @property
    def lang_(self):
-        """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
+        """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
        return self.vocab.lang

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
@ -852,7 +852,7 @@ cdef class Doc:
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

-        path (unicode or Path): A path to a directory, which will be created if
+        path (str / Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or Path-like objects.
        exclude (list): String names of serialization fields to exclude.

@ -866,7 +866,7 @@ cdef class Doc:
        """Loads state from a directory. Modifies the object in place and
        returns it.

-        path (unicode or Path): A path to a directory. Paths may be either
+        path (str / Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Doc): The modified `Doc` object.
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@ -33,7 +33,7 @@ cdef class Span:
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Span._`.

-        name (unicode): Name of the attribute to set.
+        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.
@ -51,7 +51,7 @@ cdef class Span:
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/span#get_extension
@ -62,7 +62,7 @@ cdef class Span:
    def has_extension(cls, name):
        """Check whether an extension has been registered.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/span#has_extension
@ -73,7 +73,7 @@ cdef class Span:
    def remove_extension(cls, name):
        """Remove a previously registered extension.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

@ -491,7 +491,7 @@ cdef class Span:

    @property
    def text(self):
-        """RETURNS (unicode): The original verbatim text of the span."""
+        """RETURNS (str): The original verbatim text of the span."""
        text = self.text_with_ws
        if self[-1].whitespace_:
            text = text[:-1]
@ -502,7 +502,7 @@ cdef class Span:
        """The text content of the span with a trailing whitespace character if
        the last token has one.

-        RETURNS (unicode): The text content of the span (with trailing
+        RETURNS (str): The text content of the span (with trailing
            whitespace).
        """
        return "".join([t.text_with_ws for t in self])
@ -678,7 +678,7 @@ cdef class Span:
            raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))

    property ent_id_:
-        """RETURNS (unicode): The (string) entity ID."""
+        """RETURNS (str): The (string) entity ID."""
        def __get__(self):
            return self.root.ent_id_

@ -690,12 +690,12 @@ cdef class Span:
        """Verbatim text content (identical to `Span.text`). Exists mostly for
        consistency with other attributes.

-        RETURNS (unicode): The span's text."""
+        RETURNS (str): The span's text."""
        return self.text

    @property
    def lemma_(self):
-        """RETURNS (unicode): The span's lemma."""
+        """RETURNS (str): The span's lemma."""
        return " ".join([t.lemma_ for t in self]).strip()

    @property
@ -714,7 +714,7 @@ cdef class Span:
        return "".join([t.text_with_ws for t in self])

    property label_:
-        """RETURNS (unicode): The span's label."""
+        """RETURNS (str): The span's label."""
        def __get__(self):
            return self.doc.vocab.strings[self.label]

@ -724,7 +724,7 @@ cdef class Span:
            raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))

    property kb_id_:
-        """RETURNS (unicode): The named entity's KB ID."""
+        """RETURNS (str): The named entity's KB ID."""
        def __get__(self):
            return self.doc.vocab.strings[self.kb_id]

--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@ -36,7 +36,7 @@ cdef class Token:
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Token._`.

-        name (unicode): Name of the attribute to set.
+        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.
@ -54,7 +54,7 @@ cdef class Token:
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/token#get_extension
@ -65,7 +65,7 @@ cdef class Token:
    def has_extension(cls, name):
        """Check whether an extension has been registered.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/token#has_extension
@ -76,7 +76,7 @@ cdef class Token:
    def remove_extension(cls, name):
        """Remove a previously registered extension.

-        name (unicode): Name of the extension.
+        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

@ -244,12 +244,12 @@ cdef class Token:

    @property
    def text(self):
-        """RETURNS (unicode): The original verbatim text of the token."""
+        """RETURNS (str): The original verbatim text of the token."""
        return self.orth_

    @property
    def text_with_ws(self):
-        """RETURNS (unicode): The text content of the span (with trailing
+        """RETURNS (str): The text content of the token (with trailing
            whitespace).
        """
        cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -762,7 +762,7 @@ cdef class Token:
            self.c.ent_type = ent_type

    property ent_type_:
-        """RETURNS (unicode): Named entity type."""
+        """RETURNS (str): Named entity type."""
        def __get__(self):
            return self.vocab.strings[self.c.ent_type]

@ -785,7 +785,7 @@ cdef class Token:
        and "" means no entity tag is set. "B" with an empty ent_type
        means that the token is blocked from further processing by NER.

-        RETURNS (unicode): IOB code of named entity tag.
+        RETURNS (str): IOB code of named entity tag.
        """
        iob_strings = ("", "I", "O", "B")
        return iob_strings[self.c.ent_iob]
@ -801,7 +801,7 @@ cdef class Token:
            self.c.ent_id = key

    property ent_id_:
-        """RETURNS (unicode): ID of the entity the token is an instance of,
+        """RETURNS (str): ID of the entity the token is an instance of,
            if any.
        """
        def __get__(self):
@ -819,7 +819,7 @@ cdef class Token:
            self.c.ent_kb_id = ent_kb_id

    property ent_kb_id_:
-        """RETURNS (unicode): Named entity KB ID."""
+        """RETURNS (str): Named entity KB ID."""
        def __get__(self):
            return self.vocab.strings[self.c.ent_kb_id]

@ -828,12 +828,12 @@ cdef class Token:

    @property
    def whitespace_(self):
-        """RETURNS (unicode): The trailing whitespace character, if present."""
+        """RETURNS (str): The trailing whitespace character, if present."""
        return " " if self.c.spacy else ""

    @property
    def orth_(self):
-        """RETURNS (unicode): Verbatim text content (identical to
+        """RETURNS (str): Verbatim text content (identical to
            `Token.text`). Exists mostly for consistency with the other
            attributes.
        """
@ -841,13 +841,13 @@ cdef class Token:

    @property
    def lower_(self):
-        """RETURNS (unicode): The lowercase token text. Equivalent to
+        """RETURNS (str): The lowercase token text. Equivalent to
            `Token.text.lower()`.
        """
        return self.vocab.strings[self.c.lex.lower]

    property norm_:
-        """RETURNS (unicode): The token's norm, i.e. a normalised form of the
+        """RETURNS (str): The token's norm, i.e. a normalised form of the
            token text. Usually set in the language's tokenizer exceptions or
            norm exceptions.
        """
@ -859,34 +859,34 @@ cdef class Token:

    @property
    def shape_(self):
-        """RETURNS (unicode): Transform of the tokens's string, to show
+        """RETURNS (str): Transform of the token's string, to show
            orthographic features. For example, "Xxxx" or "dd".
        """
        return self.vocab.strings[self.c.lex.shape]

    @property
    def prefix_(self):
-        """RETURNS (unicode): A length-N substring from the start of the token.
+        """RETURNS (str): A length-N substring from the start of the token.
            Defaults to `N=1`.
        """
        return self.vocab.strings[self.c.lex.prefix]

    @property
    def suffix_(self):
-        """RETURNS (unicode): A length-N substring from the end of the token.
+        """RETURNS (str): A length-N substring from the end of the token.
            Defaults to `N=3`.
        """
        return self.vocab.strings[self.c.lex.suffix]

    @property
    def lang_(self):
-        """RETURNS (unicode): Language of the parent document's vocabulary,
+        """RETURNS (str): Language of the parent document's vocabulary,
            e.g. 'en'.
        """
        return self.vocab.strings[self.c.lex.lang]

    property lemma_:
-        """RETURNS (unicode): The token lemma, i.e. the base form of the word,
+        """RETURNS (str): The token lemma, i.e. the base form of the word,
            with no inflectional suffixes.
        """
        def __get__(self):
@ -899,7 +899,7 @@ cdef class Token:
            self.c.lemma = self.vocab.strings.add(lemma_)

    property pos_:
-        """RETURNS (unicode): Coarse-grained part-of-speech tag."""
+        """RETURNS (str): Coarse-grained part-of-speech tag."""
        def __get__(self):
            return parts_of_speech.NAMES[self.c.pos]

@ -907,7 +907,7 @@ cdef class Token:
            self.c.pos = parts_of_speech.IDS[pos_name]

    property tag_:
-        """RETURNS (unicode): Fine-grained part-of-speech tag."""
+        """RETURNS (str): Fine-grained part-of-speech tag."""
        def __get__(self):
            return self.vocab.strings[self.c.tag]

@ -915,7 +915,7 @@ cdef class Token:
            self.tag = self.vocab.strings.add(tag)

    property dep_:
-        """RETURNS (unicode): The syntactic dependency label."""
+        """RETURNS (str): The syntactic dependency label."""
        def __get__(self):
            return self.vocab.strings[self.c.dep]

--- a/spacy/util.py
+++ b/spacy/util.py
@ -15,6 +15,8 @@ import srsly
 import catalogue
 import sys
 import warnings
+from packaging.specifiers import SpecifierSet, InvalidSpecifier
+from packaging.version import Version, InvalidVersion


 try:
@ -22,9 +24,16 @@ try:
 except ImportError:
    cupy = None

+try:  # Python 3.8
+    import importlib.metadata as importlib_metadata
+except ImportError:
+    import importlib_metadata
+
 from .symbols import ORTH
 from .compat import cupy, CudaStream
 from .errors import Errors, Warnings
+from . import about
+

 _PRINT_ENV = False
 OOV_RANK = numpy.iinfo(numpy.uint64).max
@ -37,6 +46,10 @@ class registry(thinc.registry):
    factories = catalogue.create("spacy", "factories", entry_points=True)
    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
    assets = catalogue.create("spacy", "assets", entry_points=True)
+    # This is mostly used to get a list of all installed models in the current
+    # environment. spaCy models packaged with `spacy package` will "advertise"
+    # themselves via entry points.
+    models = catalogue.create("spacy", "models", entry_points=True)


 def set_env_log(value):
@ -49,7 +62,7 @@ def lang_class_is_loaded(lang):
    loaded lazily, to avoid expensive setup code associated with the language
    data.

-    lang (unicode): Two-letter language code, e.g. 'en'.
+    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (bool): Whether a Language class has been loaded.
    """
    return lang in registry.languages
@ -58,7 +71,7 @@ def lang_class_is_loaded(lang):
 def get_lang_class(lang):
    """Import and load a Language class.

-    lang (unicode): Two-letter language code, e.g. 'en'.
+    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (Language): Language class.
    """
    # Check if language is registered / entry point is available
@ -76,7 +89,7 @@ def get_lang_class(lang):
 def set_lang_class(name, cls):
    """Set a custom Language class name that can be loaded via get_lang_class.

-    name (unicode): Name of Language class.
+    name (str): Name of Language class.
    cls (Language): Language class.
    """
    registry.languages.register(name, func=cls)
@ -98,7 +111,7 @@ def load_language_data(path):
    """Load JSON language data using the given path as a base. If the provided
    path isn't present, will attempt to load a gzipped version before giving up.

-    path (unicode / Path): The data to load.
+    path (str / Path): The data to load.
    RETURNS: The loaded data.
    """
    path = ensure_path(path)
@ -119,7 +132,7 @@ def get_module_path(module):
 def load_model(name, **overrides):
    """Load a model from a package or data path.

-    name (unicode): Package name or model path.
+    name (str): Package name or model path.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with the loaded model.
    """
@ -193,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

-    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+    init_file (str): Path to model's __init__.py, i.e. `__file__`.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with loaded model.
    """
@ -206,11 +219,74 @@ def load_model_from_init_py(init_file, **overrides):
    return load_model_from_path(data_path, meta, **overrides)


+def get_installed_models():
+    """Collect the names of all model packages installed in the environment.
+
+    RETURNS (list): The string names found in the `registry.models` registry.
+    """
+    return [model_name for model_name in registry.models.get_all()]
+
+
+def get_package_version(name):
+    """Look up the version of an installed package, e.g. a model package.
+
+    name (str): The name of the installed Python package.
+    RETURNS (str / None): The version or None if package not installed.
+    """
+    try:
+        installed = importlib_metadata.version(name)
+    except importlib_metadata.PackageNotFoundError:
+        installed = None
+    return installed
+
+
+def is_compatible_version(version, constraint, prereleases=True):
+    """Check if a version (e.g. "2.0.0") is compatible given a version
+    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
+    it's interpreted as =={version}.
+
+    version (str): The version to check.
+    constraint (str): The constraint string. May be empty (no restriction).
+    prereleases (bool): Whether to allow prereleases. If set to False,
+        prerelease versions will be considered incompatible.
+    RETURNS (bool / None): Whether the version is compatible, or None if the
+        version or constraint are invalid.
+    """
+    # A bare version is an exact pin. Guard the index so "" doesn't raise.
+    if constraint and constraint[0].isdigit():
+        constraint = f"=={constraint}"
+    try:
+        spec = SpecifierSet(constraint)
+        version = Version(version)
+    except (InvalidSpecifier, InvalidVersion):
+        return None
+    spec.prereleases = prereleases
+    return version in spec
+
+
+def get_model_version_range(spacy_version):
+    """Build the range of compatible versions for a given spaCy version, e.g.
+    >=1.2.3,<1.3.0. Models stay compatible across patch versions, but not
+    across minor or major versions."""
+    parts = Version(spacy_version).release
+    upper_bound = f"{parts[0]}.{parts[1] + 1}.0"  # first release of next minor
+    return f">={spacy_version},<{upper_bound}"
+
+
+def get_base_version(version):
+    """Return the parsed version's base version, without prerelease identifiers.
+
+    version (str): The version, e.g. "3.0.0.dev1".
+    RETURNS (str): The base version, e.g. "3.0.0".
+    """
+    return Version(version).base_version
+
+
 def load_config(path, create_objects=False):
    """Load a Thinc-formatted config file, optionally filling in objects where
    the config references registry entries. See "Thinc config files" for details.

-    path (unicode or Path): Path to the config file
+    path (str / Path): Path to the config file
    create_objects (bool): Whether to automatically create objects when the config
        references registry entries. Defaults to False.

@ -227,7 +303,7 @@ def load_config_from_str(string, create_objects=False):
    """Load a Thinc-formatted config, optionally filling in objects where
    the config references registry entries. See "Thinc config files" for details.

-    string (unicode or Path): Text contents of the config file.
+    string (str): Text contents of the config file.
    create_objects (bool): Whether to automatically create objects when the config
        references registry entries. Defaults to False.

@ -243,7 +319,7 @@ def load_config_from_str(string, create_objects=False):
 def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.

-    path (unicode or Path): Path to model directory.
+    path (str / Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    """
    model_path = ensure_path(path)
@ -256,13 +332,23 @@ def get_model_meta(path):
    for setting in ["lang", "name", "version"]:
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))
+    if "spacy_version" in meta:
+        if not is_compatible_version(about.__version__, meta["spacy_version"]):
+            warnings.warn(
+                Warnings.W095.format(
+                    model=f"{meta['lang']}_{meta['name']}",
+                    model_version=meta["version"],
+                    version=meta["spacy_version"],
+                    current=about.__version__,
+                )
+            )
    return meta


 def get_model_config(path):
    """Get the model's config from a directory path.

-    path (unicode or Path): Path to model directory.
+    path (str / Path): Path to model directory.
    RETURNS (Config): The model's config data.
    """
    model_path = ensure_path(path)
@ -279,23 +365,20 @@ def get_model_config(path):
 def is_package(name):
    """Check if string maps to a package installed via pip.

-    name (unicode): Name of package.
+    name (str): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
-    import pkg_resources
-
-    name = name.lower()  # compare package name against lowercase name
-    packages = pkg_resources.working_set.by_key.keys()
-    for package in packages:
-        if package.lower().replace("-", "_") == name:
+    try:
+        importlib_metadata.distribution(name)
        return True
+    except Exception:  # any failed lookup means no; Ctrl-C still propagates
        return False


 def get_package_path(name):
    """Get the path to an installed package.

-    name (unicode): Package name.
+    name (str): Package name.
    RETURNS (Path): Path to installed package.
    """
    name = name.lower()  # use lowercase version to be safe
@ -470,8 +553,8 @@ def expand_exc(excs, search, replace):
    For example, to add additional versions with typographic apostrophes.

    excs (dict): Tokenizer exceptions.
-    search (unicode): String to find and replace.
-    replace (unicode): Replacement.
+    search (str): String to find and replace.
+    replace (str): Replacement.
    RETURNS (dict): Combined tokenizer exceptions.
    """

@ -575,41 +658,73 @@ def decaying(start, stop, decay):
        curr -= decay


-def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
+def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
    """Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
-    themselves."""
+    themselves, or be discarded if discard_oversize=True."""
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    elif isinstance(size, List):
        size_ = iter(size)
    else:
        size_ = size
-    examples = iter(examples)
-    oversize = []
-    while True:
-        batch_size = next(size_)
-        tol_size = batch_size * 0.2
+
+    target_size = next(size_)
+    tol_size = target_size * tolerance
    batch = []
-        if oversize:
-            example = oversize.pop(0)
+    overflow = []
+    batch_size = 0
+    overflow_size = 0
+
+    for example in examples:
        n_words = count_words(example.doc)
+        # if the current example exceeds the maximum batch size, it is returned separately
+        # but only if discard_oversize=False.
+        if n_words > target_size + tol_size:
+            if not discard_oversize:
+                yield [example]
+
+        # add the example to the current batch if there's no overflow yet and it still fits
+        elif overflow_size == 0 and (batch_size + n_words) <= target_size:
            batch.append(example)
-            batch_size -= n_words
-        while batch_size >= 1:
-            try:
-                example = next(examples)
-            except StopIteration:
-                if batch:
-                    yield batch
-                return
-            n_words = count_words(example.doc)
-            if n_words < (batch_size + tol_size):
-                batch_size -= n_words
-                batch.append(example)
+            batch_size += n_words
+
+        # add the example to the overflow buffer if it fits in the tolerance margin
+        elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
+            overflow.append(example)
+            overflow_size += n_words
+
+        # yield the previous batch (never an empty one) and start a new one from the overflow examples.
        else:
-                oversize.append(example)
+            if batch:
+                yield batch
+            target_size = next(size_)
+            tol_size = target_size * tolerance
+            batch, batch_size = overflow, overflow_size
+            overflow, overflow_size = [], 0
+
+            # this example still fits
+            if (batch_size + n_words) <= target_size:
+                batch.append(example)
+                batch_size += n_words
+
+            # this example fits in overflow
+            elif (batch_size + n_words) <= (target_size + tol_size):
+                overflow.append(example)
+                overflow_size += n_words
+
+            # this example does not fit with the previous overflow: start another new batch
+            else:
+                if batch:
+                    yield batch
+                target_size = next(size_)
+                tol_size = target_size * tolerance
+                batch = [example]
+                batch_size = n_words
+
+    # yield the final batch; merge the overflow first so overflow-only examples aren't dropped
+    batch.extend(overflow)
    if batch:
        yield batch


@ -705,8 +820,8 @@ def from_disk(path, readers, exclude):
 def import_file(name, loc):
    """Import module from a file. Used to load models from a directory.

-    name (unicode): Name of module to load.
-    loc (unicode / Path): Path to the file.
+    name (str): Name of module to load.
+    loc (str / Path): Path to the file.
    RETURNS: The loaded module.
    """
    loc = str(loc)
@ -721,8 +836,8 @@ def minify_html(html):
    Disclaimer: NOT a general-purpose solution, only removes indentation and
    newlines.

-    html (unicode): Markup to minify.
-    RETURNS (unicode): "Minified" HTML.
+    html (str): Markup to minify.
+    RETURNS (str): "Minified" HTML.
    """
    return html.strip().replace("    ", "").replace("\n", "")

@ -731,8 +846,8 @@ def escape_html(text):
    """Replace <, >, &, " with their HTML encoded representation. Intended to
    prevent HTML errors in rendered displaCy markup.

-    text (unicode): The original text.
-    RETURNS (unicode): Equivalent text to be safely used within HTML.
+    text (str): The original text.
+    RETURNS (str): Equivalent text to be safely used within HTML.
    """
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@ -57,7 +57,7 @@ cdef class Vectors:
        shape (tuple): Size of the table, as (# entries, # columns)
        data (numpy.ndarray): The vector data.
        keys (iterable): A sequence of keys, aligned with the data.
-        name (unicode): A name to identify the vectors table.
+        name (str): A name to identify the vectors table.
        RETURNS (Vectors): The newly created object.

        DOCS: https://spacy.io/api/vectors#init
@ -244,7 +244,7 @@ cdef class Vectors:
    def find(self, *, key=None, keys=None, row=None, rows=None):
        """Look up one or more keys by row, or vice versa.

-        key (unicode / int): Find the row that the given key points to.
+        key (str / int): Find the row that the given key points to.
            Returns int, -1 if missing.
        keys (iterable): Find rows that the keys point to.
            Returns ndarray.
@ -366,7 +366,7 @@ cdef class Vectors:
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

-        path (unicode / Path): A path to a directory, which will be created if
+        path (str / Path): A path to a directory, which will be created if
            it doesn't exists.

        DOCS: https://spacy.io/api/vectors#to_disk
@ -386,7 +386,7 @@ cdef class Vectors:
        """Loads state from a directory. Modifies the object in place and
        returns it.

-        path (unicode / Path): Directory path, string or Path-like object.
+        path (str / Path): Directory path, string or Path-like object.
        RETURNS (Vectors): The modified object.

        DOCS: https://spacy.io/api/vectors#from_disk
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@ -505,8 +505,8 @@ tokenization can be provided.
 > ```

 | Key      | Type | Description                                                |
-| -------- | ------- | ---------------------------------------------------------- |
-| `text`   | unicode | The raw input text. Is not required if `tokens` available. |
+| -------- | ---- | ---------------------------------------------------------- |
+| `text`   | str  | The raw input text. Is not required if `tokens` available. |
 | `tokens` | list | Optional tokenization, one string per token.               |

 ```json
--- a/website/docs/api/cython-classes.md
+++ b/website/docs/api/cython-classes.md
@ -170,7 +170,7 @@ vocabulary.
 | Name        | Type             | Description                                                                                 |
 | ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
 | `mem`       | `cymem.Pool`     | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
-| `string`    | unicode          | The string of the word to look up.                                                          |
+| `string`    | str              | The string of the word to look up.                                                          |
 | **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary.                                                               |

 ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@ -230,8 +230,8 @@ Add a new label to the pipe.
 > ```

 | Name    | Type | Description       |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| ------- | ---- | ----------------- |
+| `label` | str  | The label to add. |

 ## DependencyParser.to_disk {#to_disk tag="method"}

@ -245,8 +245,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## DependencyParser.from_disk {#from_disk tag="method"}
@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it.

 | Name        | Type               | Description                                                                |
 | ----------- | ------------------ | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path`   | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path`      | str / `Path`       | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list               | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object.                                    |

--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@ -123,7 +123,7 @@ details, see the documentation on

 | Name      | Type     | Description                                                                                                                         |
 | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `name`    | unicode  | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`.                       |
+| `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`.                       |
 | `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                          |
 | `method`  | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`.                                                          |
 | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.          |
@ -146,8 +146,8 @@ Look up a previously registered extension by name. Returns a 4-tuple
 > ```

 | Name        | Type  | Description                                                   |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                        |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                        |
 | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |

 ## Doc.has_extension {#has_extension tag="classmethod" new="2"}
@ -163,8 +163,8 @@ Check whether an extension has been registered on the `Doc` class.
 > ```

 | Name        | Type | Description                                |
-| ----------- | ------- | ------------------------------------------ |
-| `name`      | unicode | Name of the extension to check.            |
+| ----------- | ---- | ------------------------------------------ |
+| `name`      | str  | Name of the extension to check.            |
 | **RETURNS** | bool | Whether the extension has been registered. |

 ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@ -181,8 +181,8 @@ Remove a previously registered extension.
 > ```

 | Name        | Type  | Description                                                           |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                                |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                                |
 | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |

 ## Doc.char_span {#char_span tag="method" new="2"}
@ -369,8 +369,8 @@ Save the current state to a directory.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## Doc.from_disk {#from_disk tag="method" new="2"}
@ -386,8 +386,8 @@ Loads state from a directory. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `Doc`        | The modified `Doc` object.                                                 |

@ -648,15 +648,15 @@ The L2 norm of the document's vector representation.

 | Name                                    | Type         | Description                                                                                                                                                                                                                                                                                |
 | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text`                                  | unicode      | A unicode representation of the document text.                                                                                                                                                                                                                                             |
-| `text_with_ws`                          | unicode      | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
+| `text`                                  | str          | A unicode representation of the document text.                                                                                                                                                                                                                                             |
+| `text_with_ws`                          | str          | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                                                                                                                                                                                      |
 | `mem`                                   | `Pool`       | The document's local memory heap, for all C data it owns.                                                                                                                                                                                                                                  |
 | `vocab`                                 | `Vocab`      | The store of lexical types.                                                                                                                                                                                                                                                                |
 | `tensor` <Tag variant="new">2</Tag>     | `ndarray`    | Container for dense vector representations.                                                                                                                                                                                                                                                |
 | `cats` <Tag variant="new">2</Tag>       | dict         | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float.                                                                                     |
 | `user_data`                             | -            | A generic storage area, for user custom data.                                                                                                                                                                                                                                              |
 | `lang` <Tag variant="new">2.1</Tag>     | int          | Language of the document's vocabulary.                                                                                                                                                                                                                                                     |
-| `lang_` <Tag variant="new">2.1</Tag>    | unicode      | Language of the document's vocabulary.                                                                                                                                                                                                                                                     |
+| `lang_` <Tag variant="new">2.1</Tag>    | str          | Language of the document's vocabulary.                                                                                                                                                                                                                                                     |
 | `is_tagged`                             | bool         | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty.                                                                                                                                                                                  |
 | `is_parsed`                             | bool         | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty.                                                                                                                                                                                   |
 | `is_sentenced`                          | bool         | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty.                                                                                                                                                                        |
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@ -259,8 +259,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## EntityLinker.from_disk {#from_disk tag="method"}
@ -275,8 +275,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
 > ```

 | Name        | Type           | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | -------------- | -------------------------------------------------------------------------- |
+| `path`      | str / `Path`   | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list           | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `EntityLinker` | The modified `EntityLinker` object.                                        |

--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@ -231,8 +231,8 @@ Add a new label to the pipe.
 > ```

 | Name    | Type | Description       |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| ------- | ---- | ----------------- |
+| `label` | str  | The label to add. |

 ## EntityRecognizer.to_disk {#to_disk tag="method"}

@ -246,8 +246,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## EntityRecognizer.from_disk {#from_disk tag="method"}
@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.

 | Name        | Type               | Description                                                                |
 | ----------- | ------------------ | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path`   | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path`      | str / `Path`       | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list               | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object.                                    |

--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@ -73,8 +73,8 @@ Whether a label is present in the patterns.
 > ```

 | Name        | Type | Description                                  |
-| ----------- | ------- | -------------------------------------------- |
-| `label`     | unicode | The label to check.                          |
+| ----------- | ---- | -------------------------------------------- |
+| `label`     | str  | The label to check.                          |
 | **RETURNS** | bool | Whether the entity ruler contains the label. |

 ## EntityRuler.\_\_call\_\_ {#call tag="method"}
@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
 happens automatically after the component has been added to the pipeline using
 [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
 with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
-patterns over shorter, and if equal the match occuring first in the Doc is chosen.
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes
+longer patterns over shorter, and if equal the match occurring first in the Doc
+is chosen.

 > #### Example
 >
@ -140,8 +141,8 @@ only the patterns are saved as JSONL. If a directory name is provided, a
 > ```

 | Name   | Type         | Description                                                                                                                         |
-| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

 ## EntityRuler.from_disk {#from_disk tag="method"}

@ -159,8 +160,8 @@ configuration.
 > ```

 | Name        | Type          | Description                                                                              |
-| ----------- | ---------------- | ---------------------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------- | ---------------------------------------------------------------------------------------- |
+| `path`      | str / `Path`  | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `EntityRuler` | The modified `EntityRuler` object.                                                       |

 ## EntityRuler.to_bytes {#to_bytes tag="method"}
--- a/website/docs/api/goldcorpus.md
+++ b/website/docs/api/goldcorpus.md
@ -18,7 +18,7 @@ Create a `GoldCorpus`. IF the input data is an iterable, each item should be a
 for further details.

 | Name        | Type                    | Description                                                  |
-| ----------- | --------------------------- | ------------------------------------------------------------ |
-| `train`     | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable.    |
-| `dev`       | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
+| ----------- | ----------------------- | ------------------------------------------------------------ |
+| `train`     | str / `Path` / iterable | Training data, as a path (file or directory) or iterable.    |
+| `dev`       | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
 | **RETURNS** | `GoldCorpus`            | The newly constructed object.                                |
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@ -60,7 +60,8 @@ Whether the provided syntactic annotations form a projective dependency tree.

 Convert a list of Doc objects into the
 [JSON-serializable format](/api/annotation#json-input) used by the
-[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc.
+[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
+'paragraph' in the output doc.

 > #### Example
 >
@ -158,7 +159,7 @@ single-token entity.
 | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
 | `doc`       | `Doc`    | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document.                          |
 | `entities`  | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
-| **RETURNS** | list     | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags.                                                                            |
+| **RETURNS** | list     | Strings describing the [BILUO](/api/annotation#biluo) tags.                                                                                     |

 ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}

--- a/website/docs/api/kb.md
+++ b/website/docs/api/kb.md
@ -1,16 +1,19 @@
 ---
 title: KnowledgeBase
-teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
+teaser:
+  A storage class for entities and aliases of a specific knowledge base
+  (ontology)
 tag: class
 source: spacy/kb.pyx
 new: 2.2
 ---

-The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
-objects, which are plausible external identifiers given a certain textual mention.
-Each such `Candidate` holds information from the relevant KB entities,
-such as its frequency in text and possible aliases.
-Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
+The `KnowledgeBase` object provides a method to generate
+[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
+identifiers given a certain textual mention. Each such `Candidate` holds
+information from the relevant KB entities, such as its frequency in text and
+possible aliases. Each entity in the knowledge base also has a pretrained entity
+vector of a fixed size.

 ## KnowledgeBase.\_\_init\_\_ {#init tag="method"}

@ -25,24 +28,24 @@ Create the knowledge base.
 > ```

 | Name                   | Type            | Description                              |
-| ----------------------- | ---------------- | ----------------------------------------- |
+| ---------------------- | --------------- | ---------------------------------------- |
 | `vocab`                | `Vocab`         | A `Vocab` object.                        |
 | `entity_vector_length` | int             | Length of the fixed-size entity vectors. |
 | **RETURNS**            | `KnowledgeBase` | The newly constructed object.            |

-
 ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}

 The length of the fixed-size entity vectors in the knowledge base.

 | Name        | Type | Description                              |
-| ----------- | ---- | ----------------------------------------- |
+| ----------- | ---- | ---------------------------------------- |
 | **RETURNS** | int  | Length of the fixed-size entity vectors. |

 ## KnowledgeBase.add_entity {#add_entity tag="method"}

-Add an entity to the knowledge base, specifying its corpus frequency
-and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
+Add an entity to the knowledge base, specifying its corpus frequency and entity
+vector, which should be of length
+[`entity_vector_length`](/api/kb#entity_vector_length).

 > #### Example
 >
@ -52,15 +55,15 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
 > ```

 | Name            | Type   | Description                                     |
-| --------------- | ------------- | ------------------------------------------------- |
-| `entity`        | unicode       | The unique entity identifier                      |
+| --------------- | ------ | ----------------------------------------------- |
+| `entity`        | str    | The unique entity identifier                    |
 | `freq`          | float  | The frequency of the entity in a typical corpus |
 | `entity_vector` | vector | The pretrained vector of the entity             |

 ## KnowledgeBase.set_entities {#set_entities tag="method"}

-Define the full list of entities in the knowledge base, specifying the corpus frequency
-and entity vector for each entity.
+Define the full list of entities in the knowledge base, specifying the corpus
+frequency and entity vector for each entity.

 > #### Example
 >
@ -69,17 +72,18 @@ and entity vector for each entity.
 > ```

 | Name          | Type     | Description                       |
-| ------------- | ------------- | ------------------------------------------------- |
+| ------------- | -------- | --------------------------------- |
 | `entity_list` | iterable | List of unique entity identifiers |
 | `freq_list`   | iterable | List of entity frequencies        |
 | `vector_list` | iterable | List of entity vectors            |

 ## KnowledgeBase.add_alias {#add_alias tag="method"}

-Add an alias or mention to the knowledge base, specifying its potential KB identifiers
-and their prior probabilities. The entity identifiers should refer to entities previously
-added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
-The sum of the prior probabilities should not exceed 1.
+Add an alias or mention to the knowledge base, specifying its potential KB
+identifiers and their prior probabilities. The entity identifiers should refer
+to entities previously added with [`add_entity`](/api/kb#add_entity) or
+[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
+should not exceed 1.

 > #### Example
 >
@ -88,10 +92,10 @@ The sum of the prior probabilities should not exceed 1.
 > ```

 | Name            | Type     | Description                                        |
-| -------------- | ------------- | -------------------------------------------------- |
-| `alias`        | unicode       | The textual mention or alias                       |
+| --------------- | -------- | -------------------------------------------------- |
+| `alias`         | str      | The textual mention or alias                       |
 | `entities`      | iterable | The potential entities that the alias may refer to |
-| `probabilities`| iterable      | The prior probabilities of each entity             |
+| `probabilities` | iterable | The prior probabilities of each entity             |

 ## KnowledgeBase.\_\_len\_\_ {#len tag="method"}

@ -118,7 +122,7 @@ Get a list of all entity IDs in the knowledge base.
 > ```

 | Name        | Type | Description                                 |
-| ----------- | ---- | --------------------------------------------- |
+| ----------- | ---- | ------------------------------------------- |
 | **RETURNS** | list | The list of entities in the knowledge base. |

 ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
@ -132,7 +136,7 @@ Get the total number of aliases in the knowledge base.
 > ```

 | Name        | Type | Description                                  |
-| ----------- | ---- | --------------------------------------------- |
+| ----------- | ---- | -------------------------------------------- |
 | **RETURNS** | int  | The number of aliases in the knowledge base. |

 ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
@ -146,7 +150,7 @@ Get a list of all aliases in the knowledge base.
 > ```

 | Name        | Type | Description                                |
-| ----------- | ---- | --------------------------------------------- |
+| ----------- | ---- | ------------------------------------------ |
 | **RETURNS** | list | The list of aliases in the knowledge base. |

 ## KnowledgeBase.get_candidates {#get_candidates tag="method"}
@ -161,8 +165,8 @@ of type [`Candidate`](/api/kb/#candidate_init).
 > ```

 | Name        | Type     | Description                              |
-| ------------- | ------------- | -------------------------------------------------- |
-| `alias`       | unicode       | The textual mention or alias                       |
+| ----------- | -------- | ---------------------------------------- |
+| `alias`     | str      | The textual mention or alias             |
 | **RETURNS** | iterable | The list of relevant `Candidate` objects |

 ## KnowledgeBase.get_vector {#get_vector tag="method"}
@ -176,14 +180,14 @@ Given a certain entity ID, retrieve its pretrained entity vector.
 > ```

 | Name        | Type   | Description       |
-| ------------- | ------------- | -------------------------------------------------- |
-| `entity`      | unicode       | The entity ID                                      |
+| ----------- | ------ | ----------------- |
+| `entity`    | str    | The entity ID     |
 | **RETURNS** | vector | The entity vector |

 ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}

-Given a certain entity ID and a certain textual mention, retrieve
-the prior probability of the fact that the mention links to the entity ID.
+Given a certain entity ID and a certain textual mention, retrieve the prior
+probability that the mention links to the entity ID.

 > #### Example
 >
@ -192,9 +196,9 @@ the prior probability of the fact that the mention links to the entity ID.
 > ```

 | Name        | Type  | Description                                                    |
-| ------------- | ------------- | --------------------------------------------------------------- |
-| `entity`      | unicode       | The entity ID                                                   |
-| `alias`       | unicode       | The textual mention or alias                                    |
+| ----------- | ----- | -------------------------------------------------------------- |
+| `entity`    | str   | The entity ID                                                  |
+| `alias`     | str   | The textual mention or alias                                   |
 | **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |

 ## KnowledgeBase.dump {#dump tag="method"}
@ -208,13 +212,13 @@ Save the current state of the knowledge base to a directory.
 > ```

 | Name  | Type         | Description                                                                                                           |
-| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `loc`         | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.    |
+| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

 ## KnowledgeBase.load_bulk {#load_bulk tag="method"}

-Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
-should also be the same as the one used to create the KB.
+Restore the state of the knowledge base from a given directory. Note that the
+[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.

 > #### Example
 >
@ -226,18 +230,16 @@ should also be the same as the one used to create the KB.
 > kb.load_bulk("/path/to/kb")
 > ```

-
 | Name        | Type            | Description                                                                |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `loc`       | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.                |
+| ----------- | --------------- | -------------------------------------------------------------------------- |
+| `loc`       | str / `Path`    | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object.                                       |

-
 ## Candidate.\_\_init\_\_ {#candidate_init tag="method"}

 Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
-of a `KnowledgeBase`.
+but instead these objects are returned by the
+[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`.

 > #### Example
 >
@ -258,11 +260,11 @@ of a `KnowledgeBase`.
 ## Candidate attributes {#candidate_attributes}

 | Name            | Type   | Description                                                    |
-| ---------------------- | ------------ | ------------------------------------------------------------------ |
+| --------------- | ------ | -------------------------------------------------------------- |
 | `entity`        | int    | The entity's unique KB identifier                              |
-| `entity_`              | unicode      | The entity's unique KB identifier                                  |
+| `entity_`       | str    | The entity's unique KB identifier                              |
 | `alias`         | int    | The alias or textual mention                                   |
-| `alias_`               | unicode      | The alias or textual mention                                       |
+| `alias_`        | str    | The alias or textual mention                                   |
 | `prior_prob`    | long   | The prior probability of the `alias` referring to the `entity` |
 | `entity_freq`   | long   | The frequency of the entity in a typical corpus                |
 | `entity_vector` | vector | The pretrained vector of the entity                            |
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@ -50,8 +50,8 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 > ```

 | Name        | Type  | Description                                                                       |
-| ----------- | ------- | --------------------------------------------------------------------------------- |
-| `text`      | unicode | The text to be processed.                                                         |
+| ----------- | ----- | --------------------------------------------------------------------------------- |
+| `text`      | str   | The text to be processed.                                                         |
 | `disable`   | list  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
 | **RETURNS** | `Doc` | A container for accessing the annotations.                                        |

@ -201,7 +201,7 @@ Create a pipeline component from a factory.

 | Name        | Type     | Description                                                                        |
 | ----------- | -------- | ---------------------------------------------------------------------------------- |
-| `name`      | unicode  | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
+| `name`      | str      | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
 | `config`    | dict     | Configuration parameters to initialize component.                                  |
 | **RETURNS** | callable | The pipeline component.                                                            |

@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`,
 | Name        | Type     | Description                                                                                                                                                                                                                                            |
 | ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `component` | callable | The pipeline component.                                                                                                                                                                                                                                |
-| `name`      | unicode  | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
-| `before`    | unicode  | Component name to insert component directly before.                                                                                                                                                                                                    |
-| `after`     | unicode  | Component name to insert component directly after:                                                                                                                                                                                                     |
+| `name`      | str      | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
+| `before`    | str      | Component name to insert component directly before.                                                                                                                                                                                                    |
+| `after`     | str      | Component name to insert component directly after.                                                                                                                                                                                                     |
 | `first`     | bool     | Insert component first / not first in the pipeline.                                                                                                                                                                                                    |
 | `last`      | bool     | Insert component last / not last in the pipeline.                                                                                                                                                                                                      |

@ -244,8 +244,8 @@ Check whether a component is present in the pipeline. Equivalent to
 > ```

 | Name        | Type | Description                                              |
-| ----------- | ------- | -------------------------------------------------------- |
-| `name`      | unicode | Name of the pipeline component to check.                 |
+| ----------- | ---- | -------------------------------------------------------- |
+| `name`      | str  | Name of the pipeline component to check.                 |
 | **RETURNS** | bool | Whether a component of that name exists in the pipeline. |

 ## Language.get_pipe {#get_pipe tag="method" new="2"}
@ -261,7 +261,7 @@ Get a pipeline component for a given component name.

 | Name        | Type     | Description                            |
 | ----------- | -------- | -------------------------------------- |
-| `name`      | unicode  | Name of the pipeline component to get. |
+| `name`      | str      | Name of the pipeline component to get. |
 | **RETURNS** | callable | The pipeline component.                |

 ## Language.replace_pipe {#replace_pipe tag="method" new="2"}
@ -276,7 +276,7 @@ Replace a component in the pipeline.

 | Name        | Type     | Description                       |
 | ----------- | -------- | --------------------------------- |
-| `name`      | unicode  | Name of the component to replace. |
+| `name`      | str      | Name of the component to replace. |
 | `component` | callable | The pipeline component to insert. |

 ## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@ -293,9 +293,9 @@ added to the pipeline, you can also use the `name` argument on
 > ```

 | Name       | Type | Description                      |
-| ---------- | ------- | -------------------------------- |
-| `old_name` | unicode | Name of the component to rename. |
-| `new_name` | unicode | New name of the component.       |
+| ---------- | ---- | -------------------------------- |
+| `old_name` | str  | Name of the component to rename. |
+| `new_name` | str  | New name of the component.       |

 ## Language.remove_pipe {#remove_pipe tag="method" new="2"}

@ -310,8 +310,8 @@ component function.
 > ```

 | Name        | Type  | Description                                           |
-| ----------- | ------- | ----------------------------------------------------- |
-| `name`      | unicode | Name of the component to remove.                      |
+| ----------- | ----- | ----------------------------------------------------- |
+| `name`      | str   | Name of the component to remove.                      |
 | **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |

 ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled.
 | Name        | Type            | Description                                                                          |
 | ----------- | --------------- | ------------------------------------------------------------------------------------ |
 | `disable`   | list            | Names of pipeline components to disable.                                             |
-| `disable`   | unicode         | Name of pipeline component to disable.                                               |
+| `disable`   | str             | Name of pipeline component to disable.                                               |
 | `enable`    | list            | Names of pipeline components that will not be disabled.                              |
-| `enable`    | unicode         | Name of pipeline component that will not be disabled.                                |
+| `enable`    | str             | Name of pipeline component that will not be disabled.                                |
 | **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |

-
 <Infobox title="Changed in v3.0" variant="warning">

 As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
@ -371,8 +370,8 @@ the model**.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | Names of pipeline components or [serialization fields](#serialization-fields) to exclude.                             |

 ## Language.from_disk {#from_disk tag="method" new="2"}
@ -396,8 +395,8 @@ loaded object.
 > ```

 | Name        | Type         | Description                                                                               |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.                |
+| ----------- | ------------ | ----------------------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.                |
 | `exclude`   | list         | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
 | **RETURNS** | `Language`   | The modified `Language` object.                                                           |

@ -481,9 +480,9 @@ per component.
 ## Class attributes {#class-attributes}

 | Name                                   | Type  | Description                                                                                                                         |
-| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- |
 | `Defaults`                             | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline.                                           |
-| `lang`                                 | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).                                     |
+| `lang`                                 | str   | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).                                     |
 | `factories` <Tag variant="new">2</Tag> | dict  | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |

 ## Serialization fields {#serialization-fields}
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@ -63,8 +63,8 @@ Lemmatize a string.

 | Name         | Type          | Description                                                                                              |
 | ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
-| `string`     | unicode       | The string to lemmatize, e.g. the token text.                                                            |
-| `univ_pos`   | unicode / int | The token's universal part-of-speech tag.                                                                |
+| `string`     | str           | The string to lemmatize, e.g. the token text.                                                            |
+| `univ_pos`   | str / int     | The token's universal part-of-speech tag.                                                                |
 | `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
 | **RETURNS**  | list          | The available lemmas for the string.                                                                     |

@ -83,10 +83,10 @@ original string is returned. Languages can provide a
 > ```

 | Name        | Type | Description                                                                                                 |
-| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
-| `string`    | unicode | The string to look up.                                                                                      |
+| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
+| `string`    | str  | The string to look up.                                                                                      |
 | `orth`      | int  | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
-| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string.                                           |
+| **RETURNS** | str  | The lemma if the string was found, otherwise the original string.                                           |

 ## Lemmatizer.is_base_form {#is_base_form tag="method"}

@ -103,8 +103,8 @@ lemmatization entirely.
 > ```

 | Name         | Type      | Description                                                                             |
-| ------------ | ------------- | --------------------------------------------------------------------------------------- |
-| `univ_pos`   | unicode / int | The token's universal part-of-speech tag.                                               |
+| ------------ | --------- | --------------------------------------------------------------------------------------- |
+| `univ_pos`   | str / int | The token's universal part-of-speech tag.                                               |
 | `morphology` | dict      | The token's morphological features.                                                     |
 | **RETURNS**  | bool      | Whether the token's part-of-speech tag and morphological features describe a base form. |

--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
 | Name                                         | Type    | Description                                                                                                                                                                                                                                                  |
 | -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | `vocab`                                      | `Vocab` | The lexeme's vocabulary.                                                                                                                                                                                                                                     |
-| `text`                                       | unicode | Verbatim text content.                                                                                                                                                                                                                                       |
+| `text`                                       | str     | Verbatim text content.                                                                                                                                                                                                                                       |
 | `orth`                                       | int     | ID of the verbatim text content.                                                                                                                                                                                                                             |
-| `orth_`                                      | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.                                                                                                                                                 |
+| `orth_`                                      | str     | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.                                                                                                                                                 |
 | `rank`                                       | int     | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors.                                                                                                                                                               |
 | `flags`                                      | int     | Container of the lexeme's binary flags.                                                                                                                                                                                                                      |
 | `norm`                                       | int     | The lexemes's norm, i.e. a normalized form of the lexeme text.                                                                                                                                                                                               |
-| `norm_`                                      | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text.                                                                                                                                                                                               |
+| `norm_`                                      | str     | The lexemes's norm, i.e. a normalized form of the lexeme text.                                                                                                                                                                                               |
 | `lower`                                      | int     | Lowercase form of the word.                                                                                                                                                                                                                                  |
-| `lower_`                                     | unicode | Lowercase form of the word.                                                                                                                                                                                                                                  |
+| `lower_`                                     | str     | Lowercase form of the word.                                                                                                                                                                                                                                  |
 | `shape`                                      | int     | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
-| `shape_`                                     | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`.  |
+| `shape_`                                     | str     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`.  |
 | `prefix`                                     | int     | Length-N substring from the start of the word. Defaults to `N=1`.                                                                                                                                                                                            |
-| `prefix_`                                    | unicode | Length-N substring from the start of the word. Defaults to `N=1`.                                                                                                                                                                                            |
+| `prefix_`                                    | str     | Length-N substring from the start of the word. Defaults to `N=1`.                                                                                                                                                                                            |
 | `suffix`                                     | int     | Length-N substring from the end of the word. Defaults to `N=3`.                                                                                                                                                                                              |
-| `suffix_`                                    | unicode | Length-N substring from the start of the word. Defaults to `N=3`.                                                                                                                                                                                            |
+| `suffix_`                                    | str     | Length-N substring from the start of the word. Defaults to `N=3`.                                                                                                                                                                                            |
 | `is_alpha`                                   | bool    | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`.                                                                                                                                                                     |
 | `is_ascii`                                   | bool    | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.                                                                                                                                                      |
 | `is_digit`                                   | bool    | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`.                                                                                                                                                                                    |
@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
 | `is_oov`                                     | bool    | Is the lexeme out-of-vocabulary?                                                                                                                                                                                                                             |
 | `is_stop`                                    | bool    | Is the lexeme part of a "stop list"?                                                                                                                                                                                                                         |
 | `lang`                                       | int     | Language of the parent vocabulary.                                                                                                                                                                                                                           |
-| `lang_`                                      | unicode | Language of the parent vocabulary.                                                                                                                                                                                                                           |
+| `lang_`                                      | str     | Language of the parent vocabulary.                                                                                                                                                                                                                           |
 | `prob`                                       | float   | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary).                                                                                                                                                   |
 | `cluster`                                    | int     | Brown cluster ID.                                                                                                                                                                                                                                            |
 | `sentiment`                                  | float   | A scalar value indicating the positivity or negativity of the lexeme.                                                                                                                                                                                        |
--- a/website/docs/api/lookups.md
+++ b/website/docs/api/lookups.md
@ -57,8 +57,8 @@ Check if the lookups contain a table of a given name. Delegates to
 > ```

 | Name        | Type | Description                                     |
-| ----------- | ------- | ----------------------------------------------- |
-| `name`      | unicode | Name of the table.                              |
+| ----------- | ---- | ----------------------------------------------- |
+| `name`      | str  | Name of the table.                              |
 | **RETURNS** | bool | Whether a table of that name is in the lookups. |

 ## Lookups.tables {#tables tag="property"}
@ -91,7 +91,7 @@ exists.

 | Name        | Type                          | Description                        |
 | ----------- | ----------------------------- | ---------------------------------- |
-| `name`      | unicode                       | Unique name of the table.          |
+| `name`      | str                           | Unique name of the table.          |
 | `data`      | dict                          | Optional data to add to the table. |
 | **RETURNS** | [`Table`](/api/lookups#table) | The newly added table.             |

@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist.

 | Name        | Type                          | Description        |
 | ----------- | ----------------------------- | ------------------ |
-| `name`      | unicode                       | Name of the table. |
+| `name`      | str                           | Name of the table. |
 | **RETURNS** | [`Table`](/api/lookups#table) | The table.         |

 ## Lookups.remove_table {#remove_table tag="method"}
@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.

 | Name        | Type                          | Description                  |
 | ----------- | ----------------------------- | ---------------------------- |
-| `name`      | unicode                       | Name of the table to remove. |
+| `name`      | str                           | Name of the table to remove. |
 | **RETURNS** | [`Table`](/api/lookups#table) | The removed table.           |

 ## Lookups.has_table {#has_table tag="method"}
@ -145,8 +145,8 @@ Check if the lookups contain a table of a given name. Equivalent to
 > ```

 | Name        | Type | Description                                     |
-| ----------- | ------- | ----------------------------------------------- |
-| `name`      | unicode | Name of the table.                              |
+| ----------- | ---- | ----------------------------------------------- |
+| `name`      | str  | Name of the table.                              |
 | **RETURNS** | bool | Whether a table of that name is in the lookups. |

 ## Lookups.to_bytes {#to_bytes tag="method"}
@ -192,8 +192,8 @@ which will be created if it doesn't exist.
 > ```

 | Name   | Type         | Description                                                                                                           |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

 ## Lookups.from_disk {#from_disk tag="method"}

@ -209,8 +209,8 @@ the file doesn't exist.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `Lookups`    | The loaded lookups.                                                        |

 ## Table {#table tag="class, ordererddict"}
@ -238,7 +238,7 @@ Initialize a new table.

 | Name        | Type    | Description                        |
 | ----------- | ------- | ---------------------------------- |
-| `name`      | unicode | Optional table name for reference. |
+| `name`      | str     | Optional table name for reference. |
 | **RETURNS** | `Table` | The newly constructed object.      |

 ### Table.from_dict {#table.from_dict tag="classmethod"}
@ -256,7 +256,7 @@ Initialize a new table from a dict.
 | Name        | Type    | Description                        |
 | ----------- | ------- | ---------------------------------- |
 | `data`      | dict    | The dictionary.                    |
-| `name`      | unicode | Optional table name for reference. |
+| `name`      | str     | Optional table name for reference. |
 | **RETURNS** | `Table` | The newly constructed object.      |

 ### Table.set {#table.set tag="method"}
@ -274,8 +274,8 @@ Set a new key / value pair. String keys will be hashed. Same as
 > ```

 | Name    | Type      | Description |
-| ------- | ------------- | ----------- |
-| `key`   | unicode / int | The key.    |
+| ------- | --------- | ----------- |
+| `key`   | str / int | The key.    |
 | `value` | -         | The value.  |

 ### Table.to_bytes {#table.to_bytes tag="method"}
@ -313,6 +313,6 @@ Load a table from a bytestring.

 | Name           | Type                        | Description                                           |
 | -------------- | --------------------------- | ----------------------------------------------------- |
-| `name`         | unicode                     | Table name.                                           |
+| `name`         | str                         | Table name.                                           |
 | `default_size` | int                         | Default size of bloom filters if no data is provided. |
 | `bloom`        | `preshed.bloom.BloomFilter` | The bloom filters.                                    |
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@ -126,8 +126,8 @@ Check whether the matcher contains rules for a match ID.
 > ```

 | Name        | Type | Description                                           |
-| ----------- | ------- | ----------------------------------------------------- |
-| `key`       | unicode | The match ID.                                         |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key`       | str  | The match ID.                                         |
 | **RETURNS** | bool | Whether the matcher contains rules for this match ID. |

 ## Matcher.add {#add tag="method" new="2"}
@ -153,7 +153,7 @@ overwritten.

 | Name        | Type               | Description                                                                                   |
 | ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id`  | unicode            | An ID for the thing you're matching.                                                          |
+| `match_id`  | str                | An ID for the thing you're matching.                                                          |
 | `on_match`  | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | `*patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      |

@ -189,8 +189,8 @@ exist.
 > ```

 | Name  | Type | Description               |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
+| ----- | ---- | ------------------------- |
+| `key` | str  | The ID of the match rule. |

 ## Matcher.get {#get tag="method" new="2"}

@ -205,6 +205,6 @@ Retrieve the pattern stored for a key. Returns the rule as an
 > ```

 | Name        | Type  | Description                                   |
-| ----------- | ------- | --------------------------------------------- |
-| `key`       | unicode | The ID of the match rule.                     |
+| ----------- | ----- | --------------------------------------------- |
+| `key`       | str   | The ID of the match rule.                     |
 | **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@ -134,8 +134,8 @@ Check whether the matcher contains rules for a match ID.
 > ```

 | Name        | Type | Description                                           |
-| ----------- | ------- | ----------------------------------------------------- |
-| `key`       | unicode | The match ID.                                         |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key`       | str  | The match ID.                                         |
 | **RETURNS** | bool | Whether the matcher contains rules for this match ID. |

 ## PhraseMatcher.add {#add tag="method"}
@ -162,7 +162,7 @@ overwritten.

 | Name       | Type               | Description                                                                                   |
 | ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | unicode            | An ID for the thing you're matching.                                                          |
+| `match_id` | str                | An ID for the thing you're matching.                                                          |
 | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | `*docs`    | `Doc`              | `Doc` objects of the phrases to match.                                                        |

@ -199,5 +199,5 @@ does not exist.
 > ```

 | Name  | Type | Description               |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
+| ----- | ---- | ------------------------- |
+| `key` | str  | The ID of the match rule. |
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@ -113,7 +113,7 @@ end of the pipeline and after all other components.
 </Infobox>

 | Name        | Type  | Description                                                  |
-| ----------- | ------- | ------------------------------------------------------------ |
+| ----------- | ----- | ------------------------------------------------------------ |
 | `doc`       | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
-| `label`     | unicode | The subtoken dependency label. Defaults to `"subtok"`.       |
+| `label`     | str   | The subtoken dependency label. Defaults to `"subtok"`.       |
 | **RETURNS** | `Doc` | The modified `Doc` with merged subtokens.                    |
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@ -82,8 +82,8 @@ a file `sentencizer.json`. This also happens automatically when you save an
 > ```

 | Name   | Type         | Description                                                                                                      |
-| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

 ## Sentencizer.from_disk {#from_disk tag="method"}

@ -99,8 +99,8 @@ added to its pipeline.
 > ```

 | Name        | Type          | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path`      | str / `Path`  | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `Sentencizer` | The modified `Sentencizer` object.                                         |

 ## Sentencizer.to_bytes {#to_bytes tag="method"}
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@ -110,7 +110,7 @@ For details, see the documentation on

 | Name      | Type     | Description                                                                                                                           |
 | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `name`    | unicode  | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`.                        |
+| `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`.                        |
 | `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                            |
 | `method`  | callable | Set a custom method on the object, for example `span._.compare(other_span)`.                                                          |
 | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.            |
@ -133,8 +133,8 @@ Look up a previously registered extension by name. Returns a 4-tuple
 > ```

 | Name        | Type  | Description                                                   |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                        |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                        |
 | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |

 ## Span.has_extension {#has_extension tag="classmethod" new="2"}
@ -150,8 +150,8 @@ Check whether an extension has been registered on the `Span` class.
 > ```

 | Name        | Type | Description                                |
-| ----------- | ------- | ------------------------------------------ |
-| `name`      | unicode | Name of the extension to check.            |
+| ----------- | ---- | ------------------------------------------ |
+| `name`      | str  | Name of the extension to check.            |
 | **RETURNS** | bool | Whether the extension has been registered. |

 ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@ -168,8 +168,8 @@ Remove a previously registered extension.
 > ```

 | Name        | Type  | Description                                                           |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                                |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                                |
 | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |

 ## Span.char_span {#char_span tag="method" new="2.2.4"}
@ -497,16 +497,16 @@ The L2 norm of the span's vector representation.
 | `end`                                   | int          | The token offset for the end of the span.                                                                      |
 | `start_char`                            | int          | The character offset for the start of the span.                                                                |
 | `end_char`                              | int          | The character offset for the end of the span.                                                                  |
-| `text`                                  | unicode      | A unicode representation of the span text.                                                                     |
-| `text_with_ws`                          | unicode      | The text content of the span with a trailing whitespace character if the last token has one.                   |
+| `text`                                  | str          | A string representation of the span text.                                                                      |
+| `text_with_ws`                          | str          | The text content of the span with a trailing whitespace character if the last token has one.                   |
 | `orth`                                  | int          | ID of the verbatim text content.                                                                               |
-| `orth_`                                 | unicode      | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes.     |
+| `orth_`                                 | str          | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes.     |
 | `label`                                 | int          | The hash value of the span's label.                                                                            |
-| `label_`                                | unicode      | The span's label.                                                                                              |
-| `lemma_`                                | unicode      | The span's lemma.                                                                                              |
+| `label_`                                | str          | The span's label.                                                                                              |
+| `lemma_`                                | str          | The span's lemma.                                                                                              |
 | `kb_id`                                 | int          | The hash value of the knowledge base ID referred to by the span.                                               |
-| `kb_id_`                                | unicode      | The knowledge base ID referred to by the span.                                                                 |
+| `kb_id_`                                | str          | The knowledge base ID referred to by the span.                                                                 |
 | `ent_id`                                | int          | The hash value of the named entity the token is an instance of.                                                |
-| `ent_id_`                               | unicode      | The string ID of the named entity the token is an instance of.                                                 |
+| `ent_id_`                               | str          | The string ID of the named entity the token is an instance of.                                                 |
 | `sentiment`                             | float        | A scalar value indicating the positivity or negativity of the span.                                            |
 | `_`                                     | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
--- a/website/docs/api/stringstore.md
+++ b/website/docs/api/stringstore.md
@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa.
 | Name           | Type                     | Description                |
 | -------------- | ------------------------ | -------------------------- |
 | `string_or_id` | bytes, unicode or uint64 | The value to encode.       |
-| **RETURNS**    | unicode or int           | The value to be retrieved. |
+| **RETURNS**    | str or int               | The value to be retrieved. |

 ## StringStore.\_\_contains\_\_ {#contains tag="method"}

@ -70,8 +70,8 @@ Check whether a string is in the store.
 > ```

 | Name        | Type | Description                            |
-| ----------- | ------- | -------------------------------------- |
-| `string`    | unicode | The string to check.                   |
+| ----------- | ---- | -------------------------------------- |
+| `string`    | str  | The string to check.                   |
 | **RETURNS** | bool | Whether the store contains the string. |

 ## StringStore.\_\_iter\_\_ {#iter tag="method"}
@ -88,8 +88,8 @@ store will always include an empty string `''` at position `0`.
 > ```

 | Name       | Type | Description            |
-| ---------- | ------- | ---------------------- |
-| **YIELDS** | unicode | A string in the store. |
+| ---------- | ---- | ---------------------- |
+| **YIELDS** | str  | A string in the store. |

 ## StringStore.add {#add tag="method" new="2"}

@ -107,8 +107,8 @@ Add a string to the `StringStore`.
 > ```

 | Name        | Type   | Description              |
-| ----------- | ------- | ------------------------ |
-| `string`    | unicode | The string to add.       |
+| ----------- | ------ | ------------------------ |
+| `string`    | str    | The string to add.       |
 | **RETURNS** | uint64 | The string's hash value. |

 ## StringStore.to_disk {#to_disk tag="method" new="2"}
@ -122,8 +122,8 @@ Save the current state to a directory.
 > ```

 | Name   | Type         | Description                                                                                                           |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

 ## StringStore.from_disk {#from_disk tag="method" new="2"}

@ -137,8 +137,8 @@ Loads state from a directory. Modifies the object in place and returns it.
 > ```

 | Name        | Type          | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path`      | str / `Path`  | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `StringStore` | The modified `StringStore` object.                                         |

 ## StringStore.to_bytes {#to_bytes tag="method"}
@ -186,6 +186,6 @@ Get a 64-bit hash for a given string.
 > ```

 | Name        | Type   | Description         |
-| ----------- | ------- | ------------------- |
-| `string`    | unicode | The string to hash. |
+| ----------- | ------ | ------------------- |
+| `string`    | str    | The string to hash. |
 | **RETURNS** | uint64 | The hash.           |
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@ -230,8 +230,8 @@ Add a new label to the pipe.
 > ```

 | Name     | Type | Description                                                     |
-| -------- | ------- | --------------------------------------------------------------- |
-| `label`  | unicode | The label to add.                                               |
+| -------- | ---- | --------------------------------------------------------------- |
+| `label`  | str  | The label to add.                                               |
 | `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |

 ## Tagger.to_disk {#to_disk tag="method"}
@ -246,8 +246,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## Tagger.from_disk {#from_disk tag="method"}
@ -262,8 +262,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `Tagger`     | The modified `Tagger` object.                                              |

--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and
 | `vocab`             | `Vocab`                       | The shared vocabulary.                                                                                                                                |
 | `model`             | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
 | `exclusive_classes` | bool                          | Make categories mutually exclusive. Defaults to `False`.                                                                                              |
-| `architecture`      | unicode                       | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`.                                                 |
+| `architecture`      | str                           | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`.                                                 |
 | **RETURNS**         | `TextCategorizer`             | The newly constructed object.                                                                                                                         |

 ### Architectures {#architectures new="2.1"}
@ -248,8 +248,8 @@ Add a new label to the pipe.
 > ```

 | Name    | Type | Description       |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| ------- | ---- | ----------------- |
+| `label` | str  | The label to add. |

 ## TextCategorizer.to_disk {#to_disk tag="method"}

@ -263,8 +263,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## TextCategorizer.from_disk {#from_disk tag="method"}
@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it.

 | Name        | Type              | Description                                                                |
 | ----------- | ----------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path`  | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path`      | str / `Path`      | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list              | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object.                                     |

--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@ -35,7 +35,7 @@ the
 > ```

 | Name             | Type        | Description                                                                           |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ----------- | ------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                                |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                       |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.   |
@ -56,8 +56,8 @@ Tokenize a string.
 > ```

 | Name        | Type  | Description                             |
-| ----------- | ------- | --------------------------------------- |
-| `string`    | unicode | The string to tokenize.                 |
+| ----------- | ----- | --------------------------------------- |
+| `string`    | str   | The string to tokenize.                 |
 | **RETURNS** | `Doc` | A container for linguistic annotations. |

 ## Tokenizer.pipe {#pipe tag="method"}
@ -83,8 +83,8 @@ Tokenize a stream of texts.
 Find internal split points of the string.

 | Name        | Type | Description                                                                                                                                        |
-| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `string`    | unicode | The string to split.                                                                                                                               |
+| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `string`    | str  | The string to split.                                                                                                                               |
 | **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |

 ## Tokenizer.find_prefix {#find_prefix tag="method"}
@ -93,8 +93,8 @@ Find the length of a prefix that should be segmented from the string, or `None`
 if no prefix rules match.

 | Name        | Type | Description                                            |
-| ----------- | ------- | ------------------------------------------------------ |
-| `string`    | unicode | The string to segment.                                 |
+| ----------- | ---- | ------------------------------------------------------ |
+| `string`    | str  | The string to segment.                                 |
 | **RETURNS** | int  | The length of the prefix if present, otherwise `None`. |

 ## Tokenizer.find_suffix {#find_suffix tag="method"}
@ -104,7 +104,7 @@ if no suffix rules match.

 | Name        | Type         | Description                                            |
 | ----------- | ------------ | ------------------------------------------------------ |
-| `string`    | unicode      | The string to segment.                                 |
+| `string`    | str          | The string to segment.                                 |
 | **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |

 ## Tokenizer.add_special_case {#add_special_case tag="method"}
@ -125,7 +125,7 @@ and examples.

 | Name          | Type     | Description                                                                                                                                                              |
 | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `string`      | unicode  | The string to specially tokenize.                                                                                                                                        |
+| `string`      | str      | The string to specially tokenize.                                                                                                                                        |
 | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |

 ## Tokenizer.explain {#explain tag="method"}
@ -143,8 +143,8 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
 > ```

 | Name        | Type | Description                                         |
-| ------------| -------- | --------------------------------------------------- |
-| `string`    | unicode  | The string to tokenize with the debugging tokenizer |
+| ----------- | ---- | --------------------------------------------------- |
+| `string`    | str  | The string to tokenize with the debugging tokenizer |
 | **RETURNS** | list | A list of `(pattern_string, token_string)` tuples   |

 ## Tokenizer.to_disk {#to_disk tag="method"}
@ -159,8 +159,8 @@ Serialize the tokenizer to disk.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## Tokenizer.from_disk {#from_disk tag="method"}
@ -175,8 +175,8 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `Tokenizer`  | The modified `Tokenizer` object.                                           |

@ -218,12 +218,12 @@ it.
 ## Attributes {#attributes}

 | Name             | Type    | Description                                                                                                                |
-| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                      |
 | `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.            |
 | `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.              |
 | `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
-| `token_match`    | -       | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
 | `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                    |

 ## Serialization fields {#serialization-fields}
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@ -33,8 +33,8 @@ class. The data will be loaded in via
 > ```

 | Name        | Type         | Description                                                                       |
-| ----------- | ---------------- | --------------------------------------------------------------------------------- |
-| `name`      | unicode / `Path` | Model to load, i.e. shortcut link, package name or path.                          |
+| ----------- | ------------ | --------------------------------------------------------------------------------- |
+| `name`      | str / `Path` | Model to load, i.e. shortcut link, package name or path.                          |
 | `disable`   | list         | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
 | **RETURNS** | `Language`   | A `Language` object with the loaded model.                                        |

@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of

 | Name        | Type       | Description                                                                                      |
 | ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
-| `name`      | unicode    | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
+| `name`      | str        | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
 | `disable`   | list       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling).                |
 | **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass.                                          |

@ -99,8 +99,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
 > ```

 | Name       | Type | Description                                                   |
-| ---------- | ------- | ------------------------------------------------------------- |
-| `model`    | unicode | A model, i.e. shortcut link, package name or path (optional). |
+| ---------- | ---- | ------------------------------------------------------------- |
+| `model`    | str  | A model, i.e. shortcut link, package name or path (optional). |
 | `markdown` | bool | Print information as Markdown.                                |

 ### spacy.explain {#spacy.explain tag="function"}
@ -123,9 +123,9 @@ list of available terms, see
 > ```

 | Name        | Type | Description                                              |
-| ----------- | ------- | -------------------------------------------------------- |
-| `term`      | unicode | Term to explain.                                         |
-| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. |
+| ----------- | ---- | -------------------------------------------------------- |
+| `term`      | str  | Term to explain.                                         |
+| **RETURNS** | str  | The explanation, or `None` if not found in the glossary. |

 ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}

@ -189,13 +189,13 @@ browser. Will run a simple web server.
 | Name      | Type                | Description                                                                                                                          | Default     |
 | --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
 | `docs`    | list, `Doc`, `Span` | Document(s) to visualize.                                                                                                            |
-| `style`   | unicode             | Visualization style, `'dep'` or `'ent'`.                                                                                             | `'dep'`     |
+| `style`   | str                 | Visualization style, `'dep'` or `'ent'`.                                                                                             | `'dep'`     |
 | `page`    | bool                | Render markup as full HTML page.                                                                                                     | `True`      |
 | `minify`  | bool                | Minify HTML markup.                                                                                                                  | `False`     |
 | `options` | dict                | [Visualizer-specific options](#displacy_options), e.g. colors.                                                                       | `{}`        |
 | `manual`  | bool                | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False`     |
 | `port`    | int                 | Port to serve visualization.                                                                                                         | `5000`      |
-| `host`    | unicode             | Host to serve visualization.                                                                                                         | `'0.0.0.0'` |
+| `host`    | str                 | Host to serve visualization.                                                                                                         | `'0.0.0.0'` |

 ### displacy.render {#displacy.render tag="method" new="2"}

@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization.
 | Name        | Type                | Description                                                                                                                                               | Default |
 | ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
 | `docs`      | list, `Doc`, `Span` | Document(s) to visualize.                                                                                                                                 |
-| `style`     | unicode             | Visualization style, `'dep'` or `'ent'`.                                                                                                                  | `'dep'` |
+| `style`     | str                 | Visualization style, `'dep'` or `'ent'`.                                                                                                                  | `'dep'` |
 | `page`      | bool                | Render markup as full HTML page.                                                                                                                          | `False` |
 | `minify`    | bool                | Minify HTML markup.                                                                                                                                       | `False` |
 | `jupyter`   | bool                | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None`  |
 | `options`   | dict                | [Visualizer-specific options](#displacy_options), e.g. colors.                                                                                            | `{}`    |
 | `manual`    | bool                | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples.                      | `False` |
-| **RETURNS** | unicode             | Rendered HTML markup.                                                                                                                                     |
+| **RETURNS** | str                 | Rendered HTML markup.                                                                                                                                     |

 ### Visualizer options {#displacy_options}

@ -237,15 +237,15 @@ If a setting is not present in the options, the default value will be used.
 > ```

 | Name                                       | Type | Description                                                                                                     | Default                 |
-| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
+| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
 | `fine_grained`                             | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`).              | `False`                 |
 | `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts.                                                       | `False`                 |
 | `collapse_punct`                           | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True`                  |
 | `collapse_phrases`                         | bool | Merge noun phrases into one token.                                                                              | `False`                 |
 | `compact`                                  | bool | "Compact mode" with square arrows that takes up less space.                                                     | `False`                 |
-| `color`                                    | unicode | Text color (HEX, RGB or color names).                                                                           | `'#000000'`             |
-| `bg`                                       | unicode | Background color (HEX, RGB or color names).                                                                     | `'#ffffff'`             |
-| `font`                                     | unicode | Font name or font family for all text.                                                                          | `'Arial'`               |
+| `color`                                    | str  | Text color (HEX, RGB or color names).                                                                           | `'#000000'`             |
+| `bg`                                       | str  | Background color (HEX, RGB or color names).                                                                     | `'#ffffff'`             |
+| `font`                                     | str  | Font name or font family for all text.                                                                          | `'Arial'`               |
 | `offset_x`                                 | int  | Spacing on left side of the SVG in px.                                                                          | `50`                    |
 | `arrow_stroke`                             | int  | Width of arrow path in px.                                                                                      | `2`                     |
 | `arrow_width`                              | int  | Width of arrow head in px.                                                                                      | `10` / `8` (compact)    |
@ -264,10 +264,10 @@ If a setting is not present in the options, the default value will be used.
 > ```

 | Name                                    | Type | Description                                                                                                                                | Default                                                                                          |
-| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
+| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
 | `ents`                                  | list | Entity types to highlight (`None` for all types).                                                                                          | `None`                                                                                           |
 | `colors`                                | dict | Color overrides. Entity types in uppercase should be mapped to color names or values.                                                      | `{}`                                                                                             |
-| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
+| `template` <Tag variant="new">2.2</Tag> | str  | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |

 By default, displaCy comes with colors for all
 [entity types supported by spaCy](/api/annotation#named-entities). If you're
@ -309,8 +309,8 @@ Set custom path to the data directory where spaCy looks for models.
 > ```

 | Name   | Type         | Description                 |
-| ------ | ---------------- | --------------------------- |
-| `path` | unicode / `Path` | Path to new data directory. |
+| ------ | ------------ | --------------------------- |
+| `path` | str / `Path` | Path to new data directory. |

 ### util.get_lang_class {#util.get_lang_class tag="function"}

@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.

 | Name        | Type       | Description                            |
 | ----------- | ---------- | -------------------------------------- |
-| `lang`      | unicode    | Two-letter language code, e.g. `'en'`. |
+| `lang`      | str        | Two-letter language code, e.g. `'en'`. |
 | **RETURNS** | `Language` | Language class.                        |

 ### util.set_lang_class {#util.set_lang_class tag="function"}
@ -352,7 +352,7 @@ the two-letter language code.

 | Name   | Type       | Description                            |
 | ------ | ---------- | -------------------------------------- |
-| `name` | unicode    | Two-letter language code, e.g. `'en'`. |
+| `name` | str        | Two-letter language code, e.g. `'en'`. |
 | `cls`  | `Language` | The language class, e.g. `English`.    |

 ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
@ -369,8 +369,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.
 > ```

 | Name        | Type | Description                            |
-| ----------- | ------- | -------------------------------------- |
-| `name`      | unicode | Two-letter language code, e.g. `'en'`. |
+| ----------- | ---- | -------------------------------------- |
+| `name`      | str  | Two-letter language code, e.g. `'en'`. |
 | **RETURNS** | bool | Whether the class has been loaded.     |

 ### util.load_model {#util.load_model tag="function" new="2"}
@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk).

 | Name          | Type       | Description                                              |
 | ------------- | ---------- | -------------------------------------------------------- |
-| `name`        | unicode    | Package name, shortcut link or model path.               |
+| `name`        | str        | Package name, shortcut link or model path.               |
 | `**overrides` | -          | Specific overrides, like pipeline components to disable. |
 | **RETURNS**   | `Language` | `Language` class with the loaded model.                  |

@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet.

 | Name          | Type       | Description                                                                                          |
 | ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
-| `model_path`  | unicode    | Path to model data directory.                                                                        |
+| `model_path`  | str        | Path to model data directory.                                                                        |
 | `meta`        | dict       | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
 | `**overrides` | -          | Specific overrides, like pipeline components to disable.                                             |
 | **RETURNS**   | `Language` | `Language` class with the loaded model.                                                              |
@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's

 | Name          | Type       | Description                                              |
 | ------------- | ---------- | -------------------------------------------------------- |
-| `init_file`   | unicode    | Path to model's `__init__.py`, i.e. `__file__`.          |
+| `init_file`   | str        | Path to model's `__init__.py`, i.e. `__file__`.          |
 | `**overrides` | -          | Specific overrides, like pipeline components to disable. |
 | **RETURNS**   | `Language` | `Language` class with the loaded model.                  |

@ -447,8 +447,8 @@ Get a model's meta.json from a directory path and validate its contents.
 > ```

 | Name        | Type         | Description              |
-| ----------- | ---------------- | ------------------------ |
-| `path`      | unicode / `Path` | Path to model directory. |
+| ----------- | ------------ | ------------------------ |
+| `path`      | str / `Path` | Path to model directory. |
 | **RETURNS** | dict         | The model's meta data.   |

 ### util.is_package {#util.is_package tag="function"}
@ -464,8 +464,8 @@ Check if string maps to a package installed via pip. Mainly used to validate
 > ```

 | Name        | Type   | Description                                  |
-| ----------- | ------- | -------------------------------------------- |
-| `name`      | unicode | Name of package.                             |
+| ----------- | ------ | -------------------------------------------- |
+| `name`      | str    | Name of package.                             |
 | **RETURNS** | `bool` | `True` if installed package, `False` if not. |

 ### util.get_package_path {#util.get_package_path tag="function" new="2"}
@ -481,8 +481,8 @@ Get path to an installed package. Mainly used to resolve the location of
 > ```

 | Name           | Type   | Description                      |
-| -------------- | ------- | -------------------------------- |
-| `package_name` | unicode | Name of installed package.       |
+| -------------- | ------ | -------------------------------- |
+| `package_name` | str    | Name of installed package.       |
 | **RETURNS**    | `Path` | Path to model package directory. |

 ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@ -35,7 +35,7 @@ you can add vectors to later.
 | `data`      | `ndarray[ndim=1, dtype='float32']` | The vector data.                                                                                                                                                   |
 | `keys`      | iterable                           | A sequence of keys aligned with the data.                                                                                                                          |
 | `shape`     | tuple                              | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
-| `name`      | unicode                            | A name to identify the vectors table.                                                                                                                              |
+| `name`      | str                                | A name to identify the vectors table.                                                                                                                              |
 | **RETURNS** | `Vectors`                          | The newly created object.                                                                                                                                          |

 ## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the

 | Name        | Type                               | Description                                           |
 | ----------- | ---------------------------------- | ----------------------------------------------------- |
-| `key`       | unicode / int                      | The key to add.                                       |
+| `key`       | str / int                          | The key to add.                                       |
 | `vector`    | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key.                |
 | `row`       | int                                | An optional row number of a vector to map the key to. |
 | **RETURNS** | int                                | The row the vector was added to.                      |
@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa.

 | Name        | Type                                  | Description                                                              |
 | ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
-| `key`       | unicode / int                         | Find the row that the given key points to. Returns int, `-1` if missing. |
+| `key`       | str / int                             | Find the row that the given key points to. Returns int, `-1` if missing. |
 | `keys`      | iterable                              | Find rows that the keys point to. Returns `ndarray`.                     |
 | `row`       | int                                   | Find the first key that points to the row. Returns int.                  |
 | `rows`      | iterable                              | Find the keys that point to the rows. Returns ndarray.                   |
@ -338,8 +338,8 @@ Save the current state to a directory.
 > ```

 | Name   | Type         | Description                                                                                                           |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

 ## Vectors.from_disk {#from_disk tag="method"}

@ -353,8 +353,8 @@ Loads state from a directory. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `Vectors`    | The modified `Vectors` object.                                             |

 ## Vectors.to_bytes {#to_bytes tag="method"}
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@ -27,7 +27,7 @@ Create the vocabulary.
 | `tag_map`                                   | dict                 | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
 | `lemmatizer`                                | object               | A lemmatizer. Defaults to `None`.                                                                                  |
 | `strings`                                   | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings.        |
-| `vectors_name` <Tag variant="new">2.2</Tag> | unicode              | A name to identify the vectors table.                                                                              |
+| `vectors_name` <Tag variant="new">2.2</Tag> | str                  | A name to identify the vectors table.                                                                              |
 | **RETURNS**                                 | `Vocab`              | The newly constructed object.                                                                                      |

 ## Vocab.\_\_len\_\_ {#len tag="method"}
@ -92,8 +92,8 @@ given string, you need to look it up in
 > ```

 | Name        | Type | Description                                        |
-| ----------- | ------- | -------------------------------------------------- |
-| `string`    | unicode | The ID string.                                     |
+| ----------- | ---- | -------------------------------------------------- |
+| `string`    | str  | The ID string.                                     |
 | **RETURNS** | bool | Whether the string has an entry in the vocabulary. |

 ## Vocab.add_flag {#add_flag tag="method"}
@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.

 | Name          | Type | Description                                                                                                                                     |
 | ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value.                                                                                         |
+| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value.                                                                                             |
 | `flag_id`     | int  | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
 | **RETURNS**   | int  | The integer ID by which the flag value can be checked.                                                                                          |

@ -228,8 +228,8 @@ Save the current state to a directory.
 > ```

 | Name      | Type         | Description                                                                                                           |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## Vocab.from_disk {#from_disk tag="method" new="2"}
@ -244,8 +244,8 @@ Loads state from a directory. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `Vocab`      | The modified `Vocab` object.                                               |

--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
 ### Disabling the parser {#disabling}

 In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelines). If you don't need
-any of the syntactic information, you should disable the parser. Disabling the
-parser will make spaCy load and run much faster. If you want to load the parser,
-but need to disable it for specific documents, you can also control its use on
-the `nlp` object.
+the [standard processing pipeline](/usage/processing-pipelines). If you don't
+need any of the syntactic information, you should disable the parser. Disabling
+the parser will make spaCy load and run much faster. If you want to load the
+parser, but need to disable it for specific documents, you can also control its
+use on the `nlp` object.

 ```python
 nlp = spacy.load("en_core_web_sm", disable=["parser"])
@ -989,8 +989,8 @@ nlp.tokenizer = my_tokenizer
 ```

 | Argument    | Type  | Description               |
-| ----------- | ------- | ------------------------- |
-| `text`      | unicode | The raw text to tokenize. |
+| ----------- | ----- | ------------------------- |
+| `text`      | str   | The raw text to tokenize. |
 | **RETURNS** | `Doc` | The tokenized document.   |

 <Infobox title="Important note: using a custom tokenizer" variant="warning">
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@ -272,16 +272,16 @@ doc = nlp("I won't have named entities")
 disabled.restore()
 ```

-If you want to disable all pipes except for one or a few, you can use the `enable`
-keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string
-defining just one pipe.
+If you want to disable all pipes except for one or a few, you can use the
+`enable` keyword. Just like the `disable` keyword, it takes a list of pipe
+names, or a string defining just one pipe.
+
 ```python
 # Enable only the parser
 with nlp.select_pipes(enable="parser"):
    doc = nlp("I will only be parsed")
 ```

-
 Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
 to remove pipeline components from an existing pipeline, the
 [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
@ -350,11 +350,11 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
 > ```

 | Argument | Type | Description                                                              |
-| -------- | ------- | ------------------------------------------------------------------------ |
+| -------- | ---- | ------------------------------------------------------------------------ |
 | `last`   | bool | If set to `True`, component is added **last** in the pipeline (default). |
 | `first`  | bool | If set to `True`, component is added **first** in the pipeline.          |
-| `before` | unicode | String name of component to add the new component **before**.            |
-| `after`  | unicode | String name of component to add the new component **after**.             |
+| `before` | str  | String name of component to add the new component **before**.            |
+| `after`  | str  | String name of component to add the new component **after**.             |

 ### Example: A simple pipeline component {#custom-components-simple}

--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@ -158,17 +158,17 @@ The available token pattern keys correspond to a number of
 rule-based matching are:

 | Attribute                              | Type |  Description                                                                                           |
-| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
-| `ORTH`                                 | unicode | The exact verbatim text of a token.                                                                    |
-| `TEXT` <Tag variant="new">2.1</Tag>    | unicode | The exact verbatim text of a token.                                                                    |
-| `LOWER`                                | unicode | The lowercase form of the token text.                                                                  |
+| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
+| `ORTH`                                 | str  | The exact verbatim text of a token.                                                                    |
+| `TEXT` <Tag variant="new">2.1</Tag>    | str  | The exact verbatim text of a token.                                                                    |
+| `LOWER`                                | str  | The lowercase form of the token text.                                                                  |
 |  `LENGTH`                              | int  | The length of the token text.                                                                          |
 |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool | Token text consists of alphabetic characters, ASCII characters, digits.                                |
 |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool | Token text is in lowercase, uppercase, titlecase.                                                      |
 |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool | Token is punctuation, whitespace, stop word.                                                           |
 |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool | Token text resembles a number, URL, email.                                                             |
-|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape.                    |
-| `ENT_TYPE`                             | unicode | The token's entity label.                                                                              |
+|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str  | The token's simple and extended part-of-speech tag, dependency label, lemma, shape.                    |
+| `ENT_TYPE`                             | str  | The token's entity label.                                                                              |
 | `_` <Tag variant="new">2.1</Tag>       | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |

 <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!

 ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}

-When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
-the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
-to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to 
-extract matches based on the pattern's POS signature.
+When using a large number of **phrase patterns** (roughly > 10000) it's useful
+to understand how the `add_patterns` function of the EntityRuler works. For each
+**phrase pattern**, the EntityRuler calls the `nlp` object to construct a `Doc`
+object. This is done in case you add the EntityRuler at the end of an
+existing pipeline with, for example, a POS tagger and want to extract matches
+based on the pattern's POS signature.

-In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
+In this case you would pass a config value of `phrase_matcher_attr="POS"` for
+the EntityRuler.

-Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
+Running the full language pipeline across every pattern in a large list scales
+linearly, so it can take a long time for large numbers of phrase patterns.

-As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. 
+As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
+`nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with
+5,000-100,000 phrase patterns respectively.

-Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
+Even with this speedup (but especially if you're using an older version) the
+`add_patterns` function can still take a long time.

-An easy workaround to make this function run faster is disabling the other language pipes
-while adding the phrase patterns.
+An easy workaround to make this function run faster is disabling the other
+language pipes while adding the phrase patterns.

 ```python
 entityruler = EntityRuler(nlp)
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab))

 If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
 well, which includes the values of
-[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if
-they're serializable with msgpack).
+[extension attributes](/usage/processing-pipelines#custom-components-attributes)
+(if they're serializable with msgpack).

 <Infobox title="Important note on serializing extension attributes" variant="warning">

@ -667,8 +667,8 @@ define the language data to be loaded and the
 [processing pipeline](/usage/processing-pipelines) to execute.

 | Setting    | Type | Description                                                                                                                                                          |
-| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang`     | unicode | ID of the language class to initialize.                                                                                                                              |
+| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lang`     | str  | ID of the language class to initialize.                                                                                                                              |
 | `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |

 The `load()` method that comes with our model package templates will take care
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@ -68,11 +68,11 @@ arcs.
 </Infobox>

 | Argument  | Type | Description                                                 | Default     |
-| --------- | ------- | ----------------------------------------------------------- | ----------- |
+| --------- | ---- | ----------------------------------------------------------- | ----------- |
 | `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False`     |
-| `color`   | unicode | Text color (HEX, RGB or color names).                       | `"#000000"` |
-| `bg`      | unicode | Background color (HEX, RGB or color names).                 | `"#ffffff"` |
-| `font`    | unicode | Font name or font family for all text.                      | `"Arial"`   |
+| `color`   | str  | Text color (HEX, RGB or color names).                       | `"#000000"` |
+| `bg`      | str  | Background color (HEX, RGB or color names).                 | `"#ffffff"` |
+| `font`    | str  | Font name or font family for all text.                      | `"Arial"`   |

 For a list of all available options, see the
 [`displacy` API documentation](/api/top-level#displacy_options).