Mirror of https://github.com/explosion/spaCy.git (synced 2025-11-04 01:48:04 +03:00)

Merge branch 'develop' into master-tmp
This commit is contained in commit 810fce3bb1.

Makefile (4 lines changed)
@@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
 version := $(shell "bin/get-version.sh")
 
 dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
-	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
+	$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
 	chmod a+rx $@
 
 dist/pytest.pex : wheelhouse/pytest-*.whl
@@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
 
 wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
 	$(VENV)/bin/pip wheel . -w ./wheelhouse
-	$(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse
+	$(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
 	touch $@
 
 wheelhouse/pytest-%.whl : $(VENV)/bin/pex

examples/experiments/onto-joint/defaults.cfg (new file, 115 lines)
@@ -0,0 +1,115 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 0
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 400
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+#[optimizer.learn_rate]
+#@schedules = "warmup_linear.v1"
+#warmup_steps = 250
+#total_steps = 20000
+#initial_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.senter]
+factory = "senter"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.senter.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.senter.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 256
+depth = 6
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+subword_features = true
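The file above is a Thinc extended-INI config rather than plain INI: `@schedules`, `@optimizers` and `@architectures` are registry references, and `${training:vectors}`-style values are interpolated from other sections. A minimal sketch of loading it, assuming the thinc v8 alpha config API that this branch pins (the path and printed keys are just for illustration):

    # Sketch: load and inspect the training config with Thinc's Config API.
    from thinc.api import Config

    config = Config().from_disk("examples/experiments/onto-joint/defaults.cfg")
    print(config["training"]["dropout"])         # 0.1
    print(config["nlp"]["pipeline"]["tok2vec"])  # {'factory': 'tok2vec'}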
				
			
requirements.txt
@@ -13,9 +13,11 @@ numpy>=1.15.0
 requests>=2.13.0,<3.0.0
 plac>=0.9.6,<1.2.0
 tqdm>=4.38.0,<5.0.0
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
 pydantic>=1.3.0,<2.0.0
+# Official Python utilities
+setuptools
+packaging
+importlib_metadata>=0.20; python_version < "3.8"
 # Development dependencies
 cython>=0.25
 pytest>=4.6.5
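The jsonschema pin disappears because this branch validates configs and match patterns with pydantic instead, which is kept just above. A rough illustration of the pydantic style of validation (the model below is invented for the example; it is not spaCy's actual schema):

    # Illustrative pydantic validation; TrainingSettings is a made-up model.
    from pydantic import BaseModel, ValidationError

    class TrainingSettings(BaseModel):
        dropout: float = 0.1
        patience: int = 1600

    try:
        TrainingSettings(dropout="not-a-number")
    except ValidationError as err:
        print(err)  # names the offending field and the expected type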
				
			
setup.cfg
@@ -47,15 +47,17 @@ install_requires =
     wasabi>=0.4.0,<1.1.0
     srsly>=2.0.0,<3.0.0
     catalogue>=0.0.7,<1.1.0
-    ml_datasets
+    ml_datasets>=0.1.1
     # Third-party dependencies
-    tqdm>=4.38.0,<5.0.0
-    setuptools
     numpy>=1.15.0
     plac>=0.9.6,<1.2.0
     requests>=2.13.0,<3.0.0
     pydantic>=1.3.0,<2.0.0
+    tqdm>=4.38.0,<5.0.0
+    # Official Python utilities
+    setuptools
+    packaging
+    importlib_metadata>=0.20; python_version < "3.8"
 
 [options.extras_require]
 lookups =
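The `importlib_metadata>=0.20; python_version < "3.8"` line uses a PEP 508 environment marker: the back-port is installed only where the stdlib module is missing. The usual consumption pattern looks like this (a generic sketch, not code from this commit):

    # Generic stdlib/back-port import pattern for importlib.metadata.
    try:
        import importlib.metadata as importlib_metadata  # Python 3.8+
    except ImportError:
        import importlib_metadata  # back-port on older interpreters

    print(importlib_metadata.version("spacy"))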
				
			
spacy/about.py
@@ -1,6 +1,6 @@
 # fmt: off
 __title__ = "spacy"
-__version__ = "3.0.0.dev8"
+__version__ = "3.0.0.dev9"
 __release__ = True
 __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
 __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
spacy/cli/converters/conllu2json.py
@@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
     final entity type with `ner_map` if mapping present. Entity tag is 'O' if
     the pattern is not matched.
 
-    lines (unicode): CONLL-U lines for one sentences
-    tag_pattern (unicode): Regex pattern for entity tag
+    lines (str): CONLL-U lines for one sentences
+    tag_pattern (str): Regex pattern for entity tag
     ner_map (dict): Map old NER tag names to new ones, '' maps to O.
     RETURNS (list): List of BILUO entity tags
     """
@@ -187,8 +187,8 @@ def example_from_conllu_sentence(
     """Create an Example from the lines for one CoNLL-U sentence, merging
     subtokens and appending morphology to tags if required.
 
-    lines (unicode): The non-comment lines for a CoNLL-U sentence
-    ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+    lines (str): The non-comment lines for a CoNLL-U sentence
+    ner_tag_pattern (str): The regex pattern for matching NER in MISC col
     RETURNS (Example): An example containing the annotation
     """
     # create a Doc with each subtoken as its own token
spacy/cli/download.py
@@ -5,6 +5,7 @@ import sys
 from wasabi import msg
 
 from .. import about
+from ..util import is_package, get_base_version
 
 
 def download(
@@ -17,7 +18,7 @@ def download(
     flag is set, the command expects the full model name with version.
     For direct downloads, the compatibility check will be skipped.
     """
-    if not require_package("spacy") and "--no-deps" not in pip_args:
+    if not is_package("spacy") and "--no-deps" not in pip_args:
         msg.warn(
             "Skipping model package dependencies and setting `--no-deps`. "
             "You don't seem to have the spaCy package itself installed "
@@ -45,21 +46,6 @@ def download(
             "Download and installation successful",
             f"You can now load the model via spacy.load('{model_name}')",
         )
-        # If a model is downloaded and then loaded within the same process, our
-        # is_package check currently fails, because pkg_resources.working_set
-        # is not refreshed automatically (see #3923). We're trying to work
-        # around this here be requiring the package explicitly.
-        require_package(model_name)
-
-
-def require_package(name):
-    try:
-        import pkg_resources
-
-        pkg_resources.working_set.require(name)
-        return True
-    except:  # noqa: E722
-        return False
 
 
 def get_json(url, desc):
@@ -77,8 +63,7 @@ def get_json(url, desc):
 
 
 def get_compatibility():
-    version = about.__version__
-    version = version.rsplit(".dev", 1)[0]
+    version = get_base_version(about.__version__)
     comp_table = get_json(about.__compatibility__, "compatibility table")
     comp = comp_table["spacy"]
     if version not in comp:
@@ -87,7 +72,7 @@ def get_compatibility():
 
 
 def get_version(model, comp):
-    model = model.rsplit(".dev", 1)[0]
+    model = get_base_version(model)
     if model not in comp:
         msg.fail(
             f"No compatible model found for '{model}' (spaCy v{about.__version__})",
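Several hunks in this file swap manual `.rsplit(".dev", 1)` string surgery for a shared `get_base_version` helper from spacy.util. Its implementation isn't part of this diff; a sketch of what it plausibly boils down to, using the packaging library this merge adds to the requirements:

    # Sketch of a base-version helper built on packaging.version (assumption,
    # not the verbatim spacy.util source).
    from packaging.version import Version

    def get_base_version(version):
        # Strip dev/pre/post suffixes: '3.0.0.dev9' -> '3.0.0'.
        return Version(version).base_version

    assert get_base_version("3.0.0.dev9") == "3.0.0"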
				
			
spacy/cli/info.py
@@ -48,7 +48,9 @@ def info(
         "Location": str(Path(__file__).parent.parent),
         "Platform": platform.platform(),
         "Python version": platform.python_version(),
-        "Models": ", ".join(model["name"] for model in all_models.values()),
+        "Models": ", ".join(
+            f"{m['name']} ({m['version']})" for m in all_models.values()
+        ),
     }
     if not silent:
         title = "Info about spaCy"
@@ -63,7 +65,7 @@ def print_markdown(data, title=None):
     """Print data in GitHub-flavoured Markdown format for issues etc.
 
     data (dict or list of tuples): Label/value pairs.
-    title (unicode or None): Title, will be rendered as headline 2.
+    title (str / None): Title, will be rendered as headline 2.
     """
     markdown = []
     for key, value in data.items():
spacy/cli/package.py
@@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
         ("lang", "Model language", meta.get("lang", "en")),
         ("name", "Model name", meta.get("name", "model")),
         ("version", "Model version", meta.get("version", "0.0.0")),
-        ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
         ("description", "Model description", meta.get("description", False)),
         ("author", "Author", meta.get("author", False)),
         ("email", "Author email", meta.get("email", False)),
         ("url", "Author website", meta.get("url", False)),
-        ("license", "License", meta.get("license", "CC BY-SA 3.0")),
+        ("license", "License", meta.get("license", "MIT")),
     ]
     nlp = util.load_model_from_path(Path(model_path))
+    meta["spacy_version"] = util.get_model_version_range(about.__version__)
     meta["pipeline"] = nlp.pipe_names
     meta["vectors"] = {
         "width": nlp.vocab.vectors_length,
@@ -168,6 +168,7 @@ def setup_package():
         package_data={model_name: list_files(model_dir)},
         install_requires=list_requirements(meta),
         zip_safe=False,
+        entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
     )
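`util.get_model_version_range` replaces the hard-coded `>={version},<3.0.0` pin with a range derived from the current version. The helper itself isn't shown in this diff; a plausible sketch of the semantics, assuming it pins models to the current minor series:

    # Assumed semantics: pin model compatibility to the current minor series.
    from packaging.version import Version

    def get_model_version_range(spacy_version):
        major, minor = Version(spacy_version).release[:2]
        return f">={spacy_version},<{major}.{minor + 1}.0"

    print(get_model_version_range("3.0.0.dev9"))  # >=3.0.0.dev9,<3.1.0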
				
			
spacy/cli/train.py
@@ -483,7 +483,6 @@ def train(
                     # Update model meta.json
                     meta["lang"] = nlp.lang
                     meta["pipeline"] = nlp.pipe_names
-                    meta["spacy_version"] = f">={about.__version__}"
                     if beam_width == 1:
                         meta["speed"] = {
                             "nwords": nwords,
spacy/cli/train_from_config.py
@@ -7,7 +7,7 @@ from pathlib import Path
 from wasabi import msg
 import thinc
 import thinc.schedules
-from thinc.api import Model
+from thinc.api import Model, use_pytorch_for_gpu_memory
 import random
 
 from ..gold import GoldCorpus
@@ -171,6 +171,8 @@ def train_from_config(
     msg.info(f"Loading config from: {config_path}")
     config = util.load_config(config_path, create_objects=False)
     util.fix_random_seed(config["training"]["seed"])
+    if config["training"]["use_pytorch_for_gpu_memory"]:
+        use_pytorch_for_gpu_memory()
     nlp_config = config["nlp"]
     config = util.load_config(config_path, create_objects=True)
     msg.info("Creating nlp from config")
@@ -213,6 +215,12 @@ def train_from_config(
                 if is_best_checkpoint and output_path is not None:
                     nlp.to_disk(output_path)
                 progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
+            # Clean up the objects to faciliate garbage collection.
+            for eg in batch:
+                eg.doc = None
+                eg.goldparse = None
+                eg.doc_annotation = None
+                eg.token_annotation = None
     finally:
         if output_path is not None:
             final_model_path = output_path / "model-final"
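`use_pytorch_for_gpu_memory` is a Thinc 8 helper that routes cupy's GPU allocations through PyTorch's memory allocator, so the two frameworks don't fight over device memory. Gating it behind a config flag, as the hunk above does, keeps CPU-only runs untouched; roughly:

    # Sketch of the gating logic added above, assuming thinc v8.
    from thinc.api import use_pytorch_for_gpu_memory

    training = {"use_pytorch_for_gpu_memory": False}  # e.g. from defaults.cfg
    if training["use_pytorch_for_gpu_memory"]:
        # Only meaningful with cupy + torch installed and a GPU active.
        use_pytorch_for_gpu_memory()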
				
			
spacy/cli/validate.py
@@ -4,6 +4,8 @@ import requests
 from wasabi import msg
 
 from .. import about
+from ..util import get_package_version, get_installed_models, get_base_version
+from ..util import get_package_path, get_model_meta, is_compatible_version
 
 
 def validate():
@@ -12,7 +14,7 @@ def validate():
     with the installed models. Should be run after `pip install -U spacy`.
     """
     model_pkgs, compat = get_model_pkgs()
-    spacy_version = about.__version__.rsplit(".dev", 1)[0]
+    spacy_version = get_base_version(about.__version__)
     current_compat = compat.get(spacy_version, {})
     if not current_compat:
         msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
@@ -25,7 +27,7 @@ def validate():
     msg.info(f"spaCy installation: {spacy_dir}")
 
     if model_pkgs:
-        header = ("NAME", "VERSION", "")
+        header = ("NAME", "SPACY", "VERSION", "")
         rows = []
         for name, data in model_pkgs.items():
             if data["compat"]:
@@ -34,7 +36,7 @@ def validate():
             else:
                 version = msg.text(data["version"], color="red", no_print=True)
                 comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
-            rows.append((data["name"], version, comp))
+            rows.append((data["name"], data["spacy"], version, comp))
         msg.table(rows, header=header)
     else:
         msg.text("No models found in your current environment.", exits=0)
@@ -44,8 +46,9 @@ def validate():
         cmd = "python -m spacy download {}"
         print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
     if na_models:
-        msg.warn(
-            f"The following models are not available for spaCy v{about.__version__}:",
+        msg.info(
+            f"The following models are custom spaCy models or not "
+            f"available for spaCy v{about.__version__}:",
             ", ".join(na_models),
         )
     if incompat_models:
@@ -53,8 +56,6 @@ def validate():
 
 
 def get_model_pkgs():
-    import pkg_resources
-
     with msg.loading("Loading compatibility table..."):
         r = requests.get(about.__compatibility__)
         if r.status_code != 200:
@@ -66,19 +67,28 @@ def get_model_pkgs():
     msg.good("Loaded compatibility table")
     compat = r.json()["spacy"]
     all_models = set()
+    installed_models = get_installed_models()
     for spacy_v, models in dict(compat).items():
         all_models.update(models.keys())
         for model, model_vs in models.items():
             compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
     pkgs = {}
-    for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
+    for pkg_name in installed_models:
         package = pkg_name.replace("-", "_")
-        if package in all_models:
-            version = pkg_data.version
-            pkgs[pkg_name] = {
-                "name": package,
-                "version": version,
-                "compat": package in compat and version in compat[package],
-            }
+        version = get_package_version(pkg_name)
+        if package in compat:
+            is_compat = version in compat[package]
+            spacy_version = about.__version__
+        else:
+            model_path = get_package_path(package)
+            model_meta = get_model_meta(model_path)
+            spacy_version = model_meta.get("spacy_version", "n/a")
+            is_compat = is_compatible_version(about.__version__, spacy_version)
+        pkgs[pkg_name] = {
+            "name": package,
+            "version": version,
+            "spacy": spacy_version,
+            "compat": is_compat,
+        }
     return pkgs, compat
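`is_compatible_version` checks the installed spaCy version against the constraint a model declares in its meta.json (`spacy_version`, e.g. ">=3.0.0.dev8,<3.1.0"). The real helper lives in spacy/util.py and isn't in this diff; a sketch of the expected behaviour using packaging (the prerelease handling below is an assumption):

    # Assumed behaviour: PEP 440 constraint check with packaging.
    from packaging.specifiers import SpecifierSet
    from packaging.version import Version

    def is_compatible_version(version, constraint):
        return Version(version) in SpecifierSet(constraint, prereleases=True)

    print(is_compatible_version("3.0.0.dev9", ">=3.0.0.dev8,<3.1.0"))  # True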
				
			
spacy/displacy/__init__.py
@@ -22,13 +22,13 @@ def render(
     """Render displaCy visualisation.
 
     docs (list or Doc): Document(s) to visualise.
-    style (unicode): Visualisation style, 'dep' or 'ent'.
+    style (str): Visualisation style, 'dep' or 'ent'.
     page (bool): Render markup as full HTML page.
     minify (bool): Minify HTML markup.
     jupyter (bool): Override Jupyter auto-detection.
     options (dict): Visualiser-specific options, e.g. colors.
     manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
-    RETURNS (unicode): Rendered HTML markup.
+    RETURNS (str): Rendered HTML markup.
 
     DOCS: https://spacy.io/api/top-level#displacy.render
     USAGE: https://spacy.io/usage/visualizers
@@ -73,13 +73,13 @@ def serve(
     """Serve displaCy visualisation.
 
     docs (list or Doc): Document(s) to visualise.
-    style (unicode): Visualisation style, 'dep' or 'ent'.
+    style (str): Visualisation style, 'dep' or 'ent'.
     page (bool): Render markup as full HTML page.
     minify (bool): Minify HTML markup.
     options (dict): Visualiser-specific options, e.g. colors.
     manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
     port (int): Port to serve visualisation.
-    host (unicode): Host to serve visualisation.
+    host (str): Host to serve visualisation.
 
     DOCS: https://spacy.io/api/top-level#displacy.serve
     USAGE: https://spacy.io/usage/visualizers
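For context, these are displaCy's two public entry points; the docstring changes above don't alter usage:

    # Typical usage of the render() function documented above.
    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
    doc = nlp("Autonomous cars shift insurance liability toward manufacturers.")
    html = displacy.render(doc, style="dep", page=True)  # rendered HTML (str)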
				
			
spacy/displacy/render.py
@@ -47,7 +47,7 @@ class DependencyRenderer(object):
         parsed (list): Dependency parses to render.
         page (bool): Render parses wrapped as full HTML page.
         minify (bool): Minify HTML markup.
-        RETURNS (unicode): Rendered SVG or HTML markup.
+        RETURNS (str): Rendered SVG or HTML markup.
         """
         # Create a random ID prefix to make sure parses don't receive the
         # same ID, even if they're identical
@@ -78,7 +78,7 @@ class DependencyRenderer(object):
         render_id (int): Unique ID, typically index of document.
         words (list): Individual words and their tags.
         arcs (list): Individual arcs and their start, end, direction and label.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
         """
         self.levels = self.get_levels(arcs)
         self.highest_level = len(self.levels)
@@ -112,10 +112,10 @@ class DependencyRenderer(object):
     ):
         """Render individual word.
 
-        text (unicode): Word text.
-        tag (unicode): Part-of-speech tag.
+        text (str): Word text.
+        tag (str): Part-of-speech tag.
         i (int): Unique ID, typically word index.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
         """
         y = self.offset_y + self.word_spacing
         x = self.offset_x + i * self.distance
@@ -131,12 +131,12 @@ class DependencyRenderer(object):
     def render_arrow(self, label, start, end, direction, i):
         """Render individual arrow.
 
-        label (unicode): Dependency label.
+        label (str): Dependency label.
         start (int): Index of start word.
         end (int): Index of end word.
-        direction (unicode): Arrow direction, 'left' or 'right'.
+        direction (str): Arrow direction, 'left' or 'right'.
         i (int): Unique ID, typically arrow index.
-        RETURNS (unicode): Rendered SVG markup.
+        RETURNS (str): Rendered SVG markup.
         """
         if start < 0 or end < 0:
             error_args = dict(start=start, end=end, label=label, dir=direction)
@@ -179,7 +179,7 @@ class DependencyRenderer(object):
         y (int): Y-coordinate of arrow start and end point.
         y_curve (int): Y-corrdinate of Cubic Bézier y_curve point.
         x_end (int): X-coordinate of arrow end point.
-        RETURNS (unicode): Definition of the arc path ('d' attribute).
+        RETURNS (str): Definition of the arc path ('d' attribute).
         """
         template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
         if self.compact:
@@ -189,11 +189,11 @@ class DependencyRenderer(object):
     def get_arrowhead(self, direction, x, y, end):
         """Render individual arrow head.
 
-        direction (unicode): Arrow direction, 'left' or 'right'.
+        direction (str): Arrow direction, 'left' or 'right'.
         x (int): X-coordinate of arrow start point.
         y (int): Y-coordinate of arrow start and end point.
         end (int): X-coordinate of arrow end point.
-        RETURNS (unicode): Definition of the arrow head path ('d' attribute).
+        RETURNS (str): Definition of the arrow head path ('d' attribute).
         """
         if direction == "left":
             pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@@ -279,7 +279,7 @@ class EntityRenderer(object):
         parsed (list): Dependency parses to render.
         page (bool): Render parses wrapped as full HTML page.
         minify (bool): Minify HTML markup.
-        RETURNS (unicode): Rendered HTML markup.
+        RETURNS (str): Rendered HTML markup.
         """
         rendered = []
         for i, p in enumerate(parsed):
@@ -300,9 +300,9 @@ class EntityRenderer(object):
     def render_ents(self, text, spans, title):
         """Render entities in text.
 
-        text (unicode): Original text.
+        text (str): Original text.
         spans (list): Individual entity spans and their start, end and label.
-        title (unicode or None): Document title set in Doc.user_data['title'].
+        title (str / None): Document title set in Doc.user_data['title'].
         """
         markup = ""
         offset = 0
spacy/errors.py
@@ -113,9 +113,12 @@ class Warnings(object):
             "ignored during training.")
 
     # TODO: fix numbering after merging develop into master
-    W095 = ("Skipping unsupported morphological feature(s): {feature}. "
-            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
-            "string \"Field1=Value1,Value2|Field2=Value3\".")
+    W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
+            "incompatible with the current version ({current}). This may lead "
+            "to unexpected results or runtime errors. To resolve this, "
+            "download a newer compatible model or retrain your custom model "
+            "with the current spaCy version. For more details and available "
+            "updates, run: python -m spacy validate")
     W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
             "instead.")
     W097 = ("No Model config was provided to create the '{name}' component, "
@@ -124,6 +127,9 @@ class Warnings(object):
             "so a default configuration was used.")
     W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
             "but got '{type}' instead, so ignoring it.")
+    W100 = ("Skipping unsupported morphological feature(s): {feature}. "
+            "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
+            "string \"Field1=Value1,Value2|Field2=Value3\".")
 
 
 @add_codes
@@ -621,7 +627,7 @@ class MatchPatternError(ValueError):
     def __init__(self, key, errors):
         """Custom error for validating match patterns.
 
-        key (unicode): The name of the matcher rule.
+        key (str): The name of the matcher rule.
         errors (dict): Validation errors (sequence of strings) mapped to pattern
             ID, i.e. the index of the added pattern.
         """
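The renumbering keeps W095 free for the new model-compatibility warning while the old morphology message moves to W100. A sketch of how the new W095 template gets filled in (the actual call site is elsewhere; the values here are illustrative):

    # Illustrative use of the new W095 message template.
    import warnings
    from spacy.errors import Warnings

    warnings.warn(Warnings.W095.format(
        model="en_core_web_sm", model_version="2.2.5",
        version=">=2.2.0,<2.3.0", current="3.0.0.dev9"))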
				
			
spacy/glossary.py
@@ -1,8 +1,8 @@
 def explain(term):
     """Get a description for a given POS tag, dependency label or entity type.
 
-    term (unicode): The term to explain.
-    RETURNS (unicode): The explanation, or `None` if not found in the glossary.
+    term (str): The term to explain.
+    RETURNS (str): The explanation, or `None` if not found in the glossary.
 
     EXAMPLE:
         >>> spacy.explain(u'NORP')
spacy/gold.pyx
@@ -154,8 +154,8 @@ class GoldCorpus(object):
     def __init__(self, train, dev, gold_preproc=False, limit=None):
         """Create a GoldCorpus.
 
-        train (unicode or Path): File or directory of training data.
-        dev (unicode or Path): File or directory of development data.
+        train (str / Path): File or directory of training data.
+        dev (str / Path): File or directory of development data.
         RETURNS (GoldCorpus): The newly created object.
         """
         self.limit = limit
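Per the signature above, a corpus wraps one training source and one development source; a minimal construction sketch (the paths are illustrative):

    # Minimal GoldCorpus construction per the __init__ documented above.
    from spacy.gold import GoldCorpus

    corpus = GoldCorpus("train.json", "dev.json", gold_preproc=False, limit=1000)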
				
			
spacy/kb.pyx
@@ -38,7 +38,7 @@ cdef class Candidate:
 
     @property
     def entity_(self):
-        """RETURNS (unicode): ID/name of this entity in the KB"""
+        """RETURNS (str): ID/name of this entity in the KB"""
         return self.kb.vocab.strings[self.entity_hash]
 
     @property
@@ -48,7 +48,7 @@ cdef class Candidate:
 
     @property
     def alias_(self):
-        """RETURNS (unicode): ID of the original alias"""
+        """RETURNS (str): ID of the original alias"""
         return self.kb.vocab.strings[self.alias_hash]
 
     @property
spacy/language.py
@@ -17,7 +17,8 @@ from .tokens.underscore import Underscore
 from .vocab import Vocab
 from .lemmatizer import Lemmatizer
 from .lookups import Lookups
-from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import count_pipeline_interdependencies
 from .gold import Example
 from .scorer import Scorer
 from .util import link_vectors_to_models, create_default_optimizer, registry
@@ -127,7 +128,7 @@ class Language(object):
 
     Defaults (class): Settings, data and factory methods for creating the `nlp`
         object and processing pipeline.
-    lang (unicode): Two-letter language ID, i.e. ISO code.
+    lang (str): Two-letter language ID, i.e. ISO code.
 
     DOCS: https://spacy.io/api/language
     """
@@ -196,13 +197,14 @@ class Language(object):
 
     @property
     def meta(self):
+        spacy_version = util.get_model_version_range(about.__version__)
         if self.vocab.lang:
             self._meta.setdefault("lang", self.vocab.lang)
         else:
             self._meta.setdefault("lang", self.lang)
         self._meta.setdefault("name", "model")
         self._meta.setdefault("version", "0.0.0")
-        self._meta.setdefault("spacy_version", f">={about.__version__}")
+        self._meta.setdefault("spacy_version", spacy_version)
         self._meta.setdefault("description", "")
         self._meta.setdefault("author", "")
         self._meta.setdefault("email", "")
@@ -292,7 +294,7 @@ class Language(object):
     def get_pipe(self, name):
         """Get a pipeline component for a given component name.
 
-        name (unicode): Name of pipeline component to get.
+        name (str): Name of pipeline component to get.
         RETURNS (callable): The pipeline component.
 
         DOCS: https://spacy.io/api/language#get_pipe
@@ -305,7 +307,7 @@ class Language(object):
     def create_pipe(self, name, config=dict()):
         """Create a pipeline component from a factory.
 
-        name (unicode): Factory name to look up in `Language.factories`.
+        name (str): Factory name to look up in `Language.factories`.
         config (dict): Configuration parameters to initialise component.
         RETURNS (callable): Pipeline component.
 
@@ -348,12 +350,12 @@ class Language(object):
         of before/after/first/last can be set. Default behaviour is "last".
 
         component (callable): The pipeline component.
-        name (unicode): Name of pipeline component. Overwrites existing
+        name (str): Name of pipeline component. Overwrites existing
             component.name attribute if available. If no name is set and
             the component exposes no name attribute, component.__name__ is
             used. An error is raised if a name already exists in the pipeline.
-        before (unicode): Component name to insert component directly before.
-        after (unicode): Component name to insert component directly after.
+        before (str): Component name to insert component directly before.
+        after (str): Component name to insert component directly after.
         first (bool): Insert component first / not first in the pipeline.
         last (bool): Insert component last / not last in the pipeline.
 
@@ -394,7 +396,7 @@ class Language(object):
         """Check if a component name is present in the pipeline. Equivalent to
         `name in nlp.pipe_names`.
 
-        name (unicode): Name of the component.
+        name (str): Name of the component.
         RETURNS (bool): Whether a component of the name exists in the pipeline.
 
         DOCS: https://spacy.io/api/language#has_pipe
@@ -404,7 +406,7 @@ class Language(object):
     def replace_pipe(self, name, component):
         """Replace a component in the pipeline.
 
-        name (unicode): Name of the component to replace.
+        name (str): Name of the component to replace.
         component (callable): Pipeline component.
 
         DOCS: https://spacy.io/api/language#replace_pipe
@@ -423,8 +425,8 @@ class Language(object):
     def rename_pipe(self, old_name, new_name):
         """Rename a pipeline component.
 
-        old_name (unicode): Name of the component to rename.
-        new_name (unicode): New name of the component.
+        old_name (str): Name of the component to rename.
+        new_name (str): New name of the component.
 
         DOCS: https://spacy.io/api/language#rename_pipe
         """
@@ -438,7 +440,7 @@ class Language(object):
     def remove_pipe(self, name):
         """Remove a component from the pipeline.
 
-        name (unicode): Name of the component to remove.
+        name (str): Name of the component to remove.
         RETURNS (tuple): A `(name, component)` tuple of the removed component.
 
         DOCS: https://spacy.io/api/language#remove_pipe
@@ -455,7 +457,7 @@ class Language(object):
         and can contain arbitrary whitespace. Alignment into the original string
         is preserved.
 
-        text (unicode): The text to be processed.
+        text (str): The text to be processed.
         disable (list): Names of the pipeline components to disable.
         component_cfg (dict): An optional dictionary with extra keyword arguments
             for specific components.
@@ -564,13 +566,14 @@ class Language(object):
 
         if component_cfg is None:
            component_cfg = {}
+        component_deps = count_pipeline_interdependencies(self.pipeline)
         # Determine whether component should set annotations. In theory I guess
         # we should do this by inspecting the meta? Or we could just always
         # say "yes"
-        for name, proc in self.pipeline:
+        for i, (name, proc) in enumerate(self.pipeline):
             component_cfg.setdefault(name, {})
             component_cfg[name].setdefault("drop", drop)
-            component_cfg[name].setdefault("set_annotations", False)
+            component_cfg[name]["set_annotations"] = bool(component_deps[i])
         for name, proc in self.pipeline:
             if not hasattr(proc, "update"):
                 continue
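`count_pipeline_interdependencies` comes from the new spacy/pipe_analysis.py module imported at the top of this file: a component only needs to set annotations during update() if some later component requires what it assigns. A sketch of that counting logic, inferred from how it's used here rather than copied from the module:

    # Sketch: for each component, count later components consuming its output.
    def count_pipeline_interdependencies(pipeline):
        assigns = [set(getattr(proc, "assigns", [])) for _, proc in pipeline]
        requires = [set(getattr(proc, "requires", [])) for _, proc in pipeline]
        return [
            sum(1 for req in requires[i + 1:] if produced & req)
            for i, produced in enumerate(assigns)
        ]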
				
			
spacy/language.py (continued)
@@ -938,7 +941,7 @@ class Language(object):
         """Save the current state to a directory.  If a model is loaded, this
         will include the model.
 
-        path (unicode or Path): Path to a directory, which will be created if
+        path (str / Path): Path to a directory, which will be created if
             it doesn't exist.
         exclude (list): Names of components or serialization fields to exclude.
 
@@ -972,7 +975,7 @@ class Language(object):
         returns it. If the saved `Language` object contains a model, the
         model will be loaded.
 
-        path (unicode or Path): A path to a directory.
+        path (str / Path): A path to a directory.
         exclude (list): Names of components or serialization fields to exclude.
         RETURNS (Language): The modified `Language` object.
 
@@ -1090,7 +1093,7 @@ class component(object):
     ):
         """Decorate a pipeline component.
 
-        name (unicode): Default component and factory name.
+        name (str): Default component and factory name.
         assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
         requires (list): Attributes required by component, e.g. `["token.dep"]`.
         retokenizes (bool): Whether the component changes the tokenization.
spacy/lemmatizer.py
@@ -30,8 +30,8 @@ class Lemmatizer(object):
    def __call__(self, string, univ_pos, morphology=None):
        """Lemmatize a string.
 
-        string (unicode): The string to lemmatize, e.g. the token text.
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        string (str): The string to lemmatize, e.g. the token text.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         RETURNS (list): The available lemmas for the string.
@@ -69,7 +69,7 @@ class Lemmatizer(object):
         Check whether we're dealing with an uninflected paradigm, so we can
         avoid lemmatization entirely.
 
-        univ_pos (unicode / int): The token's universal part-of-speech tag.
+        univ_pos (str / int): The token's universal part-of-speech tag.
         morphology (dict): The token's morphological features following the
             Universal Dependencies scheme.
         """
@@ -126,10 +126,10 @@ class Lemmatizer(object):
         """Look up a lemma in the table, if available. If no lemma is found,
         the original string is returned.
 
-        string (unicode): The original string.
+        string (str): The original string.
         orth (int): Optional hash of the string to look up. If not set, the
             string will be used and hashed.
-        RETURNS (unicode): The lemma if the string was found, otherwise the
+        RETURNS (str): The lemma if the string was found, otherwise the
             original string.
         """
         lookup_table = self.lookups.get_table("lemma_lookup", {})
spacy/lexeme.pyx
@@ -164,7 +164,7 @@ cdef class Lexeme:
             self.vocab.set_vector(self.c.orth, vector)
 
     property rank:
-        """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
+        """RETURNS (str): Sequential ID of the lexemes's lexical type, used
             to index into tables, e.g. for word vectors."""
         def __get__(self):
             return self.c.id
@@ -187,18 +187,18 @@ cdef class Lexeme:
 
     @property
     def orth_(self):
-        """RETURNS (unicode): The original verbatim text of the lexeme
+        """RETURNS (str): The original verbatim text of the lexeme
             (identical to `Lexeme.text`). Exists mostly for consistency with
             the other attributes."""
         return self.vocab.strings[self.c.orth]
 
     @property
     def text(self):
-        """RETURNS (unicode): The original verbatim text of the lexeme."""
+        """RETURNS (str): The original verbatim text of the lexeme."""
         return self.orth_
 
     property lower:
-        """RETURNS (unicode): Lowercase form of the lexeme."""
+        """RETURNS (str): Lowercase form of the lexeme."""
         def __get__(self):
             return self.c.lower
 
@@ -281,7 +281,7 @@ cdef class Lexeme:
             prob_table[self.c.orth] = x
 
     property lower_:
-        """RETURNS (unicode): Lowercase form of the word."""
+        """RETURNS (str): Lowercase form of the word."""
         def __get__(self):
             return self.vocab.strings[self.c.lower]
 
@@ -289,7 +289,7 @@ cdef class Lexeme:
             self.c.lower = self.vocab.strings.add(x)
 
     property norm_:
-        """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
+        """RETURNS (str): The lexemes's norm, i.e. a normalised form of the
             lexeme text.
         """
         def __get__(self):
@@ -299,7 +299,7 @@ cdef class Lexeme:
             self.norm = self.vocab.strings.add(x)
 
     property shape_:
-        """RETURNS (unicode): Transform of the word's string, to show
+        """RETURNS (str): Transform of the word's string, to show
             orthographic features.
         """
         def __get__(self):
@@ -309,7 +309,7 @@ cdef class Lexeme:
             self.c.shape = self.vocab.strings.add(x)
 
     property prefix_:
-        """RETURNS (unicode): Length-N substring from the start of the word.
+        """RETURNS (str): Length-N substring from the start of the word.
             Defaults to `N=1`.
         """
         def __get__(self):
@@ -319,7 +319,7 @@ cdef class Lexeme:
             self.c.prefix = self.vocab.strings.add(x)
 
     property suffix_:
-        """RETURNS (unicode): Length-N substring from the end of the word.
+        """RETURNS (str): Length-N substring from the end of the word.
             Defaults to `N=3`.
         """
         def __get__(self):
@@ -329,7 +329,7 @@ cdef class Lexeme:
             self.c.suffix = self.vocab.strings.add(x)
 
     property lang_:
-        """RETURNS (unicode): Language of the parent vocabulary."""
+        """RETURNS (str): Language of the parent vocabulary."""
         def __get__(self):
             return self.vocab.strings[self.c.lang]
 
spacy/lookups.py
@@ -31,7 +31,7 @@ class Lookups(object):
         """Check if the lookups contain a table of a given name. Delegates to
         Lookups.has_table.
 
-        name (unicode): Name of the table.
+        name (str): Name of the table.
         RETURNS (bool): Whether a table of that name is in the lookups.
         """
         return self.has_table(name)
@@ -48,7 +48,7 @@ class Lookups(object):
     def add_table(self, name, data=SimpleFrozenDict()):
         """Add a new table to the lookups. Raises an error if the table exists.
 
-        name (unicode): Unique name of table.
+        name (str): Unique name of table.
         data (dict): Optional data to add to the table.
         RETURNS (Table): The newly added table.
 
@@ -64,7 +64,7 @@ class Lookups(object):
         """Get a table. Raises an error if the table doesn't exist and no
         default value is provided.
 
-        name (unicode): Name of the table.
+        name (str): Name of the table.
         default: Optional default value to return if table doesn't exist.
         RETURNS (Table): The table.
 
@@ -79,7 +79,7 @@ class Lookups(object):
     def remove_table(self, name):
         """Remove a table. Raises an error if the table doesn't exist.
 
-        name (unicode): Name of the table to remove.
+        name (str): Name of the table to remove.
         RETURNS (Table): The removed table.
 
         DOCS: https://spacy.io/api/lookups#remove_table
@@ -91,7 +91,7 @@ class Lookups(object):
     def has_table(self, name):
         """Check if the lookups contain a table of a given name.
 
-        name (unicode): Name of the table.
+        name (str): Name of the table.
         RETURNS (bool): Whether a table of that name exists.
 
         DOCS: https://spacy.io/api/lookups#has_table
@@ -125,7 +125,7 @@ class Lookups(object):
         """Save the lookups to a directory as lookups.bin. Expects a path to a
         directory, which will be created if it doesn't exist.
 
-        path (unicode / Path): The file path.
+        path (str / Path): The file path.
 
         DOCS: https://spacy.io/api/lookups#to_disk
         """
@@ -141,7 +141,7 @@ class Lookups(object):
         """Load lookups from a directory containing a lookups.bin. Will skip
         loading if the file doesn't exist.
 
-        path (unicode / Path): The directory path.
+        path (str / Path): The directory path.
         RETURNS (Lookups): The loaded lookups.
 
         DOCS: https://spacy.io/api/lookups#from_disk
@@ -167,7 +167,7 @@ class Table(OrderedDict):
         """Initialize a new table from a dict.
 
         data (dict): The dictionary.
-        name (unicode): Optional table name for reference.
+        name (str): Optional table name for reference.
         RETURNS (Table): The newly created object.
 
         DOCS: https://spacy.io/api/lookups#table.from_dict
@@ -179,7 +179,7 @@ class Table(OrderedDict):
     def __init__(self, name=None, data=None):
         """Initialize a new table.
 
-        name (unicode): Optional table name for reference.
+        name (str): Optional table name for reference.
         data (dict): Initial data, used to hint Bloom Filter.
         RETURNS (Table): The newly created object.
 
@@ -197,7 +197,7 @@ class Table(OrderedDict):
     def __setitem__(self, key, value):
         """Set new key/value pair. String keys will be hashed.
 
-        key (unicode / int): The key to set.
+        key (str / int): The key to set.
         value: The value to set.
         """
         key = get_string_id(key)
@@ -208,7 +208,7 @@ class Table(OrderedDict):
         """Set new key/value pair. String keys will be hashed.
         Same as table[key] = value.
 
-        key (unicode / int): The key to set.
+        key (str / int): The key to set.
         value: The value to set.
         """
         self[key] = value
@@ -216,7 +216,7 @@ class Table(OrderedDict):
     def __getitem__(self, key):
         """Get the value for a given key. String keys will be hashed.
 
-        key (unicode / int): The key to get.
+        key (str / int): The key to get.
         RETURNS: The value.
         """
         key = get_string_id(key)
@@ -225,7 +225,7 @@ class Table(OrderedDict):
     def get(self, key, default=None):
         """Get the value for a given key. String keys will be hashed.
 
-        key (unicode / int): The key to get.
+        key (str / int): The key to get.
         default: The default value to return.
         RETURNS: The value.
         """
@@ -235,7 +235,7 @@ class Table(OrderedDict):
     def __contains__(self, key):
         """Check whether a key is in the table. String keys will be hashed.
 
-        key (unicode / int): The key to check.
+        key (str / int): The key to check.
         RETURNS (bool): Whether the key is in the table.
         """
         key = get_string_id(key)
				
			
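These are docstring-only changes, but for orientation, a minimal sketch of the Lookups API being documented here (table name and data are illustrative):

    from spacy.lookups import Lookups

    lookups = Lookups()
    table = lookups.add_table("lemma_lookup", {"dogs": "dog"})
    assert lookups.has_table("lemma_lookup")
    # String keys are hashed internally, as the Table docstrings describe.
    assert table["dogs"] == "dog"
    lookups.to_disk("/tmp/lookups")  # writes lookups.bin into the directory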
@@ -66,7 +66,7 @@ cdef class DependencyMatcher:
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns

@@ -194,7 +194,7 @@ cdef class DependencyMatcher:
    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.

        key (unicode or int): The key to retrieve.
        key (str / int): The key to retrieve.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)

@@ -64,7 +64,7 @@ cdef class Matcher:
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.
        """
        return self._normalize_key(key) in self._patterns

@@ -98,7 +98,7 @@ cdef class Matcher:
        number of arguments). The on_match callback becomes an optional keyword
        argument.

        key (unicode): The match ID.
        key (str): The match ID.
        patterns (list): The patterns to add for the given key.
        on_match (callable): Optional callback executed on match.
        *_patterns (list): For backwards compatibility: list of patterns to add

@@ -139,7 +139,7 @@ cdef class Matcher:
        """Remove a rule from the matcher. A KeyError is raised if the key does
        not exist.

        key (unicode): The ID of the match rule.
        key (str): The ID of the match rule.
        """
        norm_key = self._normalize_key(key)
        if not norm_key in self._patterns:

@@ -166,7 +166,7 @@ cdef class Matcher:
    def get(self, key, default=None):
        """Retrieve the pattern stored for a key.

        key (unicode or int): The key to retrieve.
        key (str / int): The key to retrieve.
        RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
        """
        key = self._normalize_key(key)

@@ -30,7 +30,7 @@ cdef class PhraseMatcher:
        """Initialize the PhraseMatcher.

        vocab (Vocab): The shared vocabulary.
        attr (int / unicode): Token attribute to match on.
        attr (int / str): Token attribute to match on.
        validate (bool): Perform additional validation when patterns are added.
        RETURNS (PhraseMatcher): The newly constructed object.

@@ -70,7 +70,7 @@ cdef class PhraseMatcher:
    def __contains__(self, key):
        """Check whether the matcher contains rules for a match ID.

        key (unicode): The match ID.
        key (str): The match ID.
        RETURNS (bool): Whether the matcher contains rules for this match ID.

        DOCS: https://spacy.io/api/phrasematcher#contains

@@ -85,7 +85,7 @@ cdef class PhraseMatcher:
        """Remove a rule from the matcher by match ID. A KeyError is raised if
        the key does not exist.

        key (unicode): The match ID.
        key (str): The match ID.

        DOCS: https://spacy.io/api/phrasematcher#remove
        """

@@ -159,7 +159,7 @@ cdef class PhraseMatcher:
        number of arguments). The on_match callback becomes an optional keyword
        argument.

        key (unicode): The match ID.
        key (str): The match ID.
        docs (list): List of `Doc` objects representing match patterns.
        on_match (callable): Callback executed on match.
        *_docs (Doc): For backwards compatibility: list of patterns to add
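The add() docstrings above reflect the updated call signature, where the patterns are passed as a single list and on_match becomes an optional keyword argument. A brief sketch under the new signature (the patterns themselves are illustrative):

    import spacy
    from spacy.matcher import Matcher, PhraseMatcher

    nlp = spacy.blank("en")
    matcher = Matcher(nlp.vocab)
    # New style: key, list of patterns, optional on_match keyword.
    matcher.add("HELLO_WORLD", [[{"LOWER": "hello"}, {"LOWER": "world"}]])

    phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    phrase_matcher.add("CITIES", [nlp("Berlin"), nlp("Paris")], on_match=None)

    doc = nlp("Hello world, greetings from Berlin")
    matches = matcher(doc) + phrase_matcher(doc)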
@@ -15,10 +15,10 @@ def build_tb_parser_model(
    use_upper=True,
    nO=None,
):
    token_vector_width = tok2vec.get_dim("nO")
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    tok2vec = chain(
        tok2vec,
        with_array(Linear(hidden_width, token_vector_width)),
        with_array(Linear(hidden_width, t2v_width)),
        list2array(),
    )
    tok2vec.set_dim("nO", hidden_width)
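The guard above makes the parser head robust to a tok2vec layer whose output width isn't known yet: has_dim("nO") isn't truthy until the dimension is actually set, so an unset width is passed along as None and filled in during initialization. A minimal sketch of the same pattern with plain Thinc layers (the layer choice and sizes are illustrative):

    import numpy
    from thinc.api import Linear, chain

    tok2vec = Linear(nO=16, nI=None)  # output width known, input inferred
    # Only read the width if it is actually set; otherwise pass None.
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    model = chain(tok2vec, Linear(8, t2v_width))
    # Missing dimensions are inferred from the sample batch here.
    model.initialize(X=numpy.zeros((4, 32), dtype="f"))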
@@ -6,9 +6,9 @@ from ...util import registry

@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model:
    token_vector_width = tok2vec.get_dim("nO")
    # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
    output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init)
    t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
    output_layer = Softmax(nO, t2v_width, init_W=zero_init)
    softmax = with_array(output_layer)
    model = chain(tok2vec, softmax)
    model.set_ref("tok2vec", tok2vec)
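The tagger head gets the same treatment: the Softmax is constructed with a possibly-unset input width, and Thinc infers nI when the model is initialized on sample data. A tiny sketch (class count and widths are illustrative):

    import numpy
    from thinc.api import Softmax

    output_layer = Softmax(nO=20, nI=None)  # input width left unset
    output_layer.initialize(X=numpy.zeros((2, 96), dtype="f"))
    assert output_layer.get_dim("nI") == 96  # inferred from the sample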
@@ -38,8 +38,8 @@ def forward(model, X, is_train):


def init(model, X=None, Y=None):
    tok2vec = model.get_ref("tok2vec").initialize()
    lower = model.get_ref("lower").initialize(X=X)
    tok2vec = model.get_ref("tok2vec").initialize(X=X)
    lower = model.get_ref("lower").initialize()
    if model.attrs["has_upper"]:
        statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
        model.get_ref("upper").initialize(X=statevecs)
@@ -198,8 +198,8 @@ cdef class Morphology:
        """Add a special-case rule to the morphological analyser. Tokens whose
        tag and orth match the rule will receive the specified properties.

        tag (unicode): The part-of-speech tag to key the exception.
        orth (unicode): The word-form to key the exception.
        tag (str): The part-of-speech tag to key the exception.
        orth (str): The word-form to key the exception.
        """
        attrs = dict(attrs)
        attrs = _normalize_props(attrs)
@@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
    fulfilled (e.g. if previous components assign the attributes).

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    name (unicode): The name of the pipeline component to analyze.
    name (str): The name of the pipeline component to analyze.
    pipe (callable): The pipeline component function to analyze.
    index (int): The index of the component in the pipeline.
    warn (bool): Show user warning if problem is found.

@@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
    """Get all pipeline components that assign an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    attr (str): The attribute to check.
    RETURNS (list): (name, pipeline) tuples of components that assign the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "assigns")

@@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
    """Get all pipeline components that require an attr, e.g. "doc.tensor".

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    attr (unicode): The attribute to check.
    attr (str): The attribute to check.
    RETURNS (list): (name, pipeline) tuples of components that require the attr.
    """
    return _get_feature_for_attr(pipeline, attr, "requires")

@@ -173,3 +173,22 @@ def print_summary(nlp, pretty=True, no_print=False):
        msg.good("No problems found.")
    if no_print:
        return {"overview": overview, "problems": problems}


def count_pipeline_interdependencies(pipeline):
    """Count how many subsequent components require an annotation set by each
    component in the pipeline.
    """
    pipe_assigns = []
    pipe_requires = []
    for name, pipe in pipeline:
        pipe_assigns.append(set(getattr(pipe, "assigns", [])))
        pipe_requires.append(set(getattr(pipe, "requires", [])))
    counts = []
    for i, assigns in enumerate(pipe_assigns):
        count = 0
        for requires in pipe_requires[i + 1 :]:
            if assigns.intersection(requires):
                count += 1
        counts.append(count)
    return counts
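To make the counting rule concrete, a small worked example with invented component names and attributes: the tagger's output is consumed by one later component, the parser's by one, and the final component's by none, so the counts come out [1, 1, 0]:

    from spacy.pipe_analysis import count_pipeline_interdependencies

    class FakePipe:
        # Minimal stand-in: only `assigns` and `requires` are inspected.
        def __init__(self, assigns=(), requires=()):
            self.assigns = assigns
            self.requires = requires

    pipeline = [
        ("tagger", FakePipe(assigns=("token.tag",))),
        ("parser", FakePipe(requires=("token.tag",), assigns=("token.dep",))),
        ("labeler", FakePipe(requires=("token.dep",))),
    ]
    assert count_pipeline_interdependencies(pipeline) == [1, 1, 0]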
@@ -30,7 +30,7 @@ class EntityRuler(object):

        nlp (Language): The shared nlp object to pass the vocab to the matchers
            and process phrase patterns.
        phrase_matcher_attr (int / unicode): Token attribute to match on, passed
        phrase_matcher_attr (int / str): Token attribute to match on, passed
            to the internal PhraseMatcher as `attr`
        validate (bool): Whether patterns should be validated, passed to
            Matcher and PhraseMatcher as `validate`

@@ -315,7 +315,7 @@ class EntityRuler(object):
        """Load the entity ruler from a file. Expects a file containing
        newline-delimited JSON (JSONL) with one entry per line.

        path (unicode / Path): The JSONL file to load.
        path (str / Path): The JSONL file to load.
        **kwargs: Other config paramters, mostly for consistency.

        RETURNS (EntityRuler): The loaded entity ruler.

@@ -351,7 +351,7 @@ class EntityRuler(object):
        """Save the entity ruler patterns to a directory. The patterns will be
        saved as newline-delimited JSON (JSONL).

        path (unicode / Path): The JSONL file to save.
        path (str / Path): The JSONL file to save.
        **kwargs: Other config paramters, mostly for consistency.

        DOCS: https://spacy.io/api/entityruler#to_disk
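A brief sketch of the JSONL round-trip documented above; both methods accept a str or a Path, and the file name and patterns here are illustrative:

    import spacy
    from spacy.pipeline import EntityRuler

    nlp = spacy.blank("en")
    ruler = EntityRuler(nlp)
    ruler.add_patterns([{"label": "ORG", "pattern": "Explosion"}])
    ruler.to_disk("patterns.jsonl")  # one JSON entry per line
    ruler = EntityRuler(nlp).from_disk("patterns.jsonl")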
@@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
    """Merge subtokens into a single token.

    doc (Doc): The Doc object.
    label (unicode): The subtoken dependency label.
    label (str): The subtoken dependency label.
    RETURNS (Doc): The Doc object with merged subtokens.

    DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
@@ -531,7 +531,16 @@ class Tagger(Pipe):
                                          vocab.morphology.lemmatizer,
                                          exc=vocab.morphology.exc)
        self.set_output(len(self.labels))
        self.model.initialize()
        doc_sample = [Doc(self.vocab, words=["hello", "world"])]
        if pipeline is not None:
            for name, component in pipeline:
                if component is self:
                    break
                if hasattr(component, "pipe"):
                    doc_sample = list(component.pipe(doc_sample))
                else:
                    doc_sample = [component(doc) for doc in doc_sample]
        self.model.initialize(X=doc_sample)
        # Get batch of example docs, example outputs to call begin_training().
        # This lets the model infer shapes.
        link_vectors_to_models(self.vocab)
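The new initialization builds a tiny Doc sample and pipes it through every component that runs before the tagger, so shape inference sees inputs in the same state they will have at training time. The same walk-the-pipeline idiom as a standalone sketch (assuming pipeline is a list of (name, component) pairs, as in nlp.pipeline):

    def preprocess_sample(pipeline, target, doc_sample):
        """Run doc_sample through every component that precedes `target`."""
        for name, component in pipeline:
            if component is target:
                break  # only components *before* the target should apply
            if hasattr(component, "pipe"):
                doc_sample = list(component.pipe(doc_sample))
            else:
                doc_sample = [component(doc) for doc in doc_sample]
        return doc_sample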
@@ -109,7 +109,7 @@ cdef class StringStore:
        """Retrieve a string from a given hash, or vice versa.

        string_or_id (bytes, unicode or uint64): The value to encode.
        Returns (unicode or uint64): The value to be retrieved.
        Returns (str / uint64): The value to be retrieved.
        """
        if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
            return 0

@@ -152,7 +152,7 @@ cdef class StringStore:
    def add(self, string):
        """Add a string to the StringStore.

        string (unicode): The string to add.
        string (str): The string to add.
        RETURNS (uint64): The string's hash value.
        """
        if isinstance(string, unicode):

@@ -179,7 +179,7 @@ cdef class StringStore:
    def __contains__(self, string not None):
        """Check whether a string is in the store.

        string (unicode): The string to check.
        string (str): The string to check.
        RETURNS (bool): Whether the store contains the string.
        """
        cdef hash_t key

@@ -205,7 +205,7 @@ cdef class StringStore:
    def __iter__(self):
        """Iterate over the strings in the store, in order.

        YIELDS (unicode): A string in the store.
        YIELDS (str): A string in the store.
        """
        cdef int i
        cdef hash_t key

@@ -223,7 +223,7 @@ cdef class StringStore:
    def to_disk(self, path):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
        path (str / Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or Path-like objects.
        """
        path = util.ensure_path(path)

@@ -234,7 +234,7 @@ cdef class StringStore:
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory. Paths may be either
        path (str / Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        RETURNS (StringStore): The modified `StringStore` object.
        """
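A quick sketch of the hash round-trip these docstrings describe:

    from spacy.strings import StringStore

    strings = StringStore()
    key = strings.add("coffee")       # uint64 hash of the string
    assert "coffee" in strings
    assert strings[key] == "coffee"   # hash -> string
    assert strings["coffee"] == key   # string -> hash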
@@ -624,12 +624,25 @@ cdef class Parser:
            sgd = self.create_optimizer()
        doc_sample = []
        gold_sample = []
        for example in islice(get_examples(), 1000):
        for example in islice(get_examples(), 10):
            parses = example.get_gold_parses(merge=False, vocab=self.vocab)
            for doc, gold in parses:
                if len(doc):
                    doc_sample.append(doc)
                    gold_sample.append(gold)
        self.model.initialize(doc_sample, gold_sample)

        if pipeline is not None:
            for name, component in pipeline:
                if component is self:
                    break
                if hasattr(component, "pipe"):
                    doc_sample = list(component.pipe(doc_sample))
                else:
                    doc_sample = [component(doc) for doc in doc_sample]
        if doc_sample:
            self.model.initialize(doc_sample)
        else:
            self.model.initialize()
        if pipeline is not None:
            self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
        link_vectors_to_models(self.vocab)
@@ -9,7 +9,6 @@ def test_build_dependencies():
        "pytest-timeout",
        "mock",
        "flake8",
        "jsonschema",
    ]
    libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
@@ -1,7 +1,8 @@
import spacy.language
from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
from spacy.pipe_analysis import print_summary, validate_attrs
from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
from spacy.pipe_analysis import count_pipeline_interdependencies
from mock import Mock, ANY
import pytest

@@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe():
    with pytest.warns(None) as record:
        nlp.remove_pipe("c2")
    assert not record.list


def test_pipe_interdependencies():
    class Fancifier:
        name = "fancifier"
        assigns = ("doc._.fancy",)
        requires = tuple()

    class FancyNeeder:
        name = "needer"
        assigns = tuple()
        requires = ("doc._.fancy",)

    pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
    counts = count_pipeline_interdependencies(pipeline)
    assert counts == [1, 0]
@@ -2,9 +2,11 @@ import pytest
import os
import ctypes
from pathlib import Path
from spacy.about import __version__ as spacy_version
from spacy import util
from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding


@pytest.fixture

@@ -24,10 +26,12 @@ def test_util_ensure_path_succeeds(text):
    assert isinstance(path, Path)


@pytest.mark.parametrize("package", ["numpy"])
def test_util_is_package(package):
@pytest.mark.parametrize(
    "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
)
def test_util_is_package(package, result):
    """Test that an installed package via pip is recognised by util.is_package."""
    assert util.is_package(package)
    assert util.is_package(package) is result


@pytest.mark.parametrize("package", ["thinc"])

@@ -87,3 +91,21 @@ def test_ascii_filenames():
    root = Path(__file__).parent.parent
    for path in root.glob("**/*"):
        assert all(ord(c) < 128 for c in path.name), path.name


@pytest.mark.parametrize(
    "version,constraint,compatible",
    [
        (spacy_version, spacy_version, True),
        (spacy_version, f">={spacy_version}", True),
        ("3.0.0", "2.0.0", False),
        ("3.2.1", ">=2.0.0", True),
        ("2.2.10a1", ">=1.0.0,<2.1.1", False),
        ("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
        ("n/a", ">=1.2.3,<4.5.6", None),
        ("1.2.3", "n/a", None),
        ("n/a", "n/a", None),
    ],
)
def test_is_compatible_version(version, constraint, compatible):
    assert util.is_compatible_version(version, constraint) is compatible
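The new test exercises util.is_compatible_version. Judging from the packaging imports added to spacy/util.py further down, the function normalizes a bare version like "2.0.0" into an exact specifier and returns None when either side doesn't parse. A hedged sketch of that logic, not necessarily the exact implementation:

    from packaging.specifiers import SpecifierSet, InvalidSpecifier
    from packaging.version import Version, InvalidVersion

    def is_compatible_version(version, constraint, prereleases=True):
        # A bare version like "2.0.0" is treated as the exact pin "==2.0.0".
        if constraint[0].isdigit():
            constraint = f"=={constraint}"
        try:
            spec = SpecifierSet(constraint)
            ver = Version(version)
        except (InvalidSpecifier, InvalidVersion):
            return None  # the ("n/a", ...) -> None rows in the test
        spec.prereleases = prereleases
        return ver in spec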
59 spacy/tests/test_util.py Normal file

@@ -0,0 +1,59 @@
import pytest
from spacy.gold import Example

from .util import get_random_doc

from spacy.util import minibatch_by_words


@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 400, 199], [3]),
        ([400, 400, 199, 3], [4]),
        ([400, 400, 199, 3, 200], [3, 2]),
        ([400, 400, 199, 3, 1], [5]),
        ([400, 400, 199, 3, 1, 1500], [5]),    # 1500 will be discarded
        ([400, 400, 199, 3, 1, 200], [3, 3]),
        ([400, 400, 199, 3, 1, 999], [3, 3]),
        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
        ([1, 2, 999], [3]),
        ([1, 2, 999, 1], [4]),
        ([1, 200, 999, 1], [2, 2]),
        ([1, 999, 200, 1], [2, 2]),
    ],
)
def test_util_minibatch(doc_sizes, expected_batches):
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    examples = [Example(doc=doc) for doc in docs]
    tol = 0.2
    batch_size = 1000
    batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
    assert [len(batch) for batch in batches] == expected_batches

    max_size = batch_size + batch_size * tol
    for batch in batches:
        assert sum([len(example.doc) for example in batch]) < max_size


@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 4000, 199], [1, 2]),
        ([400, 400, 199, 3000, 200], [1, 4]),
        ([400, 400, 199, 3, 1, 1500], [1, 5]),
        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
        ([1, 2, 9999], [1, 2]),
        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
    ],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
    """ Test that oversized documents are returned in their own batch"""
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    examples = [Example(doc=doc) for doc in docs]
    tol = 0.2
    batch_size = 1000
    batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
    assert [len(batch) for batch in batches] == expected_batches
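The tolerance arithmetic behind these expectations: with size=1000 and tolerance=0.2, a batch may hold at most 1000 + 1000 * 0.2 = 1200 words, so [400, 400, 199, 3] fits in a single batch (1002 words) while a further 200-word doc would push it to 1202 and overflow, starting a new batch. Exactly which small docs land in which batch is up to the implementation's internal buffering:

    size, tolerance = 1000, 0.2
    max_batch_words = size + size * tolerance              # 1200
    assert sum([400, 400, 199, 3]) <= max_batch_words      # one batch of 4
    assert sum([400, 400, 199, 3, 200]) > max_batch_words  # 200 overflows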
@@ -92,6 +92,13 @@ def get_batch(batch_size):
    return docs


def get_random_doc(n_words):
    vocab = Vocab()
    # Make the words numbers, so that they're easy to track.
    numbers = [str(i) for i in range(0, n_words)]
    return Doc(vocab, words=numbers)


def apply_transition_sequence(parser, doc, sequence):
    """Perform a series of pre-specified transitions, to put the parser in a
    desired state."""
@@ -134,7 +134,7 @@ cdef class Tokenizer:
    def __call__(self, unicode string):
        """Tokenize a string.

        string (unicode): The string to tokenize.
        string (str): The string to tokenize.
        RETURNS (Doc): A container for linguistic annotations.

        DOCS: https://spacy.io/api/tokenizer#call

@@ -147,7 +147,7 @@ cdef class Tokenizer:
    cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
        """Tokenize according to affix and token_match settings.

        string (unicode): The string to tokenize.
        string (str): The string to tokenize.
        RETURNS (Doc): A container for linguistic annotations.
        """
        if len(string) >= (2 ** 30):

@@ -527,7 +527,7 @@ cdef class Tokenizer:
    def find_infix(self, unicode string):
        """Find internal split points of the string, such as hyphens.

        string (unicode): The string to segment.
        string (str): The string to segment.
        RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
            and `.end()` methods, denoting the placement of internal segment
            separators, e.g. hyphens.

@@ -542,7 +542,7 @@ cdef class Tokenizer:
        """Find the length of a prefix that should be segmented from the
        string, or None if no prefix rules match.

        string (unicode): The string to segment.
        string (str): The string to segment.
        RETURNS (int): The length of the prefix if present, otherwise `None`.

        DOCS: https://spacy.io/api/tokenizer#find_prefix

@@ -556,7 +556,7 @@ cdef class Tokenizer:
        """Find the length of a suffix that should be segmented from the
        string, or None if no suffix rules match.

        string (unicode): The string to segment.
        string (str): The string to segment.
        Returns (int): The length of the suffix if present, otherwise `None`.

        DOCS: https://spacy.io/api/tokenizer#find_suffix

@@ -576,7 +576,7 @@ cdef class Tokenizer:
    def _validate_special_case(self, chunk, substrings):
        """Check whether the `ORTH` fields match the string.

        string (unicode): The string to specially tokenize.
        string (str): The string to specially tokenize.
        substrings (iterable): A sequence of dicts, where each dict describes
            a token and its attributes.
        """

@@ -588,7 +588,7 @@ cdef class Tokenizer:
    def add_special_case(self, unicode string, substrings):
        """Add a special-case tokenization rule.

        string (unicode): The string to specially tokenize.
        string (str): The string to specially tokenize.
        substrings (iterable): A sequence of dicts, where each dict describes
            a token and its attributes. The `ORTH` fields of the attributes
            must exactly match the string when they are concatenated.

@@ -629,7 +629,7 @@ cdef class Tokenizer:
        produced are identical to `nlp.tokenizer()` except for whitespace
        tokens.

        string (unicode): The string to tokenize.
        string (str): The string to tokenize.
        RETURNS (list): A list of (pattern_string, token_string) tuples

        DOCS: https://spacy.io/api/tokenizer#explain

@@ -693,7 +693,7 @@ cdef class Tokenizer:
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
        path (str / Path): A path to a directory, which will be created if
            it doesn't exist.
        exclude (list): String names of serialization fields to exclude.

@@ -707,7 +707,7 @@ cdef class Tokenizer:
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory.
        path (str / Path): A path to a directory.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Tokenizer): The modified `Tokenizer` object.
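A short sketch of the special-case rule documented above; the ORTH pieces must concatenate back exactly to the original string:

    import spacy
    from spacy.attrs import ORTH

    nlp = spacy.blank("en")
    nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
    assert [t.text for t in nlp("gimme that")] == ["gim", "me", "that"]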
@@ -117,7 +117,7 @@ cdef class Doc:
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Doc._`.

        name (unicode): Name of the attribute to set.
        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.

@@ -135,7 +135,7 @@ cdef class Doc:
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/doc#get_extension

@@ -146,7 +146,7 @@ cdef class Doc:
    def has_extension(cls, name):
        """Check whether an extension has been registered.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/doc#has_extension

@@ -157,7 +157,7 @@ cdef class Doc:
    def remove_extension(cls, name):
        """Remove a previously registered extension.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

@@ -483,7 +483,7 @@ cdef class Doc:
    def text(self):
        """A unicode representation of the document text.

        RETURNS (unicode): The original verbatim text of the document.
        RETURNS (str): The original verbatim text of the document.
        """
        return "".join(t.text_with_ws for t in self)

@@ -492,7 +492,7 @@ cdef class Doc:
        """An alias of `Doc.text`, provided for duck-type compatibility with
        `Span` and `Token`.

        RETURNS (unicode): The original verbatim text of the document.
        RETURNS (str): The original verbatim text of the document.
        """
        return self.text

@@ -637,7 +637,7 @@ cdef class Doc:

    @property
    def lang_(self):
        """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
        """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
        return self.vocab.lang

    cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:

@@ -852,7 +852,7 @@ cdef class Doc:
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

        path (unicode or Path): A path to a directory, which will be created if
        path (str / Path): A path to a directory, which will be created if
            it doesn't exist. Paths may be either strings or Path-like objects.
        exclude (list): String names of serialization fields to exclude.

@@ -866,7 +866,7 @@ cdef class Doc:
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode or Path): A path to a directory. Paths may be either
        path (str / Path): A path to a directory. Paths may be either
            strings or `Path`-like objects.
        exclude (list): String names of serialization fields to exclude.
        RETURNS (Doc): The modified `Doc` object.
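The same extension API is documented identically on Doc here and on Span and Token below. A minimal sketch of registering and using a custom attribute:

    import spacy
    from spacy.tokens import Doc

    Doc.set_extension("is_greeting", default=False)
    nlp = spacy.blank("en")
    doc = nlp("hello world")
    doc._.is_greeting = True
    assert Doc.has_extension("is_greeting")
    assert doc._.is_greeting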
@@ -33,7 +33,7 @@ cdef class Span:
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Span._`.

        name (unicode): Name of the attribute to set.
        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.

@@ -51,7 +51,7 @@ cdef class Span:
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/span#get_extension

@@ -62,7 +62,7 @@ cdef class Span:
    def has_extension(cls, name):
        """Check whether an extension has been registered.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/span#has_extension

@@ -73,7 +73,7 @@ cdef class Span:
    def remove_extension(cls, name):
        """Remove a previously registered extension.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

@@ -491,7 +491,7 @@ cdef class Span:

    @property
    def text(self):
        """RETURNS (unicode): The original verbatim text of the span."""
        """RETURNS (str): The original verbatim text of the span."""
        text = self.text_with_ws
        if self[-1].whitespace_:
            text = text[:-1]

@@ -502,7 +502,7 @@ cdef class Span:
        """The text content of the span with a trailing whitespace character if
        the last token has one.

        RETURNS (unicode): The text content of the span (with trailing
        RETURNS (str): The text content of the span (with trailing
            whitespace).
        """
        return "".join([t.text_with_ws for t in self])

@@ -678,7 +678,7 @@ cdef class Span:
            raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))

    property ent_id_:
        """RETURNS (unicode): The (string) entity ID."""
        """RETURNS (str): The (string) entity ID."""
        def __get__(self):
            return self.root.ent_id_

@@ -690,12 +690,12 @@ cdef class Span:
        """Verbatim text content (identical to `Span.text`). Exists mostly for
        consistency with other attributes.

        RETURNS (unicode): The span's text."""
        RETURNS (str): The span's text."""
        return self.text

    @property
    def lemma_(self):
        """RETURNS (unicode): The span's lemma."""
        """RETURNS (str): The span's lemma."""
        return " ".join([t.lemma_ for t in self]).strip()

    @property

@@ -714,7 +714,7 @@ cdef class Span:
        return "".join([t.text_with_ws for t in self])

    property label_:
        """RETURNS (unicode): The span's label."""
        """RETURNS (str): The span's label."""
        def __get__(self):
            return self.doc.vocab.strings[self.label]

@@ -724,7 +724,7 @@ cdef class Span:
            raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))

    property kb_id_:
        """RETURNS (unicode): The named entity's KB ID."""
        """RETURNS (str): The named entity's KB ID."""
        def __get__(self):
            return self.doc.vocab.strings[self.kb_id]
@@ -36,7 +36,7 @@ cdef class Token:
    def set_extension(cls, name, **kwargs):
        """Define a custom attribute which becomes available as `Token._`.

        name (unicode): Name of the attribute to set.
        name (str): Name of the attribute to set.
        default: Optional default value of the attribute.
        getter (callable): Optional getter function.
        setter (callable): Optional setter function.

@@ -54,7 +54,7 @@ cdef class Token:
    def get_extension(cls, name):
        """Look up a previously registered extension by name.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple.

        DOCS: https://spacy.io/api/token#get_extension

@@ -65,7 +65,7 @@ cdef class Token:
    def has_extension(cls, name):
        """Check whether an extension has been registered.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (bool): Whether the extension has been registered.

        DOCS: https://spacy.io/api/token#has_extension

@@ -76,7 +76,7 @@ cdef class Token:
    def remove_extension(cls, name):
        """Remove a previously registered extension.

        name (unicode): Name of the extension.
        name (str): Name of the extension.
        RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
            removed extension.

@@ -244,12 +244,12 @@ cdef class Token:

    @property
    def text(self):
        """RETURNS (unicode): The original verbatim text of the token."""
        """RETURNS (str): The original verbatim text of the token."""
        return self.orth_

    @property
    def text_with_ws(self):
        """RETURNS (unicode): The text content of the span (with trailing
        """RETURNS (str): The text content of the span (with trailing
            whitespace).
        """
        cdef unicode orth = self.vocab.strings[self.c.lex.orth]

@@ -762,7 +762,7 @@ cdef class Token:
            self.c.ent_type = ent_type

    property ent_type_:
        """RETURNS (unicode): Named entity type."""
        """RETURNS (str): Named entity type."""
        def __get__(self):
            return self.vocab.strings[self.c.ent_type]

@@ -785,7 +785,7 @@ cdef class Token:
        and "" means no entity tag is set. "B" with an empty ent_type
        means that the token is blocked from further processing by NER.

        RETURNS (unicode): IOB code of named entity tag.
        RETURNS (str): IOB code of named entity tag.
        """
        iob_strings = ("", "I", "O", "B")
        return iob_strings[self.c.ent_iob]

@@ -801,7 +801,7 @@ cdef class Token:
            self.c.ent_id = key

    property ent_id_:
        """RETURNS (unicode): ID of the entity the token is an instance of,
        """RETURNS (str): ID of the entity the token is an instance of,
            if any.
        """
        def __get__(self):

@@ -819,7 +819,7 @@ cdef class Token:
            self.c.ent_kb_id = ent_kb_id

    property ent_kb_id_:
        """RETURNS (unicode): Named entity KB ID."""
        """RETURNS (str): Named entity KB ID."""
        def __get__(self):
            return self.vocab.strings[self.c.ent_kb_id]

@@ -828,12 +828,12 @@ cdef class Token:

    @property
    def whitespace_(self):
        """RETURNS (unicode): The trailing whitespace character, if present."""
        """RETURNS (str): The trailing whitespace character, if present."""
        return " " if self.c.spacy else ""

    @property
    def orth_(self):
        """RETURNS (unicode): Verbatim text content (identical to
        """RETURNS (str): Verbatim text content (identical to
            `Token.text`). Exists mostly for consistency with the other
            attributes.
        """

@@ -841,13 +841,13 @@ cdef class Token:

    @property
    def lower_(self):
        """RETURNS (unicode): The lowercase token text. Equivalent to
        """RETURNS (str): The lowercase token text. Equivalent to
            `Token.text.lower()`.
        """
        return self.vocab.strings[self.c.lex.lower]

    property norm_:
        """RETURNS (unicode): The token's norm, i.e. a normalised form of the
        """RETURNS (str): The token's norm, i.e. a normalised form of the
            token text. Usually set in the language's tokenizer exceptions or
            norm exceptions.
        """

@@ -859,34 +859,34 @@ cdef class Token:

    @property
    def shape_(self):
        """RETURNS (unicode): Transform of the tokens's string, to show
        """RETURNS (str): Transform of the tokens's string, to show
            orthographic features. For example, "Xxxx" or "dd".
        """
        return self.vocab.strings[self.c.lex.shape]

    @property
    def prefix_(self):
        """RETURNS (unicode): A length-N substring from the start of the token.
        """RETURNS (str): A length-N substring from the start of the token.
            Defaults to `N=1`.
        """
        return self.vocab.strings[self.c.lex.prefix]

    @property
    def suffix_(self):
        """RETURNS (unicode): A length-N substring from the end of the token.
        """RETURNS (str): A length-N substring from the end of the token.
            Defaults to `N=3`.
        """
        return self.vocab.strings[self.c.lex.suffix]

    @property
    def lang_(self):
        """RETURNS (unicode): Language of the parent document's vocabulary,
        """RETURNS (str): Language of the parent document's vocabulary,
            e.g. 'en'.
        """
        return self.vocab.strings[self.c.lex.lang]

    property lemma_:
        """RETURNS (unicode): The token lemma, i.e. the base form of the word,
        """RETURNS (str): The token lemma, i.e. the base form of the word,
            with no inflectional suffixes.
        """
        def __get__(self):

@@ -899,7 +899,7 @@ cdef class Token:
            self.c.lemma = self.vocab.strings.add(lemma_)

    property pos_:
        """RETURNS (unicode): Coarse-grained part-of-speech tag."""
        """RETURNS (str): Coarse-grained part-of-speech tag."""
        def __get__(self):
            return parts_of_speech.NAMES[self.c.pos]

@@ -907,7 +907,7 @@ cdef class Token:
            self.c.pos = parts_of_speech.IDS[pos_name]

    property tag_:
        """RETURNS (unicode): Fine-grained part-of-speech tag."""
        """RETURNS (str): Fine-grained part-of-speech tag."""
        def __get__(self):
            return self.vocab.strings[self.c.tag]

@@ -915,7 +915,7 @@ cdef class Token:
            self.tag = self.vocab.strings.add(tag)

    property dep_:
        """RETURNS (unicode): The syntactic dependency label."""
        """RETURNS (str): The syntactic dependency label."""
        def __get__(self):
            return self.vocab.strings[self.c.dep]

211 spacy/util.py
			@ -15,6 +15,8 @@ import srsly
import catalogue
import sys
import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion


try:

@@ -22,9 +24,16 @@ try:
except ImportError:
    cupy = None

try:  # Python 3.8
    import importlib.metadata as importlib_metadata
except ImportError:
    import importlib_metadata

from .symbols import ORTH
from .compat import cupy, CudaStream
from .errors import Errors, Warnings
from . import about


_PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max

@@ -37,6 +46,10 @@ class registry(thinc.registry):
    factories = catalogue.create("spacy", "factories", entry_points=True)
    displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
    assets = catalogue.create("spacy", "assets", entry_points=True)
    # This is mostly used to get a list of all installed models in the current
    # environment. spaCy models packaged with `spacy package` will "advertise"
    # themselves via entry points.
    models = catalogue.create("spacy", "models", entry_points=True)
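As a rough illustration of how packages feed this registry, a model package's
setup.py might declare itself under the `spacy_models` entry-point group (the
group name is an assumption based on how `catalogue` joins its namespace; the
package name below is hypothetical):

```python
# Hypothetical setup.py for a model package; "spacy_models" and the
# package name are assumptions, not taken from this commit.
from setuptools import setup

setup(
    name="en_example_model",
    version="0.0.1",
    packages=["en_example_model"],
    entry_points={"spacy_models": ["en_example_model = en_example_model"]},
)
```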


def set_env_log(value):

@@ -49,7 +62,7 @@ def lang_class_is_loaded(lang):
    loaded lazily, to avoid expensive setup code associated with the language
    data.

    lang (unicode): Two-letter language code, e.g. 'en'.
    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (bool): Whether a Language class has been loaded.
    """
    return lang in registry.languages

@@ -58,7 +71,7 @@ def lang_class_is_loaded(lang):
def get_lang_class(lang):
    """Import and load a Language class.

    lang (unicode): Two-letter language code, e.g. 'en'.
    lang (str): Two-letter language code, e.g. 'en'.
    RETURNS (Language): Language class.
    """
    # Check if language is registered / entry point is available

@@ -76,7 +89,7 @@ def get_lang_class(lang):
def set_lang_class(name, cls):
    """Set a custom Language class name that can be loaded via get_lang_class.

    name (unicode): Name of Language class.
    name (str): Name of Language class.
    cls (Language): Language class.
    """
    registry.languages.register(name, func=cls)

@@ -98,7 +111,7 @@ def load_language_data(path):
    """Load JSON language data using the given path as a base. If the provided
    path isn't present, will attempt to load a gzipped version before giving up.

    path (unicode / Path): The data to load.
    path (str / Path): The data to load.
    RETURNS: The loaded data.
    """
    path = ensure_path(path)

@@ -119,7 +132,7 @@ def get_module_path(module):
def load_model(name, **overrides):
    """Load a model from a package or data path.

    name (unicode): Package name or model path.
    name (str): Package name or model path.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with the loaded model.
    """

@@ -193,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
    """Helper function to use in the `load()` method of a model package's
    __init__.py.

    init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
    init_file (str): Path to model's __init__.py, i.e. `__file__`.
    **overrides: Specific overrides, like pipeline components to disable.
    RETURNS (Language): `Language` class with loaded model.
    """

@@ -206,11 +219,74 @@ def load_model_from_init_py(init_file, **overrides):
    return load_model_from_path(data_path, meta, **overrides)


def get_installed_models():
    """List all model packages currently installed in the environment.

    RETURNS (list): The string names of the models.
    """
    return list(registry.models.get_all().keys())
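For context, a quick sketch of what this helper returns; the output depends
entirely on the environment, so the names in the comment are examples only:

```python
# Lists whatever model packages advertise themselves via entry points.
print(get_installed_models())  # e.g. ["en_core_web_sm", "de_core_news_sm"]
```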


def get_package_version(name):
    """Get the version of an installed package. Typically used to get model
    package versions.

    name (str): The name of the installed Python package.
    RETURNS (str / None): The version or None if package not installed.
    """
    try:
        return importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        return None
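A usage sketch for the helper above; the package name and version in the
comments are illustrative and depend on what is installed:

```python
print(get_package_version("en_core_web_sm"))      # e.g. "2.2.5"
print(get_package_version("not_a_real_package"))  # None
```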


def is_compatible_version(version, constraint, prereleases=True):
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint
    if constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return version in spec
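A few calls that follow directly from the logic above (an exact version is
treated as `==`, invalid input returns `None`):

```python
print(is_compatible_version("2.0.0", ">=1.9.0,<2.2.1"))  # True
print(is_compatible_version("2.0.0", "2.0.0"))           # True (treated as ==2.0.0)
print(is_compatible_version("2.3.0", ">=1.9.0,<2.2.1"))  # False
print(is_compatible_version("not-a-version", ">=1.0"))   # None
```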


def get_model_version_range(spacy_version):
    """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
    version. Models are always compatible across patch versions but not
    across minor or major versions.
    """
    release = Version(spacy_version).release
    return f">={spacy_version},<{release[0]}.{release[1] + 1}.0"


def get_base_version(version):
    """Generate the base version without any prerelease identifiers.

    version (str): The version, e.g. "3.0.0.dev1".
    RETURNS (str): The base version, e.g. "3.0.0".
    """
    return Version(version).base_version
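For example:

```python
print(get_base_version("3.0.0.dev1"))  # "3.0.0"
print(get_base_version("2.2.4"))       # "2.2.4"
```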


def load_config(path, create_objects=False):
    """Load a Thinc-formatted config file, optionally filling in objects where
    the config references registry entries. See "Thinc config files" for details.

    path (unicode or Path): Path to the config file
    path (str / Path): Path to the config file
    create_objects (bool): Whether to automatically create objects when the config
        references registry entries. Defaults to False.
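A sketch of calling the loader above, assuming a Thinc-style config with a
`[training]` section; the file name and keys are hypothetical:

```python
# Returns a dict-like config; with create_objects=True, registry
# references would be resolved into objects instead.
config = load_config("defaults.cfg", create_objects=False)
print(config["training"]["dropout"])  # e.g. 0.1
```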

@@ -227,7 +303,7 @@ def load_config_from_str(string, create_objects=False):
    """Load a Thinc-formatted config, optionally filling in objects where
    the config references registry entries. See "Thinc config files" for details.

    string (unicode or Path): Text contents of the config file.
    string (str / Path): Text contents of the config file.
    create_objects (bool): Whether to automatically create objects when the config
        references registry entries. Defaults to False.

@@ -243,7 +319,7 @@ def load_config_from_str(string, create_objects=False):
def get_model_meta(path):
    """Get model meta.json from a directory path and validate its contents.

    path (unicode or Path): Path to model directory.
    path (str / Path): Path to model directory.
    RETURNS (dict): The model's meta data.
    """
    model_path = ensure_path(path)

@@ -256,13 +332,23 @@ def get_model_meta(path):
    for setting in ["lang", "name", "version"]:
        if setting not in meta or not meta[setting]:
            raise ValueError(Errors.E054.format(setting=setting))
    if "spacy_version" in meta:
        if not is_compatible_version(about.__version__, meta["spacy_version"]):
            warnings.warn(
                Warnings.W095.format(
                    model=f"{meta['lang']}_{meta['name']}",
                    model_version=meta["version"],
                    version=meta["spacy_version"],
                    current=about.__version__,
                )
            )
    return meta


def get_model_config(path):
    """Get the model's config from a directory path.

    path (unicode or Path): Path to model directory.
    path (str / Path): Path to model directory.
    RETURNS (Config): The model's config data.
    """
    model_path = ensure_path(path)

@@ -279,23 +365,20 @@ def get_model_config(path):
def is_package(name):
    """Check if string maps to a package installed via pip.

    name (unicode): Name of package.
    name (str): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    import pkg_resources

    name = name.lower()  # compare package name against lowercase name
    packages = pkg_resources.working_set.by_key.keys()
    for package in packages:
        if package.lower().replace("-", "_") == name:
    try:
        importlib_metadata.distribution(name)
        return True
    except:  # noqa: E722
        return False
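Usage follows the new importlib.metadata-based check above; the package names
are illustrative:

```python
print(is_package("numpy"))                     # True in most environments
print(is_package("definitely-not-installed"))  # False
```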


def get_package_path(name):
    """Get the path to an installed package.

    name (unicode): Package name.
    name (str): Package name.
    RETURNS (Path): Path to installed package.
    """
    name = name.lower()  # use lowercase version to be safe

@@ -470,8 +553,8 @@ def expand_exc(excs, search, replace):
    For example, to add additional versions with typographic apostrophes.

    excs (dict): Tokenizer exceptions.
    search (unicode): String to find and replace.
    replace (unicode): Replacement.
    search (str): String to find and replace.
    replace (str): Replacement.
    RETURNS (dict): Combined tokenizer exceptions.
    """

@@ -575,41 +658,73 @@ def decaying(start, stop, decay):
        curr -= decay


def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
    """Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
    themselves."""
    themselves, or be discarded if discard_oversize=True."""
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    elif isinstance(size, List):
        size_ = iter(size)
    else:
        size_ = size
    examples = iter(examples)
    oversize = []
    while True:
        batch_size = next(size_)
        tol_size = batch_size * 0.2

    target_size = next(size_)
    tol_size = target_size * tolerance
    batch = []
        if oversize:
            example = oversize.pop(0)
    overflow = []
    batch_size = 0
    overflow_size = 0

    for example in examples:
        n_words = count_words(example.doc)
        # if the current example exceeds the maximum batch size, it is returned separately
        # but only if discard_oversize=False.
        if n_words > target_size + tol_size:
            if not discard_oversize:
                yield [example]

        # add the example to the current batch if there's no overflow yet and it still fits
        elif overflow_size == 0 and (batch_size + n_words) <= target_size:
            batch.append(example)
            batch_size -= n_words
        while batch_size >= 1:
            try:
                example = next(examples)
            except StopIteration:
                if batch:
                    yield batch
                return
            n_words = count_words(example.doc)
            if n_words < (batch_size + tol_size):
                batch_size -= n_words
                batch.append(example)
            batch_size += n_words

        # add the example to the overflow buffer if it fits in the tolerance margin
        elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
            overflow.append(example)
            overflow_size += n_words

        # yield the previous batch and start a new one. The new one gets the overflow examples.
        else:
                oversize.append(example)
            yield batch
            target_size = next(size_)
            tol_size = target_size * tolerance
            batch = overflow
            batch_size = overflow_size
            overflow = []
            overflow_size = 0

            # this example still fits
            if (batch_size + n_words) <= target_size:
                batch.append(example)
                batch_size += n_words

            # this example fits in overflow
            elif (batch_size + n_words) <= (target_size + tol_size):
                overflow.append(example)
                overflow_size += n_words

            # this example does not fit with the previous overflow: start another new batch
            else:
                yield batch
                target_size = next(size_)
                tol_size = target_size * tolerance
                batch = [example]
                batch_size = n_words

    # yield the final batch
    if batch:
        batch.extend(overflow)
        yield batch
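A minimal sketch of driving the rewritten batcher, assuming Example-like
objects that expose a `.doc`; the stand-in class and sizes are made up:

```python
from collections import namedtuple

FakeExample = namedtuple("FakeExample", ["doc"])
examples = [FakeExample(doc=[0] * n) for n in (5, 7, 3, 25, 2)]
# With size=10 and tolerance=0.2, the 25-word example exceeds 10 + 2 and is
# yielded on its own (or dropped entirely with discard_oversize=True).
for batch in minibatch_by_words(examples, size=10, tolerance=0.2):
    print([len(ex.doc) for ex in batch])
```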

@@ -705,8 +820,8 @@ def from_disk(path, readers, exclude):
def import_file(name, loc):
    """Import module from a file. Used to load models from a directory.

    name (unicode): Name of module to load.
    loc (unicode / Path): Path to the file.
    name (str): Name of module to load.
    loc (str / Path): Path to the file.
    RETURNS: The loaded module.
    """
    loc = str(loc)

@@ -721,8 +836,8 @@ def minify_html(html):
    Disclaimer: NOT a general-purpose solution, only removes indentation and
    newlines.

    html (unicode): Markup to minify.
    RETURNS (unicode): "Minified" HTML.
    html (str): Markup to minify.
    RETURNS (str): "Minified" HTML.
    """
    return html.strip().replace("    ", "").replace("\n", "")

@@ -731,8 +846,8 @@ def escape_html(text):
    """Replace <, >, &, " with their HTML encoded representation. Intended to
    prevent HTML errors in rendered displaCy markup.

    text (unicode): The original text.
    RETURNS (unicode): Equivalent text to be safely used within HTML.
    text (str): The original text.
    RETURNS (str): Equivalent text to be safely used within HTML.
    """
    text = text.replace("&", "&amp;")
    text = text.replace("<", "&lt;")

@@ -57,7 +57,7 @@ cdef class Vectors:
        shape (tuple): Size of the table, as (# entries, # columns)
        data (numpy.ndarray): The vector data.
        keys (iterable): A sequence of keys, aligned with the data.
        name (unicode): A name to identify the vectors table.
        name (str): A name to identify the vectors table.
        RETURNS (Vectors): The newly created object.

        DOCS: https://spacy.io/api/vectors#init

@@ -244,7 +244,7 @@ cdef class Vectors:
    def find(self, *, key=None, keys=None, row=None, rows=None):
        """Look up one or more keys by row, or vice versa.

        key (unicode / int): Find the row that the given key points to.
        key (str / int): Find the row that the given key points to.
            Returns int, -1 if missing.
        keys (iterable): Find rows that the keys point to.
            Returns ndarray.

@@ -366,7 +366,7 @@ cdef class Vectors:
    def to_disk(self, path, **kwargs):
        """Save the current state to a directory.

        path (unicode / Path): A path to a directory, which will be created if
        path (str / Path): A path to a directory, which will be created if
            it doesn't exist.

        DOCS: https://spacy.io/api/vectors#to_disk

@@ -386,7 +386,7 @@ cdef class Vectors:
        """Loads state from a directory. Modifies the object in place and
        returns it.

        path (unicode / Path): Directory path, string or Path-like object.
        path (str / Path): Directory path, string or Path-like object.
        RETURNS (Vectors): The modified object.

        DOCS: https://spacy.io/api/vectors#from_disk

@@ -505,8 +505,8 @@ tokenization can be provided.
> ```

| Key      | Type    | Description                                                |
| -------- | ------- | ---------------------------------------------------------- |
| `text`   | unicode | The raw input text. Is not required if `tokens` available. |
| -------- | ---- | ---------------------------------------------------------- |
| `text`   | str  | The raw input text. Is not required if `tokens` available. |
| `tokens` | list | Optional tokenization, one string per token.               |

```json

@@ -170,7 +170,7 @@ vocabulary.
| Name        | Type             | Description                                                                                  |
| ----------- | ---------------- | -------------------------------------------------------------------------------------------- |
| `mem`       | `cymem.Pool`     | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
| `string`    | unicode          | The string of the word to look up.                                                          |
| `string`    | str              | The string of the word to look up.                                                          |
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary.                                                               |

### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}

@@ -230,8 +230,8 @@ Add a new label to the pipe.
> ```

| Name    | Type    | Description       |
| ------- | ------- | ----------------- |
| `label` | unicode | The label to add. |
| ------- | ---- | ----------------- |
| `label` | str  | The label to add. |

## DependencyParser.to_disk {#to_disk tag="method"}

@@ -245,8 +245,8 @@ Serialize the pipe to disk.
> ```

| Name      | Type             | Description                                                                                                            |
| --------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

## DependencyParser.from_disk {#from_disk tag="method"}

@@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name        | Type               | Description                                                                 |
| ----------- | ------------------ | --------------------------------------------------------------------------- |
| `path`      | unicode / `Path`   | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `path`      | str / `Path`       | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude`   | list               | String names of [serialization fields](#serialization-fields) to exclude.  |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object.                                    |

@@ -123,7 +123,7 @@ details, see the documentation on
| Name      | Type     | Description                                                                                                                  |
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------ |
| `name`    | unicode  | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`.               |
| `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`.               |
| `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                  |
| `method`  | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`.                                                  |
| `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.  |

@@ -146,8 +146,8 @@ Look up a previously registered extension by name. Returns a 4-tuple
> ```

| Name        | Type    | Description                                                   |
| ----------- | ------- | ------------------------------------------------------------- |
| `name`      | unicode | Name of the extension.                                        |
| ----------- | ----- | ------------------------------------------------------------- |
| `name`      | str   | Name of the extension.                                        |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |

## Doc.has_extension {#has_extension tag="classmethod" new="2"}

@@ -163,8 +163,8 @@ Check whether an extension has been registered on the `Doc` class.
> ```

| Name        | Type    | Description                                |
| ----------- | ------- | ------------------------------------------ |
| `name`      | unicode | Name of the extension to check.            |
| ----------- | ---- | ------------------------------------------ |
| `name`      | str  | Name of the extension to check.            |
| **RETURNS** | bool | Whether the extension has been registered. |

## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}

@@ -181,8 +181,8 @@ Remove a previously registered extension.
> ```

| Name        | Type    | Description                                                            |
| ----------- | ------- | ---------------------------------------------------------------------- |
| `name`      | unicode | Name of the extension.                                                 |
| ----------- | ----- | ---------------------------------------------------------------------- |
| `name`      | str   | Name of the extension.                                                 |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |

## Doc.char_span {#char_span tag="method" new="2"}

@@ -369,8 +369,8 @@ Save the current state to a directory.
> ```

| Name      | Type             | Description                                                                                                            |
| --------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

## Doc.from_disk {#from_disk tag="method" new="2"}

@@ -386,8 +386,8 @@ Loads state from a directory. Modifies the object in place and returns it.
> ```

| Name        | Type             | Description                                                                 |
| ----------- | ---------------- | ----------------------------------------------------------------------------- |
| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| ----------- | ------------ | -------------------------------------------------------------------------- |
| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
| **RETURNS** | `Doc`        | The modified `Doc` object.                                                 |

@@ -648,15 +648,15 @@ The L2 norm of the document's vector representation.
| Name                                 | Type      | Description                                                                                                             |
| ------------------------------------ | --------- | ------------------------------------------------------------------------------------------------------------------------- |
| `text`                               | unicode   | A unicode representation of the document text.                                                                          |
| `text_with_ws`                       | unicode   | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                   |
| `text`                               | str       | A unicode representation of the document text.                                                                          |
| `text_with_ws`                       | str       | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`.                                   |
| `mem`                                | `Pool`    | The document's local memory heap, for all C data it owns.                                                               |
| `vocab`                              | `Vocab`   | The store of lexical types.                                                                                             |
| `tensor` <Tag variant="new">2</Tag>  | `ndarray` | Container for dense vector representations.                                                                             |
| `cats` <Tag variant="new">2</Tag>    | dict      | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
| `user_data`                          | -         | A generic storage area, for user custom data.                                                                           |
| `lang` <Tag variant="new">2.1</Tag>  | int       | Language of the document's vocabulary.                                                                                  |
| `lang_` <Tag variant="new">2.1</Tag> | unicode   | Language of the document's vocabulary.                                                                                  |
| `lang_` <Tag variant="new">2.1</Tag> | str       | Language of the document's vocabulary.                                                                                  |
| `is_tagged`                          | bool      | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty.               |
| `is_parsed`                          | bool      | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty.                |
| `is_sentenced`                       | bool      | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty.     |

@@ -259,8 +259,8 @@ Serialize the pipe to disk.
> ```

| Name      | Type             | Description                                                                                                            |
| --------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

## EntityLinker.from_disk {#from_disk tag="method"}

@@ -275,8 +275,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
> ```

| Name        | Type             | Description                                                                 |
| ----------- | ---------------- | ----------------------------------------------------------------------------- |
| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| ----------- | -------------- | -------------------------------------------------------------------------- |
| `path`      | str / `Path`   | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude`   | list           | String names of [serialization fields](#serialization-fields) to exclude.  |
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object.                                        |

@@ -231,8 +231,8 @@ Add a new label to the pipe.
> ```

| Name    | Type    | Description       |
| ------- | ------- | ----------------- |
| `label` | unicode | The label to add. |
| ------- | ---- | ----------------- |
| `label` | str  | The label to add. |

## EntityRecognizer.to_disk {#to_disk tag="method"}

@@ -246,8 +246,8 @@ Serialize the pipe to disk.
> ```

| Name      | Type             | Description                                                                                                            |
| --------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

## EntityRecognizer.from_disk {#from_disk tag="method"}

@@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name        | Type               | Description                                                                 |
| ----------- | ------------------ | ----------------------------------------------------------------------------- |
| `path`      | unicode / `Path`   | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `path`      | str / `Path`       | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude`   | list               | String names of [serialization fields](#serialization-fields) to exclude.  |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object.                                    |

@@ -73,8 +73,8 @@ Whether a label is present in the patterns.
> ```

| Name        | Type    | Description                                  |
| ----------- | ------- | -------------------------------------------- |
| `label`     | unicode | The label to check.                          |
| ----------- | ---- | -------------------------------------------- |
| `label`     | str  | The label to check.                          |
| **RETURNS** | bool | Whether the entity ruler contains the label. |

## EntityRuler.\_\_call\_\_ {#call tag="method"}

@@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap
with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
patterns over shorter, and if equal the match occuring first in the Doc is chosen.
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
longer patterns over shorter, and if equal the match occurring first in the Doc
is chosen.

> #### Example
>
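A minimal sketch of the call described above (the pattern and text are
illustrative, not the original docs example):

> ```python
> # Hypothetical usage; `nlp` and the pattern are assumptions.
> ruler = EntityRuler(nlp, overwrite_ents=True)
> ruler.add_patterns([{"label": "ORG", "pattern": "Apple"}])
> doc = ruler(nlp.make_doc("Apple is opening a new store."))
> ```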

@@ -140,8 +141,8 @@ only the patterns are saved as JSONL. If a directory name is provided, a
> ```

| Name   | Type             | Description                                                                                                                          |
| ------ | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| ------ | ------------ | ------------------------------------------------------------------------------------------------------------------------------------ |
| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |

## EntityRuler.from_disk {#from_disk tag="method"}

@@ -159,8 +160,8 @@ configuration.
> ```

| Name        | Type             | Description                                                                               |
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
| `path`      | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
| ----------- | ------------- | ------------------------------------------------------------------------------------------ |
| `path`      | str / `Path`  | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object.                                                        |

## EntityRuler.to_bytes {#to_bytes tag="method"}

@@ -18,7 +18,7 @@ Create a `GoldCorpus`. If the input data is an iterable, each item should be a
for further details.

| Name        | Type                        | Description                                                   |
| ----------- | --------------------------- | --------------------------------------------------------------- |
| `train`     | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable.    |
| `dev`       | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
| ----------- | ----------------------- | ------------------------------------------------------------ |
| `train`     | str / `Path` / iterable | Training data, as a path (file or directory) or iterable.    |
| `dev`       | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
| **RETURNS** | `GoldCorpus`            | The newly constructed object.                                 |

@@ -60,7 +60,8 @@ Whether the provided syntactic annotations form a projective dependency tree.

Convert a list of Doc objects into the
[JSON-serializable format](/api/annotation#json-input) used by the
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc.
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
'paragraph' in the output doc.

> #### Example
>

@@ -158,7 +159,7 @@ single-token entity.
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------ |
| `doc`       | `Doc`    | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document.                          |
| `entities`  | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
| **RETURNS** | list     | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags.                                                                            |
| **RETURNS** | list     | Strings describing the [BILUO](/api/annotation#biluo) tags.                                                                                     |

### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}

@@ -1,16 +1,19 @@
---
title: KnowledgeBase
teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
teaser:
  A storage class for entities and aliases of a specific knowledge base
  (ontology)
tag: class
source: spacy/kb.pyx
new: 2.2
---

The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
objects, which are plausible external identifiers given a certain textual mention.
Each such `Candidate` holds information from the relevant KB entities,
such as its frequency in text and possible aliases.
Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
The `KnowledgeBase` object provides a method to generate
[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
identifiers given a certain textual mention. Each such `Candidate` holds
information from the relevant KB entities, such as its frequency in text and
possible aliases. Each entity in the knowledge base also has a pretrained entity
vector of a fixed size.

## KnowledgeBase.\_\_init\_\_ {#init tag="method"}

@@ -25,24 +28,24 @@ Create the knowledge base.
> ```

| Name                   | Type            | Description                              |
| ----------------------- | ---------------- | ----------------------------------------- |
| ---------------------- | --------------- | ---------------------------------------- |
| `vocab`                | `Vocab`         | A `Vocab` object.                        |
| `entity_vector_length` | int             | Length of the fixed-size entity vectors. |
| **RETURNS**            | `KnowledgeBase` | The newly constructed object.            |

## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}

The length of the fixed-size entity vectors in the knowledge base.

| Name        | Type | Description                              |
| ----------- | ---- | ----------------------------------------- |
| ----------- | ---- | ---------------------------------------- |
| **RETURNS** | int  | Length of the fixed-size entity vectors. |

## KnowledgeBase.add_entity {#add_entity tag="method"}

Add an entity to the knowledge base, specifying its corpus frequency
and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
Add an entity to the knowledge base, specifying its corpus frequency and entity
vector, which should be of length
[`entity_vector_length`](/api/kb#entity_vector_length).

> #### Example
>

@@ -52,15 +55,15 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
> ```

| Name            | Type   | Description                                     |
| --------------- | ------------- | ------------------------------------------------- |
| `entity`        | unicode       | The unique entity identifier                      |
| --------------- | ------ | ----------------------------------------------- |
| `entity`        | str    | The unique entity identifier                    |
| `freq`          | float  | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pretrained vector of the entity             |
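A sketch of the call described by the table above; the identifier and values
are illustrative, not from the original docs:

> ```python
> # "Q42", the frequency and the vector are made-up example values.
> kb.add_entity(entity="Q42", freq=32.0, entity_vector=[1.0, 2.0, 3.0])
> ```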

## KnowledgeBase.set_entities {#set_entities tag="method"}

Define the full list of entities in the knowledge base, specifying the corpus frequency
and entity vector for each entity.
Define the full list of entities in the knowledge base, specifying the corpus
frequency and entity vector for each entity.

> #### Example
>

@@ -69,17 +72,18 @@ and entity vector for each entity.
> ```

| Name          | Type     | Description                       |
| ------------- | ------------- | ------------------------------------------------- |
| ------------- | -------- | --------------------------------- |
| `entity_list` | iterable | List of unique entity identifiers |
| `freq_list`   | iterable | List of entity frequencies        |
| `vector_list` | iterable | List of entity vectors            |

## KnowledgeBase.add_alias {#add_alias tag="method"}

Add an alias or mention to the knowledge base, specifying its potential KB identifiers
and their prior probabilities. The entity identifiers should refer to entities previously
added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
The sum of the prior probabilities should not exceed 1.
Add an alias or mention to the knowledge base, specifying its potential KB
identifiers and their prior probabilities. The entity identifiers should refer
to entities previously added with [`add_entity`](/api/kb#add_entity) or
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
should not exceed 1.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
| 
						 | 
				
			
			@ -88,10 +92,10 @@ The sum of the prior probabilities should not exceed 1.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name            | Type     | Description                                        |
 | 
			
		||||
| -------------- | ------------- | -------------------------------------------------- |
 | 
			
		||||
| `alias`        | unicode       | The textual mention or alias                       |
 | 
			
		||||
| --------------- | -------- | -------------------------------------------------- |
 | 
			
		||||
| `alias`         | str      | The textual mention or alias                       |
 | 
			
		||||
| `entities`      | iterable | The potential entities that the alias may refer to |
 | 
			
		||||
| `probabilities`| iterable      | The prior probabilities of each entity             |
 | 
			
		||||
| `probabilities` | iterable | The prior probabilities of each entity             |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -118,7 +122,7 @@ Get a list of all entity IDs in the knowledge base.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                 |
 | 
			
		||||
| ----------- | ---- | --------------------------------------------- |
 | 
			
		||||
| ----------- | ---- | ------------------------------------------- |
 | 
			
		||||
| **RETURNS** | list | The list of entities in the knowledge base. |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -132,7 +136,7 @@ Get the total number of aliases in the knowledge base.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                  |
 | 
			
		||||
| ----------- | ---- | --------------------------------------------- |
 | 
			
		||||
| ----------- | ---- | -------------------------------------------- |
 | 
			
		||||
| **RETURNS** | int  | The number of aliases in the knowledge base. |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -146,7 +150,7 @@ Get a list of all aliases in the knowledge base.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                |
 | 
			
		||||
| ----------- | ---- | --------------------------------------------- |
 | 
			
		||||
| ----------- | ---- | ------------------------------------------ |
 | 
			
		||||
| **RETURNS** | list | The list of aliases in the knowledge base. |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -161,8 +165,8 @@ of type [`Candidate`](/api/kb/#candidate_init).
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type     | Description                              |
 | 
			
		||||
| ------------- | ------------- | -------------------------------------------------- |
 | 
			
		||||
| `alias`       | unicode       | The textual mention or alias                       |
 | 
			
		||||
| ----------- | -------- | ---------------------------------------- |
 | 
			
		||||
| `alias`     | str      | The textual mention or alias             |
 | 
			
		||||
| **RETURNS** | iterable | The list of relevant `Candidate` objects |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.get_vector {#get_vector tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -176,14 +180,14 @@ Given a certain entity ID, retrieve its pretrained entity vector.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type   | Description       |
 | 
			
		||||
| ------------- | ------------- | -------------------------------------------------- |
 | 
			
		||||
| `entity`      | unicode       | The entity ID                                      |
 | 
			
		||||
| ----------- | ------ | ----------------- |
 | 
			
		||||
| `entity`    | str    | The entity ID     |
 | 
			
		||||
| **RETURNS** | vector | The entity vector |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
 | 
			
		||||
 | 
			
		||||
Given a certain entity ID and a certain textual mention, retrieve
 | 
			
		||||
the prior probability of the fact that the mention links to the entity ID.
 | 
			
		||||
Given a certain entity ID and a certain textual mention, retrieve the prior
 | 
			
		||||
probability of the fact that the mention links to the entity ID.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
| 
						 | 
				
			
			@ -192,9 +196,9 @@ the prior probability of the fact that the mention links to the entity ID.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type  | Description                                                    |
 | 
			
		||||
| ------------- | ------------- | --------------------------------------------------------------- |
 | 
			
		||||
| `entity`      | unicode       | The entity ID                                                   |
 | 
			
		||||
| `alias`       | unicode       | The textual mention or alias                                    |
 | 
			
		||||
| ----------- | ----- | -------------------------------------------------------------- |
 | 
			
		||||
| `entity`    | str   | The entity ID                                                  |
 | 
			
		||||
| `alias`     | str   | The textual mention or alias                                   |
 | 
			
		||||
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.dump {#dump tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -208,13 +212,13 @@ Save the current state of the knowledge base to a directory.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name  | Type         | Description                                                                                                           |
 | 
			
		||||
| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
 | 
			
		||||
| `loc`         | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.    |
 | 
			
		||||
| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
 | 
			
		||||
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
 | 
			
		||||
 | 
			
		||||
Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
 | 
			
		||||
should also be the same as the one used to create the KB.
 | 
			
		||||
Restore the state of the knowledge base from a given directory. Note that the
 | 
			
		||||
[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
| 
						 | 
				
			
			@ -226,18 +230,16 @@ should also be the same as the one used to create the KB.
 | 
			
		|||
> kb.load_bulk("/path/to/kb")
 | 
			
		||||
> ```
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
| Name        | Type            | Description                                                                |
 | 
			
		||||
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
 | 
			
		||||
| `loc`       | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.                |
 | 
			
		||||
| ----------- | --------------- | -------------------------------------------------------------------------- |
 | 
			
		||||
| `loc`       | str / `Path`    | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object.                                       |
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
 | 
			
		||||
 | 
			
		||||
Construct a `Candidate` object. Usually this constructor is not called directly,
 | 
			
		||||
but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
 | 
			
		||||
of a `KnowledgeBase`.
 | 
			
		||||
but instead these objects are returned by the
 | 
			
		||||
[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`.
 | 
			
		||||
 | 
			
		||||
> #### Example
 | 
			
		||||
>
 | 
			
		||||
| 
						 | 
				
			
			@ -258,11 +260,11 @@ of a `KnowledgeBase`.
 | 
			
		|||
## Candidate attributes {#candidate_attributes}
 | 
			
		||||
 | 
			
		||||
| Name            | Type   | Description                                                    |
 | 
			
		||||
| ---------------------- | ------------ | ------------------------------------------------------------------ |
 | 
			
		||||
| --------------- | ------ | -------------------------------------------------------------- |
 | 
			
		||||
| `entity`        | int    | The entity's unique KB identifier                              |
 | 
			
		||||
| `entity_`              | unicode      | The entity's unique KB identifier                                  |
 | 
			
		||||
| `entity_`       | str    | The entity's unique KB identifier                              |
 | 
			
		||||
| `alias`         | int    | The alias or textual mention                                   |
 | 
			
		||||
| `alias_`               | unicode      | The alias or textual mention                                       |
 | 
			
		||||
| `alias_`        | str    | The alias or textual mention                                   |
 | 
			
		||||
| `prior_prob`    | long   | The prior probability of the `alias` referring to the `entity` |
 | 
			
		||||
| `entity_freq`   | long   | The frequency of the entity in a typical corpus                |
 | 
			
		||||
| `entity_vector` | vector | The pretrained vector of the entity                            |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -50,8 +50,8 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type  | Description                                                                       |
 | 
			
		||||
| ----------- | ------- | --------------------------------------------------------------------------------- |
 | 
			
		||||
| `text`      | unicode | The text to be processed.                                                         |
 | 
			
		||||
| ----------- | ----- | --------------------------------------------------------------------------------- |
 | 
			
		||||
| `text`      | str   | The text to be processed.                                                         |
 | 
			
		||||
| `disable`   | list  | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
 | 
			
		||||
| **RETURNS** | `Doc` | A container for accessing the annotations.                                        |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -201,7 +201,7 @@ Create a pipeline component from a factory.
 | 
			
		|||
 | 
			
		||||
| Name        | Type     | Description                                                                        |
 | 
			
		||||
| ----------- | -------- | ---------------------------------------------------------------------------------- |
 | 
			
		||||
| `name`      | unicode  | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
 | 
			
		||||
| `name`      | str      | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
 | 
			
		||||
| `config`    | dict     | Configuration parameters to initialize component.                                  |
 | 
			
		||||
| **RETURNS** | callable | The pipeline component.                                                            |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`,
 | 
			
		|||
| Name        | Type     | Description                                                                                                                                                                                                                                            |
 | 
			
		||||
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
			
		||||
| `component` | callable | The pipeline component.                                                                                                                                                                                                                                |
 | 
			
		||||
| `name`      | unicode  | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
 | 
			
		||||
| `before`    | unicode  | Component name to insert component directly before.                                                                                                                                                                                                    |
 | 
			
		||||
| `after`     | unicode  | Component name to insert component directly after:                                                                                                                                                                                                     |
 | 
			
		||||
| `name`      | str      | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
 | 
			
		||||
| `before`    | str      | Component name to insert component directly before.                                                                                                                                                                                                    |
 | 
			
		||||
| `after`     | str      | Component name to insert component directly after:                                                                                                                                                                                                     |
 | 
			
		||||
| `first`     | bool     | Insert component first / not first in the pipeline.                                                                                                                                                                                                    |
 | 
			
		||||
| `last`      | bool     | Insert component last / not last in the pipeline.                                                                                                                                                                                                      |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -244,8 +244,8 @@ Check whether a component is present in the pipeline. Equivalent to
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                              |
 | 
			
		||||
| ----------- | ------- | -------------------------------------------------------- |
 | 
			
		||||
| `name`      | unicode | Name of the pipeline component to check.                 |
 | 
			
		||||
| ----------- | ---- | -------------------------------------------------------- |
 | 
			
		||||
| `name`      | str  | Name of the pipeline component to check.                 |
 | 
			
		||||
| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
 | 
			
		||||
 | 
			
		||||
## Language.get_pipe {#get_pipe tag="method" new="2"}
 | 
			
		||||
| 
						 | 
				
			
			@ -261,7 +261,7 @@ Get a pipeline component for a given component name.
 | 
			
		|||
 | 
			
		||||
| Name        | Type     | Description                            |
 | 
			
		||||
| ----------- | -------- | -------------------------------------- |
 | 
			
		||||
| `name`      | unicode  | Name of the pipeline component to get. |
 | 
			
		||||
| `name`      | str      | Name of the pipeline component to get. |
 | 
			
		||||
| **RETURNS** | callable | The pipeline component.                |
 | 
			
		||||
 | 
			
		||||
## Language.replace_pipe {#replace_pipe tag="method" new="2"}
 | 
			
		||||
| 
						 | 
				
			
			@ -276,7 +276,7 @@ Replace a component in the pipeline.
 | 
			
		|||
 | 
			
		||||
| Name        | Type     | Description                       |
 | 
			
		||||
| ----------- | -------- | --------------------------------- |
 | 
			
		||||
| `name`      | unicode  | Name of the component to replace. |
 | 
			
		||||
| `name`      | str      | Name of the component to replace. |
 | 
			
		||||
| `component` | callable | The pipeline component to insert. |
 | 
			
		||||
 | 
			
		||||
## Language.rename_pipe {#rename_pipe tag="method" new="2"}
 | 
			
		||||
| 
						 | 
				
			
			@ -293,9 +293,9 @@ added to the pipeline, you can also use the `name` argument on
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name       | Type | Description                      |
 | 
			
		||||
| ---------- | ------- | -------------------------------- |
 | 
			
		||||
| `old_name` | unicode | Name of the component to rename. |
 | 
			
		||||
| `new_name` | unicode | New name of the component.       |
 | 
			
		||||
| ---------- | ---- | -------------------------------- |
 | 
			
		||||
| `old_name` | str  | Name of the component to rename. |
 | 
			
		||||
| `new_name` | str  | New name of the component.       |
 | 
			
		||||
 | 
			
		||||
## Language.remove_pipe {#remove_pipe tag="method" new="2"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -310,8 +310,8 @@ component function.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type  | Description                                           |
 | 
			
		||||
| ----------- | ------- | ----------------------------------------------------- |
 | 
			
		||||
| `name`      | unicode | Name of the component to remove.                      |
 | 
			
		||||
| ----------- | ----- | ----------------------------------------------------- |
 | 
			
		||||
| `name`      | str   | Name of the component to remove.                      |
 | 
			
		||||
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
 | 
			
		||||
 | 
			
		||||
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
 | 
			
		||||
| 
						 | 
				
			
			@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled.
 | 
			
		|||
| Name        | Type            | Description                                                                          |
 | 
			
		||||
| ----------- | --------------- | ------------------------------------------------------------------------------------ |
 | 
			
		||||
| `disable`   | list            | Names of pipeline components to disable.                                             |
 | 
			
		||||
| `disable`   | unicode         | Name of pipeline component to disable.                                               |
 | 
			
		||||
| `disable`   | str             | Name of pipeline component to disable.                                               |
 | 
			
		||||
| `enable`    | list            | Names of pipeline components that will not be disabled.                              |
 | 
			
		||||
| `enable`    | unicode         | Name of pipeline component that will not be disabled.                                |
 | 
			
		||||
| `enable`    | str             | Name of pipeline component that will not be disabled.                                |
 | 
			
		||||
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
<Infobox title="Changed in v3.0" variant="warning">
 | 
			
		||||
 | 
			
		||||
As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
 | 
			
		||||
| 
						 | 
				
			
			@ -371,8 +370,8 @@ the model**.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name      | Type         | Description                                                                                                           |
 | 
			
		||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| `exclude` | list         | Names of pipeline components or [serialization fields](#serialization-fields) to exclude.                             |
 | 
			
		||||
 | 
			
		||||
## Language.from_disk {#from_disk tag="method" new="2"}
 | 
			
		||||
| 
						 | 
				
			
			@ -396,8 +395,8 @@ loaded object.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type         | Description                                                                               |
 | 
			
		||||
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.                |
 | 
			
		||||
| ----------- | ------------ | ----------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.                |
 | 
			
		||||
| `exclude`   | list         | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
 | 
			
		||||
| **RETURNS** | `Language`   | The modified `Language` object.                                                           |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -481,9 +480,9 @@ per component.
 | 
			
		|||
## Class attributes {#class-attributes}
 | 
			
		||||
 | 
			
		||||
| Name                                   | Type  | Description                                                                                                                         |
 | 
			
		||||
| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `Defaults`                             | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline.                                           |
 | 
			
		||||
| `lang`                                 | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).                                     |
 | 
			
		||||
| `lang`                                 | str   | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes).                                     |
 | 
			
		||||
| `factories` <Tag variant="new">2</Tag> | dict  | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
 | 
			
		||||
 | 
			
		||||
## Serialization fields {#serialization-fields}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -63,8 +63,8 @@ Lemmatize a string.
 | 
			
		|||
 | 
			
		||||
| Name         | Type          | Description                                                                                              |
 | 
			
		||||
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `string`     | unicode       | The string to lemmatize, e.g. the token text.                                                            |
 | 
			
		||||
| `univ_pos`   | unicode / int | The token's universal part-of-speech tag.                                                                |
 | 
			
		||||
| `string`     | str           | The string to lemmatize, e.g. the token text.                                                            |
 | 
			
		||||
| `univ_pos`   | str / int     | The token's universal part-of-speech tag.                                                                |
 | 
			
		||||
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
 | 
			
		||||
| **RETURNS**  | list          | The available lemmas for the string.                                                                     |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -83,10 +83,10 @@ original string is returned. Languages can provide a
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                                                                                 |
 | 
			
		||||
| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `string`    | unicode | The string to look up.                                                                                      |
 | 
			
		||||
| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `string`    | str  | The string to look up.                                                                                      |
 | 
			
		||||
| `orth`      | int  | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
 | 
			
		||||
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string.                                           |
 | 
			
		||||
| **RETURNS** | str  | The lemma if the string was found, otherwise the original string.                                           |
 | 
			
		||||
 | 
			
		||||
## Lemmatizer.is_base_form {#is_base_form tag="method"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -103,8 +103,8 @@ lemmatization entirely.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name         | Type      | Description                                                                             |
 | 
			
		||||
| ------------ | ------------- | --------------------------------------------------------------------------------------- |
 | 
			
		||||
| `univ_pos`   | unicode / int | The token's universal part-of-speech tag.                                               |
 | 
			
		||||
| ------------ | --------- | --------------------------------------------------------------------------------------- |
 | 
			
		||||
| `univ_pos`   | str / int | The token's universal part-of-speech tag.                                               |
 | 
			
		||||
| `morphology` | dict      | The token's morphological features.                                                     |
 | 
			
		||||
| **RETURNS**  | bool      | Whether the token's part-of-speech tag and morphological features describe a base form. |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
 | 
			
		|||
| Name                                         | Type    | Description                                                                                                                                                                                                                                                  |
 | 
			
		||||
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
 | 
			
		||||
| `vocab`                                      | `Vocab` | The lexeme's vocabulary.                                                                                                                                                                                                                                     |
 | 
			
		||||
| `text`                                       | unicode | Verbatim text content.                                                                                                                                                                                                                                       |
 | 
			
		||||
| `text`                                       | str     | Verbatim text content.                                                                                                                                                                                                                                       |
 | 
			
		||||
| `orth`                                       | int     | ID of the verbatim text content.                                                                                                                                                                                                                             |
 | 
			
		||||
| `orth_`                                      | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.                                                                                                                                                 |
 | 
			
		||||
| `orth_`                                      | str     | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes.                                                                                                                                                 |
 | 
			
		||||
| `rank`                                       | int     | Sequential ID of the lexemes's lexical type, used to index into tables, e.g. for word vectors.                                                                                                                                                               |
 | 
			
		||||
| `flags`                                      | int     | Container of the lexeme's binary flags.                                                                                                                                                                                                                      |
 | 
			
		||||
| `norm`                                       | int     | The lexemes's norm, i.e. a normalized form of the lexeme text.                                                                                                                                                                                               |
 | 
			
		||||
| `norm_`                                      | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text.                                                                                                                                                                                               |
 | 
			
		||||
| `norm_`                                      | str     | The lexemes's norm, i.e. a normalized form of the lexeme text.                                                                                                                                                                                               |
 | 
			
		||||
| `lower`                                      | int     | Lowercase form of the word.                                                                                                                                                                                                                                  |
 | 
			
		||||
| `lower_`                                     | unicode | Lowercase form of the word.                                                                                                                                                                                                                                  |
 | 
			
		||||
| `lower_`                                     | str     | Lowercase form of the word.                                                                                                                                                                                                                                  |
 | 
			
		||||
| `shape`                                      | int     | Transform of the words's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
 | 
			
		||||
| `shape_`                                     | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`.  |
 | 
			
		||||
| `shape_`                                     | str     | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`.  |
 | 
			
		||||
| `prefix`                                     | int     | Length-N substring from the start of the word. Defaults to `N=1`.                                                                                                                                                                                            |
 | 
			
		||||
| `prefix_`                                    | unicode | Length-N substring from the start of the word. Defaults to `N=1`.                                                                                                                                                                                            |
 | 
			
		||||
| `prefix_`                                    | str     | Length-N substring from the start of the word. Defaults to `N=1`.                                                                                                                                                                                            |
 | 
			
		||||
| `suffix`                                     | int     | Length-N substring from the end of the word. Defaults to `N=3`.                                                                                                                                                                                              |
 | 
			
		||||
| `suffix_`                                    | unicode | Length-N substring from the start of the word. Defaults to `N=3`.                                                                                                                                                                                            |
 | 
			
		||||
| `suffix_`                                    | str     | Length-N substring from the start of the word. Defaults to `N=3`.                                                                                                                                                                                            |
 | 
			
		||||
| `is_alpha`                                   | bool    | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`.                                                                                                                                                                     |
 | 
			
		||||
| `is_ascii`                                   | bool    | Does the lexeme consist of ASCII characters? Equivalent to `[any(ord(c) >= 128 for c in lexeme.text)]`.                                                                                                                                                      |
 | 
			
		||||
| `is_digit`                                   | bool    | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`.                                                                                                                                                                                    |
 | 
			
		||||
| 
						 | 
				
			
			@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
 | 
			
		|||
| `is_oov`                                     | bool    | Is the lexeme out-of-vocabulary?                                                                                                                                                                                                                             |
 | 
			
		||||
| `is_stop`                                    | bool    | Is the lexeme part of a "stop list"?                                                                                                                                                                                                                         |
 | 
			
		||||
| `lang`                                       | int     | Language of the parent vocabulary.                                                                                                                                                                                                                           |
 | 
			
		||||
| `lang_`                                      | unicode | Language of the parent vocabulary.                                                                                                                                                                                                                           |
 | 
			
		||||
| `lang_`                                      | str     | Language of the parent vocabulary.                                                                                                                                                                                                                           |
 | 
			
		||||
| `prob`                                       | float   | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary).                                                                                                                                                   |
 | 
			
		||||
| `cluster`                                    | int     | Brown cluster ID.                                                                                                                                                                                                                                            |
 | 
			
		||||
| `sentiment`                                  | float   | A scalar value indicating the positivity or negativity of the lexeme.                                                                                                                                                                                        |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -57,8 +57,8 @@ Check if the lookups contain a table of a given name. Delegates to
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                     |
 | 
			
		||||
| ----------- | ------- | ----------------------------------------------- |
 | 
			
		||||
| `name`      | unicode | Name of the table.                              |
 | 
			
		||||
| ----------- | ---- | ----------------------------------------------- |
 | 
			
		||||
| `name`      | str  | Name of the table.                              |
 | 
			
		||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
 | 
			
		||||
 | 
			
		||||
## Lookups.tables {#tables tag="property"}
 | 
			
		||||
| 
						 | 
				
			
			@ -91,7 +91,7 @@ exists.
 | 
			
		|||
 | 
			
		||||
| Name        | Type                          | Description                        |
 | 
			
		||||
| ----------- | ----------------------------- | ---------------------------------- |
 | 
			
		||||
| `name`      | unicode                       | Unique name of the table.          |
 | 
			
		||||
| `name`      | str                           | Unique name of the table.          |
 | 
			
		||||
| `data`      | dict                          | Optional data to add to the table. |
 | 
			
		||||
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table.             |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
 | 
			
		|||
 | 
			
		||||
| Name        | Type                          | Description        |
 | 
			
		||||
| ----------- | ----------------------------- | ------------------ |
 | 
			
		||||
| `name`      | unicode                       | Name of the table. |
 | 
			
		||||
| `name`      | str                           | Name of the table. |
 | 
			
		||||
| **RETURNS** | [`Table`](/api/lookups#table) | The table.         |
 | 
			
		||||
 | 
			
		||||
## Lookups.remove_table {#remove_table tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
 | 
			
		|||
 | 
			
		||||
| Name        | Type                          | Description                  |
 | 
			
		||||
| ----------- | ----------------------------- | ---------------------------- |
 | 
			
		||||
| `name`      | unicode                       | Name of the table to remove. |
 | 
			
		||||
| `name`      | str                           | Name of the table to remove. |
 | 
			
		||||
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table.           |
 | 
			
		||||
 | 
			
		||||
## Lookups.has_table {#has_table tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -145,8 +145,8 @@ Check if the lookups contain a table of a given name. Equivalent to
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                     |
 | 
			
		||||
| ----------- | ------- | ----------------------------------------------- |
 | 
			
		||||
| `name`      | unicode | Name of the table.                              |
 | 
			
		||||
| ----------- | ---- | ----------------------------------------------- |
 | 
			
		||||
| `name`      | str  | Name of the table.                              |
 | 
			
		||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
 | 
			
		||||
 | 
			
		||||
## Lookups.to_bytes {#to_bytes tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -192,8 +192,8 @@ which will be created if it doesn't exist.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name   | Type         | Description                                                                                                           |
 | 
			
		||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
 | 
			
		||||
## Lookups.from_disk {#from_disk tag="method"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -209,8 +209,8 @@ the file doesn't exist.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type         | Description                                                                |
 | 
			
		||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
 | 
			
		||||
| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
 | 
			
		||||
| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| **RETURNS** | `Lookups`    | The loaded lookups.                                                        |
 | 
			
		||||
 | 
			
		||||
## Table {#table tag="class, ordererddict"}
 | 
			
		||||
| 
						 | 
				
			
			@ -238,7 +238,7 @@ Initialize a new table.
 | 
			
		|||
 | 
			
		||||
| Name        | Type    | Description                        |
 | 
			
		||||
| ----------- | ------- | ---------------------------------- |
 | 
			
		||||
| `name`      | unicode | Optional table name for reference. |
 | 
			
		||||
| `name`      | str     | Optional table name for reference. |
 | 
			
		||||
| **RETURNS** | `Table` | The newly constructed object.      |
 | 
			
		||||
 | 
			
		||||
### Table.from_dict {#table.from_dict tag="classmethod"}
 | 
			
		||||
| 
						 | 
				
			
			@ -256,7 +256,7 @@ Initialize a new table from a dict.
 | 
			
		|||
| Name        | Type    | Description                        |
 | 
			
		||||
| ----------- | ------- | ---------------------------------- |
 | 
			
		||||
| `data`      | dict    | The dictionary.                    |
 | 
			
		||||
| `name`      | unicode | Optional table name for reference. |
 | 
			
		||||
| `name`      | str     | Optional table name for reference. |
 | 
			
		||||
| **RETURNS** | `Table` | The newly constructed object.      |
 | 
			
		||||
 | 
			
		||||
### Table.set {#table.set tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -274,8 +274,8 @@ Set a new key / value pair. String keys will be hashed. Same as
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name    | Type      | Description |
 | 
			
		||||
| ------- | ------------- | ----------- |
 | 
			
		||||
| `key`   | unicode / int | The key.    |
 | 
			
		||||
| ------- | --------- | ----------- |
 | 
			
		||||
| `key`   | str / int | The key.    |
 | 
			
		||||
| `value` | -         | The value.  |
 | 
			
		||||
 | 
			
		||||
### Table.to_bytes {#table.to_bytes tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -313,6 +313,6 @@ Load a table from a bytestring.
 | 
			
		|||
 | 
			
		||||
| Name           | Type                        | Description                                           |
 | 
			
		||||
| -------------- | --------------------------- | ----------------------------------------------------- |
 | 
			
		||||
| `name`         | unicode                     | Table name.                                           |
 | 
			
		||||
| `name`         | str                         | Table name.                                           |
 | 
			
		||||
| `default_size` | int                         | Default size of bloom filters if no data is provided. |
 | 
			
		||||
| `bloom`        | `preshed.bloom.BloomFilter` | The bloom filters.                                    |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -126,8 +126,8 @@ Check whether the matcher contains rules for a match ID.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                           |
 | 
			
		||||
| ----------- | ------- | ----------------------------------------------------- |
 | 
			
		||||
| `key`       | unicode | The match ID.                                         |
 | 
			
		||||
| ----------- | ---- | ----------------------------------------------------- |
 | 
			
		||||
| `key`       | str  | The match ID.                                         |
 | 
			
		||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
 | 
			
		||||
 | 
			
		||||
## Matcher.add {#add tag="method" new="2"}
 | 
			
		||||
| 
						 | 
				
			
			@ -153,7 +153,7 @@ overwritten.
 | 
			
		|||
 | 
			
		||||
| Name        | Type               | Description                                                                                   |
 | 
			
		||||
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `match_id`  | unicode            | An ID for the thing you're matching.                                                          |
 | 
			
		||||
| `match_id`  | str                | An ID for the thing you're matching.                                                          |
 | 
			
		||||
| `on_match`  | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | 
			
		||||
| `*patterns` | list               | Match pattern. A pattern consists of a list of dicts, where each dict describes a token.      |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -189,8 +189,8 @@ exist.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name  | Type | Description               |
 | 
			
		||||
| ----- | ------- | ------------------------- |
 | 
			
		||||
| `key` | unicode | The ID of the match rule. |
 | 
			
		||||
| ----- | ---- | ------------------------- |
 | 
			
		||||
| `key` | str  | The ID of the match rule. |
 | 
			
		||||
 | 
			
		||||
## Matcher.get {#get tag="method" new="2"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -205,6 +205,6 @@ Retrieve the pattern stored for a key. Returns the rule as an
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type  | Description                                   |
 | 
			
		||||
| ----------- | ------- | --------------------------------------------- |
 | 
			
		||||
| `key`       | unicode | The ID of the match rule.                     |
 | 
			
		||||
| ----------- | ----- | --------------------------------------------- |
 | 
			
		||||
| `key`       | str   | The ID of the match rule.                     |
 | 
			
		||||
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -134,8 +134,8 @@ Check whether the matcher contains rules for a match ID.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name        | Type | Description                                           |
 | 
			
		||||
| ----------- | ------- | ----------------------------------------------------- |
 | 
			
		||||
| `key`       | unicode | The match ID.                                         |
 | 
			
		||||
| ----------- | ---- | ----------------------------------------------------- |
 | 
			
		||||
| `key`       | str  | The match ID.                                         |
 | 
			
		||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
 | 
			
		||||
 | 
			
		||||
## PhraseMatcher.add {#add tag="method"}
 | 
			
		||||
| 
						 | 
				
			
			@ -162,7 +162,7 @@ overwritten.
 | 
			
		|||
 | 
			
		||||
| Name       | Type               | Description                                                                                   |
 | 
			
		||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `match_id` | unicode            | An ID for the thing you're matching.                                                          |
 | 
			
		||||
| `match_id` | str                | An ID for the thing you're matching.                                                          |
 | 
			
		||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
 | 
			
		||||
| `*docs`    | `Doc`              | `Doc` objects of the phrases to match.                                                        |
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -199,5 +199,5 @@ does not exist.
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name  | Type | Description               |
 | 
			
		||||
| ----- | ------- | ------------------------- |
 | 
			
		||||
| `key` | unicode | The ID of the match rule. |
 | 
			
		||||
| ----- | ---- | ------------------------- |
 | 
			
		||||
| `key` | str  | The ID of the match rule. |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -113,7 +113,7 @@ end of the pipeline and after all other components.
 | 
			
		|||
</Infobox>
 | 
			
		||||
 | 
			
		||||
| Name        | Type  | Description                                                  |
 | 
			
		||||
| ----------- | ------- | ------------------------------------------------------------ |
 | 
			
		||||
| ----------- | ----- | ------------------------------------------------------------ |
 | 
			
		||||
| `doc`       | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
 | 
			
		||||
| `label`     | unicode | The subtoken dependency label. Defaults to `"subtok"`.       |
 | 
			
		||||
| `label`     | str   | The subtoken dependency label. Defaults to `"subtok"`.       |
 | 
			
		||||
| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens.                    |
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -82,8 +82,8 @@ a file `sentencizer.json`. This also happens automatically when you save an
 | 
			
		|||
> ```
 | 
			
		||||
 | 
			
		||||
| Name   | Type         | Description                                                                                                      |
 | 
			
		||||
| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
 | 
			
		||||
| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | 
			
		||||
 | 
			
		||||
## Sentencizer.from_disk {#from_disk tag="method"}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
@@ -99,8 +99,8 @@ added to its pipeline.
 > ```

 | Name        | Type          | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path`      | str / `Path`  | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `Sentencizer` | The modified `Sentencizer` object.                                         |
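A minimal round-trip sketch for the two methods above; the `/tmp/sentencizer.json` path is an arbitrary example:

```python
from spacy.pipeline import Sentencizer

sentencizer = Sentencizer()
# Both methods accept a str or a Path-like object
sentencizer.to_disk("/tmp/sentencizer.json")
sentencizer = Sentencizer().from_disk("/tmp/sentencizer.json")
```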

 ## Sentencizer.to_bytes {#to_bytes tag="method"}
@@ -110,7 +110,7 @@ For details, see the documentation on

 | Name      | Type     | Description                                                                                                                 |
 | --------- | -------- | --------------------------------------------------------------------------------------------------------------------------- |
-| `name`    | unicode  | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`.              |
+| `name`    | str      | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`.              |
 | `default` | -        | Optional default value of the attribute if no getter or method is defined.                                                  |
 | `method`  | callable | Set a custom method on the object, for example `span._.compare(other_span)`.                                                |
 | `getter`  | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute.  |
@@ -133,8 +133,8 @@ Look up a previously registered extension by name. Returns a 4-tuple
 > ```

 | Name        | Type  | Description                                                   |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                        |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                        |
 | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |

 ## Span.has_extension {#has_extension tag="classmethod" new="2"}
@@ -150,8 +150,8 @@ Check whether an extension has been registered on the `Span` class.
 > ```

 | Name        | Type | Description                                |
-| ----------- | ------- | ------------------------------------------ |
-| `name`      | unicode | Name of the extension to check.            |
+| ----------- | ---- | ------------------------------------------ |
+| `name`      | str  | Name of the extension to check.            |
 | **RETURNS** | bool | Whether the extension has been registered. |

 ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@@ -168,8 +168,8 @@ Remove a previously registered extension.
 > ```

 | Name        | Type  | Description                                                           |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name`      | unicode | Name of the extension.                                                |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name`      | str   | Name of the extension.                                                |
 | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
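The four classmethods changed in this file compose naturally; a minimal sketch with an invented `"has_city"` extension name:

```python
from spacy.tokens import Span

# Register, inspect and remove a custom extension on the Span class
Span.set_extension("has_city", default=False)
assert Span.has_extension("has_city")
default, method, getter, setter = Span.get_extension("has_city")
default, method, getter, setter = Span.remove_extension("has_city")
assert not Span.has_extension("has_city")
```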

 ## Span.char_span {#char_span tag="method" new="2.2.4"}
@@ -497,16 +497,16 @@ The L2 norm of the span's vector representation.
 | `end`          | int          | The token offset for the end of the span.                                                                      |
 | `start_char`   | int          | The character offset for the start of the span.                                                                |
 | `end_char`     | int          | The character offset for the end of the span.                                                                  |
-| `text`         | unicode      | A unicode representation of the span text.                                                                     |
-| `text_with_ws` | unicode      | The text content of the span with a trailing whitespace character if the last token has one.                   |
+| `text`         | str          | A unicode representation of the span text.                                                                     |
+| `text_with_ws` | str          | The text content of the span with a trailing whitespace character if the last token has one.                   |
 | `orth`         | int          | ID of the verbatim text content.                                                                               |
-| `orth_`        | unicode      | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes.     |
+| `orth_`        | str          | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes.     |
 | `label`        | int          | The hash value of the span's label.                                                                            |
-| `label_`       | unicode      | The span's label.                                                                                              |
-| `lemma_`       | unicode      | The span's lemma.                                                                                              |
+| `label_`       | str          | The span's label.                                                                                              |
+| `lemma_`       | str          | The span's lemma.                                                                                              |
 | `kb_id`        | int          | The hash value of the knowledge base ID referred to by the span.                                               |
-| `kb_id_`       | unicode      | The knowledge base ID referred to by the span.                                                                 |
+| `kb_id_`       | str          | The knowledge base ID referred to by the span.                                                                 |
 | `ent_id`       | int          | The hash value of the named entity the token is an instance of.                                                |
-| `ent_id_`      | unicode      | The string ID of the named entity the token is an instance of.                                                 |
+| `ent_id_`      | str          | The string ID of the named entity the token is an instance of.                                                 |
 | `sentiment`    | float        | A scalar value indicating the positivity or negativity of the span.                                            |
 | `_`            | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
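A quick sketch of the hash/string pairing the table documents, assuming the `en_core_web_sm` model is installed:

```python
import spacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp("San Francisco considers banning sidewalk delivery robots.")
span = doc.ents[0]
print(span.text, span.label_, span.start_char, span.end_char)
# The int attribute is the hash of the trailing-underscore string attribute
assert span.label == nlp.vocab.strings[span.label_]
```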
@@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa.
 | Name           | Type                     | Description                |
 | -------------- | ------------------------ | -------------------------- |
 | `string_or_id` | bytes, unicode or uint64 | The value to encode.       |
-| **RETURNS**    | unicode or int           | The value to be retrieved. |
+| **RETURNS**    | str or int               | The value to be retrieved. |

 ## StringStore.\_\_contains\_\_ {#contains tag="method"}
@@ -70,8 +70,8 @@ Check whether a string is in the store.
 > ```

 | Name        | Type | Description                            |
-| ----------- | ------- | -------------------------------------- |
-| `string`    | unicode | The string to check.                   |
+| ----------- | ---- | -------------------------------------- |
+| `string`    | str  | The string to check.                   |
 | **RETURNS** | bool | Whether the store contains the string. |

 ## StringStore.\_\_iter\_\_ {#iter tag="method"}
@@ -88,8 +88,8 @@ store will always include an empty string `''` at position `0`.
 > ```

 | Name       | Type | Description            |
-| ---------- | ------- | ---------------------- |
-| **YIELDS** | unicode | A string in the store. |
+| ---------- | ---- | ---------------------- |
+| **YIELDS** | str  | A string in the store. |

 ## StringStore.add {#add tag="method" new="2"}
@@ -107,8 +107,8 @@ Add a string to the `StringStore`.
 > ```

 | Name        | Type   | Description              |
-| ----------- | ------- | ------------------------ |
-| `string`    | unicode | The string to add.       |
+| ----------- | ------ | ------------------------ |
+| `string`    | str    | The string to add.       |
 | **RETURNS** | uint64 | The string's hash value. |

 ## StringStore.to_disk {#to_disk tag="method" new="2"}
@@ -122,8 +122,8 @@ Save the current state to a directory.
 > ```

 | Name   | Type         | Description                                                                                                            |
-| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |
+| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |

 ## StringStore.from_disk {#from_disk tag="method" new="2"}
@@ -137,8 +137,8 @@ Loads state from a directory. Modifies the object in place and returns it.
 > ```

 | Name        | Type          | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path`      | str / `Path`  | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | **RETURNS** | `StringStore` | The modified `StringStore` object.                                         |

 ## StringStore.to_bytes {#to_bytes tag="method"}
@@ -186,6 +186,6 @@ Get a 64-bit hash for a given string.
 > ```

 | Name        | Type   | Description         |
-| ----------- | ------- | ------------------- |
-| `string`    | unicode | The string to hash. |
+| ----------- | ------ | ------------------- |
+| `string`    | str    | The string to hash. |
 | **RETURNS** | uint64 | The hash.           |
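A minimal sketch tying together the `StringStore` methods touched in this file; the `"coffee"` string is arbitrary:

```python
from spacy.strings import StringStore, hash_string

stringstore = StringStore(["apple", "orange"])
coffee_hash = stringstore.add("coffee")       # returns the uint64 hash
assert "coffee" in stringstore
assert stringstore[coffee_hash] == "coffee"   # hash -> str
assert stringstore["coffee"] == coffee_hash   # str -> hash
assert hash_string("coffee") == coffee_hash   # the same 64-bit hash function
```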
@@ -230,8 +230,8 @@ Add a new label to the pipe.
 > ```

 | Name     | Type | Description                                                     |
-| -------- | ------- | --------------------------------------------------------------- |
-| `label`  | unicode | The label to add.                                               |
+| -------- | ---- | --------------------------------------------------------------- |
+| `label`  | str  | The label to add.                                               |
 | `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |

 ## Tagger.to_disk {#to_disk tag="method"}
@@ -246,8 +246,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                            |
-| --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | ------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## Tagger.from_disk {#from_disk tag="method"}
@@ -262,8 +262,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `Tagger`     | The modified `Tagger` object.                                              |
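A short round-trip sketch for the two serialization methods, assuming the `en_core_web_sm` model is installed and `/tmp/tagger` is a scratch path:

```python
import spacy

nlp = spacy.load("en_core_web_sm")        # assumes this model is installed
tagger = nlp.get_pipe("tagger")
tagger.to_disk("/tmp/tagger")             # str or Path-like; directory is created
tagger = tagger.from_disk("/tmp/tagger")  # modifies in place and returns the pipe
```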
@@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and
 | `vocab`             | `Vocab`                       | The shared vocabulary.                                                                                                                                 |
 | `model`             | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
 | `exclusive_classes` | bool                          | Make categories mutually exclusive. Defaults to `False`.                                                                                              |
-| `architecture`      | unicode                       | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`.                                                 |
+| `architecture`      | str                           | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`.                                                 |
 | **RETURNS**         | `TextCategorizer`             | The newly constructed object.                                                                                                                         |

 ### Architectures {#architectures new="2.1"}
@@ -248,8 +248,8 @@ Add a new label to the pipe.
 > ```

 | Name    | Type | Description       |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| ------- | ---- | ----------------- |
+| `label` | str  | The label to add. |

 ## TextCategorizer.to_disk {#to_disk tag="method"}
@@ -263,8 +263,8 @@ Serialize the pipe to disk.
 > ```

 | Name      | Type         | Description                                                                                                            |
-| --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | ------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## TextCategorizer.from_disk {#from_disk tag="method"}
@@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it.

 | Name        | Type              | Description                                                                 |
 | ----------- | ----------------- | --------------------------------------------------------------------------- |
-| `path`      | unicode / `Path`  | A path to a directory. Paths may be either strings or `Path`-like objects.  |
+| `path`      | str / `Path`      | A path to a directory. Paths may be either strings or `Path`-like objects.  |
 | `exclude`   | list              | String names of [serialization fields](#serialization-fields) to exclude.   |
 | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object.                                      |
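A minimal construction sketch showing the `exclusive_classes` and `architecture` settings from the table above; the labels are invented for illustration:

```python
import spacy

nlp = spacy.blank("en")
# "architecture" picks one of the models described under #architectures
textcat = nlp.create_pipe(
    "textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"}
)
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat)
```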
@@ -35,7 +35,7 @@ the
 > ```

 | Name             | Type        | Description                                                                           |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
+| ---------------- | ----------- | ------------------------------------------------------------------------------------- |
 | `vocab`          | `Vocab`     | A storage container for lexical types.                                                |
 | `rules`          | dict        | Exceptions and special-cases for the tokenizer.                                       |
 | `prefix_search`  | callable    | A function matching the signature of `re.compile(string).search` to match prefixes.   |
@@ -56,8 +56,8 @@ Tokenize a string.
 > ```

 | Name        | Type  | Description                             |
-| ----------- | ------- | --------------------------------------- |
-| `string`    | unicode | The string to tokenize.                 |
+| ----------- | ----- | --------------------------------------- |
+| `string`    | str   | The string to tokenize.                 |
 | **RETURNS** | `Doc` | A container for linguistic annotations. |

 ## Tokenizer.pipe {#pipe tag="method"}
@@ -83,8 +83,8 @@ Tokenize a stream of texts.
 Find internal split points of the string.

 | Name        | Type | Description                                                                                                                                        |
-| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `string`    | unicode | The string to split.                                                                                                                               |
+| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `string`    | str  | The string to split.                                                                                                                               |
 | **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |

 ## Tokenizer.find_prefix {#find_prefix tag="method"}
@@ -93,8 +93,8 @@ Find the length of a prefix that should be segmented from the string, or `None`
 if no prefix rules match.

 | Name        | Type | Description                                            |
-| ----------- | ------- | ------------------------------------------------------ |
-| `string`    | unicode | The string to segment.                                 |
+| ----------- | ---- | ------------------------------------------------------ |
+| `string`    | str  | The string to segment.                                 |
 | **RETURNS** | int  | The length of the prefix if present, otherwise `None`. |

 ## Tokenizer.find_suffix {#find_suffix tag="method"}
@@ -104,7 +104,7 @@ if no suffix rules match.

 | Name        | Type         | Description                                            |
 | ----------- | ------------ | ------------------------------------------------------ |
-| `string`    | unicode      | The string to segment.                                 |
+| `string`    | str          | The string to segment.                                 |
 | **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |

 ## Tokenizer.add_special_case {#add_special_case tag="method"}
@@ -125,7 +125,7 @@ and examples.

 | Name          | Type     | Description                                                                                                                                                              |
 | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `string`      | unicode  | The string to specially tokenize.                                                                                                                                        |
+| `string`      | str      | The string to specially tokenize.                                                                                                                                        |
 | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
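A minimal sketch of the special-case mechanism; `"gimme"` is an invented example:

```python
import spacy
from spacy.attrs import ORTH

nlp = spacy.blank("en")
# The ORTH values concatenated must exactly match the string "gimme"
nlp.tokenizer.add_special_case("gimme", [{ORTH: "gim"}, {ORTH: "me"}])
print([t.text for t in nlp("gimme that")])  # ['gim', 'me', 'that']
```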

 ## Tokenizer.explain {#explain tag="method"}
@@ -143,8 +143,8 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
 > ```

 | Name        | Type | Description                                         |
-| ------------| -------- | --------------------------------------------------- |
-| `string`    | unicode  | The string to tokenize with the debugging tokenizer |
+| ----------- | ---- | --------------------------------------------------- |
+| `string`    | str  | The string to tokenize with the debugging tokenizer |
 | **RETURNS** | list | A list of `(pattern_string, token_string)` tuples   |
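A quick sketch of the debugging output; the input string is arbitrary:

```python
import spacy

nlp = spacy.blank("en")
# Each tuple pairs the rule that fired with the token it produced
for pattern, token_text in nlp.tokenizer.explain("(don't)"):
    print(pattern, "->", token_text)
```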

 ## Tokenizer.to_disk {#to_disk tag="method"}
@@ -159,8 +159,8 @@ Serialize the tokenizer to disk.
 > ```

 | Name      | Type         | Description                                                                                                            |
-| --------- | ---------------- | ------------------------------------------------------------------------------------------------------------------- |
-| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| --------- | ------------ | ------------------------------------------------------------------------------------------------------------------- |
+| `path`    | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
 | `exclude` | list         | String names of [serialization fields](#serialization-fields) to exclude.                                             |

 ## Tokenizer.from_disk {#from_disk tag="method"}
@@ -175,8 +175,8 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
 > ```

 | Name        | Type         | Description                                                                |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path`      | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
 | `exclude`   | list         | String names of [serialization fields](#serialization-fields) to exclude.  |
 | **RETURNS** | `Tokenizer`  | The modified `Tokenizer` object.                                           |
@@ -218,12 +218,12 @@ it.
 ## Attributes {#attributes}

 | Name             | Type    | Description                                                                                                                    |
-| ---------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
+| ---------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------ |
 | `vocab`          | `Vocab` | The vocab object of the parent `Doc`.                                                                                          |
 | `prefix_search`  | -       | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`.                |
 | `suffix_search`  | -       | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`.                  |
 | `infix_finditer` | -       | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects.     |
-| `token_match`    | -       | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`.  |
 | `rules`          | dict    | A dictionary of tokenizer exceptions and special cases.                                                                        |
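A minimal sketch of constructing a `Tokenizer` with a custom callable from this table; the infix pattern is an invented example:

```python
import re
import spacy
from spacy.tokenizer import Tokenizer

nlp = spacy.blank("en")
infix_re = re.compile(r"[-~]")  # treat hyphens and tildes as infixes
tokenizer = Tokenizer(nlp.vocab, rules={}, infix_finditer=infix_re.finditer)
print([t.text for t in tokenizer("hard-working")])  # ['hard', '-', 'working']
```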

 ## Serialization fields {#serialization-fields}
@@ -33,8 +33,8 @@ class. The data will be loaded in via
 > ```

 | Name        | Type         | Description                                                                       |
-| ----------- | ---------------- | ----------------------------------------------------------------------------- |
-| `name`      | unicode / `Path` | Model to load, i.e. shortcut link, package name or path.                       |
+| ----------- | ------------ | ----------------------------------------------------------------------------- |
+| `name`      | str / `Path` | Model to load, i.e. shortcut link, package name or path.                       |
 | `disable`   | list         | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
 | **RETURNS** | `Language`   | A `Language` object with the loaded model.                                        |
@@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of

 | Name        | Type       | Description                                                                                      |
 | ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
-| `name`      | unicode    | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
+| `name`      | str        | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
 | `disable`   | list       | Names of pipeline components to [disable](/usage/processing-pipelines#disabling).                |
 | **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass.                                          |
@@ -99,8 +99,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
 > ```

 | Name       | Type | Description                                                   |
-| ---------- | ------- | ------------------------------------------------------------- |
-| `model`    | unicode | A model, i.e. shortcut link, package name or path (optional). |
+| ---------- | ---- | ------------------------------------------------------------- |
+| `model`    | str  | A model, i.e. shortcut link, package name or path (optional). |
 | `markdown` | bool | Print information as Markdown.                                |

 ### spacy.explain {#spacy.explain tag="function"}
@@ -123,9 +123,9 @@ list of available terms, see
 > ```

 | Name        | Type | Description                                              |
-| ----------- | ------- | -------------------------------------------------------- |
-| `term`      | unicode | Term to explain.                                         |
-| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. |
+| ----------- | ---- | -------------------------------------------------------- |
+| `term`      | str  | Term to explain.                                         |
+| **RETURNS** | str  | The explanation, or `None` if not found in the glossary. |
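A one-liner sketch of the lookup behavior described above:

```python
import spacy

print(spacy.explain("NORP"))  # 'Nationalities or religious or political groups'
print(spacy.explain("xyz"))   # None, term not in the glossary
```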

 ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
@@ -189,13 +189,13 @@ browser. Will run a simple web server.
 | Name      | Type                | Description                                                                                                                          | Default     |
 | --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
 | `docs`    | list, `Doc`, `Span` | Document(s) to visualize.                                                                                                            |
-| `style`   | unicode             | Visualization style, `'dep'` or `'ent'`.                                                                                             | `'dep'`     |
+| `style`   | str                 | Visualization style, `'dep'` or `'ent'`.                                                                                             | `'dep'`     |
 | `page`    | bool                | Render markup as full HTML page.                                                                                                     | `True`      |
 | `minify`  | bool                | Minify HTML markup.                                                                                                                  | `False`     |
 | `options` | dict                | [Visualizer-specific options](#displacy_options), e.g. colors.                                                                       | `{}`        |
 | `manual`  | bool                | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False`     |
 | `port`    | int                 | Port to serve visualization.                                                                                                         | `5000`      |
-| `host`    | unicode             | Host to serve visualization.                                                                                                         | `'0.0.0.0'` |
+| `host`    | str                 | Host to serve visualization.                                                                                                         | `'0.0.0.0'` |

 ### displacy.render {#displacy.render tag="method" new="2"}
@@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization.
 | Name        | Type                | Description                                                                                                                                               | Default |
 | ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
 | `docs`      | list, `Doc`, `Span` | Document(s) to visualize.                                                                                                                                 |
-| `style`     | unicode             | Visualization style, `'dep'` or `'ent'`.                                                                                                                  | `'dep'` |
+| `style`     | str                 | Visualization style, `'dep'` or `'ent'`.                                                                                                                  | `'dep'` |
 | `page`      | bool                | Render markup as full HTML page.                                                                                                                          | `False` |
 | `minify`    | bool                | Minify HTML markup.                                                                                                                                       | `False` |
 | `jupyter`   | bool                | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None`  |
 | `options`   | dict                | [Visualizer-specific options](#displacy_options), e.g. colors.                                                                                            | `{}`    |
 | `manual`    | bool                | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples.                      | `False` |
-| **RETURNS** | unicode             | Rendered HTML markup.                                                                                                                                     |
+| **RETURNS** | str                 | Rendered HTML markup.                                                                                                                                     |

 ### Visualizer options {#displacy_options}
@@ -237,15 +237,15 @@ If a setting is not present in the options, the default value will be used.
 > ```

 | Name                                       | Type | Description                                                                                                      | Default                 |
-| ------------------------------------------ | ------- | ---------------------------------------------------------------------------------------------------------------- | ----------------------- |
+| ------------------------------------------ | ---- | ---------------------------------------------------------------------------------------------------------------- | ----------------------- |
 | `fine_grained`                             | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`).               | `False`                 |
 | `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts.                                                        | `False`                 |
 | `collapse_punct`                           | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation.  | `True`                  |
 | `collapse_phrases`                         | bool | Merge noun phrases into one token.                                                                               | `False`                 |
 | `compact`                                  | bool | "Compact mode" with square arrows that takes up less space.                                                      | `False`                 |
-| `color`                                    | unicode | Text color (HEX, RGB or color names).                                                                            | `'#000000'`             |
-| `bg`                                       | unicode | Background color (HEX, RGB or color names).                                                                      | `'#ffffff'`             |
-| `font`                                     | unicode | Font name or font family for all text.                                                                           | `'Arial'`               |
+| `color`                                    | str  | Text color (HEX, RGB or color names).                                                                            | `'#000000'`             |
+| `bg`                                       | str  | Background color (HEX, RGB or color names).                                                                      | `'#ffffff'`             |
+| `font`                                     | str  | Font name or font family for all text.                                                                           | `'Arial'`               |
 | `offset_x`                                 | int  | Spacing on left side of the SVG in px.                                                                           | `50`                    |
 | `arrow_stroke`                             | int  | Width of arrow path in px.                                                                                       | `2`                     |
 | `arrow_width`                              | int  | Width of arrow head in px.                                                                                       | `10` / `8` (compact)    |
@@ -264,10 +264,10 @@ If a setting is not present in the options, the default value will be used.
 > ```

 | Name                                    | Type | Description                                                                                                                                 | Default                                                                                          |
-| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ |
+| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------ |
 | `ents`                                  | list | Entity types to highlight (`None` for all types).                                                                                           | `None`                                                                                           |
 | `colors`                                | dict | Color overrides. Entity types in uppercase should be mapped to color names or values.                                                       | `{}`                                                                                             |
-| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`.  | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
+| `template` <Tag variant="new">2.2</Tag> | str  | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`.  | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
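A minimal sketch of passing entity-visualizer options to `displacy.render`, assuming the `en_core_web_sm` model is installed; the color value is an arbitrary example:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")  # assumes this model is installed
doc = nlp("Google acquired a startup in London.")
colors = {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
options = {"ents": ["ORG", "GPE"], "colors": colors}
html = displacy.render(doc, style="ent", options=options, page=True)
```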

 By default, displaCy comes with colors for all
 [entity types supported by spaCy](/api/annotation#named-entities). If you're
@@ -309,8 +309,8 @@ Set custom path to the data directory where spaCy looks for models.
 > ```

 | Name   | Type         | Description                 |
-| ------ | ---------------- | --------------------------- |
-| `path` | unicode / `Path` | Path to new data directory. |
+| ------ | ------------ | --------------------------- |
+| `path` | str / `Path` | Path to new data directory. |

 ### util.get_lang_class {#util.get_lang_class tag="function"}
@@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.

 | Name        | Type       | Description                            |
 | ----------- | ---------- | -------------------------------------- |
-| `lang`      | unicode    | Two-letter language code, e.g. `'en'`. |
+| `lang`      | str        | Two-letter language code, e.g. `'en'`. |
 | **RETURNS** | `Language` | Language class.                        |

 ### util.set_lang_class {#util.set_lang_class tag="function"}
@@ -352,7 +352,7 @@ the two-letter language code.

 | Name   | Type       | Description                            |
 | ------ | ---------- | -------------------------------------- |
-| `name` | unicode    | Two-letter language code, e.g. `'en'`. |
+| `name` | str        | Two-letter language code, e.g. `'en'`. |
 | `cls`  | `Language` | The language class, e.g. `English`.    |

 ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
@@ -369,8 +369,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.
 > ```

 | Name        | Type | Description                            |
-| ----------- | ------- | -------------------------------------- |
-| `name`      | unicode | Two-letter language code, e.g. `'en'`. |
+| ----------- | ---- | -------------------------------------- |
+| `name`      | str  | Two-letter language code, e.g. `'en'`. |
 | **RETURNS** | bool | Whether the class has been loaded.     |
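A short sketch combining the two helpers above:

```python
from spacy import util

lang_cls = util.get_lang_class("en")    # Language subclass for 'en'
nlp = lang_cls()
assert util.lang_class_is_loaded("en")  # the class is now imported and registered
```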

 ### util.load_model {#util.load_model tag="function" new="2"}
@@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk).

 | Name          | Type       | Description                                              |
 | ------------- | ---------- | -------------------------------------------------------- |
-| `name`        | unicode    | Package name, shortcut link or model path.               |
+| `name`        | str        | Package name, shortcut link or model path.               |
 | `**overrides` | -          | Specific overrides, like pipeline components to disable. |
 | **RETURNS**   | `Language` | `Language` class with the loaded model.                  |
@@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet.

 | Name          | Type       | Description                                                                                          |
 | ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
-| `model_path`  | unicode    | Path to model data directory.                                                                        |
+| `model_path`  | str        | Path to model data directory.                                                                        |
 | `meta`        | dict       | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
 | `**overrides` | -          | Specific overrides, like pipeline components to disable.                                             |
 | **RETURNS**   | `Language` | `Language` class with the loaded model.                                                              |
@@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's

 | Name          | Type       | Description                                              |
 | ------------- | ---------- | -------------------------------------------------------- |
-| `init_file`   | unicode    | Path to model's `__init__.py`, i.e. `__file__`.          |
+| `init_file`   | str        | Path to model's `__init__.py`, i.e. `__file__`.          |
 | `**overrides` | -          | Specific overrides, like pipeline components to disable. |
 | **RETURNS**   | `Language` | `Language` class with the loaded model.                  |
@@ -447,8 +447,8 @@ Get a model's meta.json from a directory path and validate its contents.
 > ```

 | Name        | Type         | Description              |
-| ----------- | ---------------- | ------------------------ |
-| `path`      | unicode / `Path` | Path to model directory. |
+| ----------- | ------------ | ------------------------ |
+| `path`      | str / `Path` | Path to model directory. |
 | **RETURNS** | dict         | The model's meta data.   |

 ### util.is_package {#util.is_package tag="function"}
@@ -464,8 +464,8 @@ Check if string maps to a package installed via pip. Mainly used to validate
 > ```

 | Name        | Type   | Description                                  |
-| ----------- | ------- | -------------------------------------------- |
-| `name`      | unicode | Name of package.                             |
+| ----------- | ------ | -------------------------------------------- |
+| `name`      | str    | Name of package.                             |
 | **RETURNS** | `bool` | `True` if installed package, `False` if not. |

 ### util.get_package_path {#util.get_package_path tag="function" new="2"}
@@ -481,8 +481,8 @@ Get path to an installed package. Mainly used to resolve the location of
 > ```

 | Name           | Type   | Description                      |
-| -------------- | ------- | -------------------------------- |
-| `package_name` | unicode | Name of installed package.       |
+| -------------- | ------ | -------------------------------- |
+| `package_name` | str    | Name of installed package.       |
 | **RETURNS**    | `Path` | Path to model package directory. |

 ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
@@ -35,7 +35,7 @@ you can add vectors to later.
 | `data`      | `ndarray[ndim=1, dtype='float32']` | The vector data.                                                                                                                                                    |
 | `keys`      | iterable                           | A sequence of keys aligned with the data.                                                                                                                           |
 | `shape`     | tuple                              | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
-| `name`      | unicode                            | A name to identify the vectors table.                                                                                                                              |
+| `name`      | str                                | A name to identify the vectors table.                                                                                                                              |
 | **RETURNS** | `Vectors`                          | The newly created object.                                                                                                                                          |

 ## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
			@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the
 | 
			
		|||
 | 
			
		||||
| Name        | Type                               | Description                                           |
 | 
			
		||||
| ----------- | ---------------------------------- | ----------------------------------------------------- |
 | 
			
		||||
| `key`       | unicode / int                      | The key to add.                                       |
 | 
			
		||||
| `key`       | str / int                          | The key to add.                                       |
 | 
			
		||||
| `vector`    | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key.                |
 | 
			
		||||
| `row`       | int                                | An optional row number of a vector to map the key to. |
 | 
			
		||||
| **RETURNS** | int                                | The row the vector was added to.                      |
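
A sketch of adding a key with an explicit vector:

```python
import numpy
from spacy.vectors import Vectors

vectors = Vectors(shape=(2, 2))
vector = numpy.asarray([1.0, 0.0], dtype="float32")
row = vectors.add("cat", vector=vector)  # returns the row the vector was stored in
```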

@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa.

| Name        | Type                                  | Description                                                              |
| ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
| `key`       | unicode / int                         | Find the row that the given key points to. Returns int, `-1` if missing. |
| `key`       | str / int                             | Find the row that the given key points to. Returns int, `-1` if missing. |
| `keys`      | iterable                              | Find rows that the keys point to. Returns `ndarray`.                     |
| `row`       | int                                   | Find the first key that points to the row. Returns int.                  |
| `rows`      | iterable                              | Find the keys that point to the rows. Returns ndarray.                   |
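
The arguments are keyword-only; a minimal sketch:

```python
import numpy
from spacy.vectors import Vectors

vectors = Vectors(shape=(2, 2))
vectors.add("cat", vector=numpy.asarray([1.0, 0.0], dtype="float32"))
row = vectors.find(key="cat")      # int, -1 if the key is missing
rows = vectors.find(keys=["cat"])  # ndarray of row indices
```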

@ -338,8 +338,8 @@ Save the current state to a directory.

> ```

| Name   | Type             | Description                                                                                                            |
| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |
| ------ | ------------     | ---------------------------------------------------------------------------------------------------------------------- |
| `path` | str / `Path`     | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |

## Vectors.from_disk {#from_disk tag="method"}

@ -353,8 +353,8 @@ Loads state from a directory. Modifies the object in place and returns it.

> ```

| Name        | Type             | Description                                                                 |
| ----------- | ---------------- | --------------------------------------------------------------------------- |
| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.  |
| ----------- | ------------     | --------------------------------------------------------------------------- |
| `path`      | str / `Path`     | A path to a directory. Paths may be either strings or `Path`-like objects.  |
| **RETURNS** | `Vectors`        | The modified `Vectors` object.                                              |
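
A round-trip sketch (the target directory is illustrative):

```python
from spacy.vectors import Vectors

vectors = Vectors(shape=(2, 2))
vectors.to_disk("/tmp/vectors")  # creates the directory if needed
vectors = Vectors(shape=(2, 2)).from_disk("/tmp/vectors")  # modifies in place, returns self
```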

## Vectors.to_bytes {#to_bytes tag="method"}

@ -27,7 +27,7 @@ Create the vocabulary.

| `tag_map`                                   | dict                 | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer`                                | object               | A lemmatizer. Defaults to `None`.                                                                                  |
| `strings`                                   | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings.        |
| `vectors_name` <Tag variant="new">2.2</Tag> | unicode              | A name to identify the vectors table.                                                                              |
| `vectors_name` <Tag variant="new">2.2</Tag> | str                  | A name to identify the vectors table.                                                                              |
| **RETURNS**                                 | `Vocab`              | The newly constructed object.                                                                                      |
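
A minimal constructor sketch:

```python
from spacy.vocab import Vocab

# Seed the string store with a few strings; other arguments use their defaults
vocab = Vocab(strings=["apple", "orange"])
```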

## Vocab.\_\_len\_\_ {#len tag="method"}

@ -92,8 +92,8 @@ given string, you need to look it up in

> ```

| Name        | Type    | Description                                        |
| ----------- | ------- | -------------------------------------------------- |
| `string`    | unicode | The ID string.                                     |
| ----------- | ----    | -------------------------------------------------- |
| `string`    | str     | The ID string.                                     |
| **RETURNS** | bool    | Whether the string has an entry in the vocabulary. |
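
For example (looking a string up via `vocab[...]` creates an entry, so the first check succeeds):

```python
from spacy.vocab import Vocab

vocab = Vocab()
lexeme = vocab["apple"]   # creates an entry for the string
print("apple" in vocab)   # True
print("banana" in vocab)  # False
```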

## Vocab.add_flag {#add_flag tag="method"}

@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.

| Name          | Type | Description                                                                                                                                     |
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value.                                                                                         |
| `flag_getter` | dict | A function `f(str) -> bool`, to get the flag value.                                                                                             |
| `flag_id`     | int  | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
| **RETURNS**   | int  | The integer ID by which the flag value can be checked.                                                                                          |
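
A sketch of registering and checking a custom flag (the product list is illustrative):

```python
import spacy

nlp = spacy.load("en_core_web_sm")
# Flag tokens whose text is in a small set of product names
IS_PRODUCT = nlp.vocab.add_flag(lambda text: text in ["spaCy", "Thinc"])
doc = nlp("I like spaCy")
assert doc[2].check_flag(IS_PRODUCT)
```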

@ -228,8 +228,8 @@ Save the current state to a directory.

> ```

| Name      | Type             | Description                                                                                                            |
| --------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------- |
| `path`    | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |
| --------- | ------------     | ---------------------------------------------------------------------------------------------------------------------- |
| `path`    | str / `Path`     | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects.  |
| `exclude` | list             | String names of [serialization fields](#serialization-fields) to exclude.                                              |

## Vocab.from_disk {#from_disk tag="method" new="2"}

@ -244,8 +244,8 @@ Loads state from a directory. Modifies the object in place and returns it.

> ```

| Name        | Type             | Description                                                                 |
| ----------- | ---------------- | --------------------------------------------------------------------------- |
| `path`      | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects.  |
| ----------- | ------------     | --------------------------------------------------------------------------- |
| `path`      | str / `Path`     | A path to a directory. Paths may be either strings or `Path`-like objects.  |
| `exclude`   | list             | String names of [serialization fields](#serialization-fields) to exclude.   |
| **RETURNS** | `Vocab`          | The modified `Vocab` object.                                                |
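
A round-trip sketch (directory path is illustrative):

```python
from spacy.vocab import Vocab

vocab = Vocab()
vocab.to_disk("/tmp/vocab")              # creates the directory if needed
vocab = Vocab().from_disk("/tmp/vocab")  # modifies in place, returns self
```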

@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..

### Disabling the parser {#disabling}

In the [default models](/models), the parser is loaded and enabled as part of
the [standard processing pipeline](/usage/processing-pipelines). If you don't need
any of the syntactic information, you should disable the parser. Disabling the
parser will make spaCy load and run much faster. If you want to load the parser,
but need to disable it for specific documents, you can also control its use on
the `nlp` object.
the [standard processing pipeline](/usage/processing-pipelines). If you don't
need any of the syntactic information, you should disable the parser. Disabling
the parser will make spaCy load and run much faster. If you want to load the
parser, but need to disable it for specific documents, you can also control its
use on the `nlp` object.

```python
nlp = spacy.load("en_core_web_sm", disable=["parser"])
```

@ -989,8 +989,8 @@ nlp.tokenizer = my_tokenizer

```

| Argument    | Type    | Description               |
| ----------- | ------- | ------------------------- |
| `text`      | unicode | The raw text to tokenize. |
| ----------- | -----   | ------------------------- |
| `text`      | str     | The raw text to tokenize. |
| **RETURNS** | `Doc`   | The tokenized document.   |
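
A sketch of a custom tokenizer satisfying this contract (whitespace splitting only, purely illustrative):

```python
import spacy
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        return Doc(self.vocab, words=words)  # returns the tokenized document

nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What a nice day")
```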
<Infobox title="Important note: using a custom tokenizer" variant="warning">

@ -272,16 +272,16 @@ doc = nlp("I won't have named entities")

disabled.restore()
```

If you want to disable all pipes except for one or a few, you can use the `enable`
keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string
defining just one pipe.
If you want to disable all pipes except for one or a few, you can use the
`enable` keyword. Just like the `disable` keyword, it takes a list of pipe
names, or a string defining just one pipe.

```python
# Enable only the parser
with nlp.select_pipes(enable="parser"):
    doc = nlp("I will only be parsed")
```

Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the

@ -350,11 +350,11 @@ last** in the pipeline, or define a **custom name**. If no name is set and no

> ```

| Argument | Type    | Description                                                              |
| -------- | ------- | ------------------------------------------------------------------------ |
| -------- | ----    | ------------------------------------------------------------------------ |
| `last`   | bool    | If set to `True`, component is added **last** in the pipeline (default). |
| `first`  | bool    | If set to `True`, component is added **first** in the pipeline.          |
| `before` | unicode | String name of component to add the new component **before**.            |
| `after`  | unicode | String name of component to add the new component **after**.             |
| `before` | str     | String name of component to add the new component **before**.            |
| `after`  | str     | String name of component to add the new component **after**.             |
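
For illustration, a sketch using the v2-style `add_pipe` call (assumes the loaded pipeline contains a "parser" component):

```python
import spacy

nlp = spacy.load("en_core_web_sm")

def print_length(doc):
    print("Doc length:", len(doc))
    return doc

# Insert the custom component immediately before the parser
nlp.add_pipe(print_length, name="print_length", before="parser")
```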

### Example: A simple pipeline component {#custom-components-simple}

@ -158,17 +158,17 @@ The available token pattern keys correspond to a number of

rule-based matching are:

| Attribute                              | Type    | Description                                                                                             |
| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
| `ORTH`                                 | unicode | The exact verbatim text of a token.                                                                     |
| `TEXT` <Tag variant="new">2.1</Tag>    | unicode | The exact verbatim text of a token.                                                                     |
| `LOWER`                                | unicode | The lowercase form of the token text.                                                                   |
| -------------------------------------- | ----    | ------------------------------------------------------------------------------------------------------ |
| `ORTH`                                 | str     | The exact verbatim text of a token.                                                                     |
| `TEXT` <Tag variant="new">2.1</Tag>    | str     | The exact verbatim text of a token.                                                                     |
| `LOWER`                                | str     | The lowercase form of the token text.                                                                   |
|  `LENGTH`                              | int     | The length of the token text.                                                                           |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT`    | bool    | Token text consists of alphabetic characters, ASCII characters, digits.                                 |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE`    | bool    | Token text is in lowercase, uppercase, titlecase.                                                       |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP`     | bool    | Token is punctuation, whitespace, stop word.                                                            |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL`  | bool    | Token text resembles a number, URL, email.                                                              |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape.                     |
| `ENT_TYPE`                             | unicode | The token's entity label.                                                                               |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str     | The token's simple and extended part-of-speech tag, dependency label, lemma, shape.                     |
| `ENT_TYPE`                             | str     | The token's entity label.                                                                               |
| `_` <Tag variant="new">2.1</Tag>       | dict    | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes).  |
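
A sketch of a pattern using a few of these keys (the v3-style `Matcher.add` signature is assumed):

```python
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
# "hello" in any casing, then punctuation, then "world" in any casing
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])
matches = matcher(nlp("Hello, world!"))
```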

<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">

@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!

### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}

When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
extract matches based on the pattern's POS signature.
When using a large amount of **phrase patterns** (roughly > 10000) it's useful
to understand how the `add_patterns` function of the EntityRuler works. For each
**phrase pattern**, the EntityRuler calls the nlp object to construct a doc
object. This happens in case you try to add the EntityRuler at the end of an
existing pipeline with, for example, a POS tagger and want to extract matches
based on the pattern's POS signature.

In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
In this case you would pass a config value of `phrase_matcher_attr="POS"` for
the EntityRuler.

Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
Running the full language pipeline across every pattern in a large list scales
linearly and can therefore take a long time on large amounts of phrase patterns.

As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with
5,000-100,000 phrase patterns respectively.

Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
Even with this speedup (but especially if you're using an older version) the
`add_patterns` function can still take a long time.

An easy workaround to make this function run faster is disabling the other language pipes
while adding the phrase patterns.
An easy workaround to make this function run faster is disabling the other
language pipes while adding the phrase patterns.

```python
entityruler = EntityRuler(nlp)
```
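
A fuller sketch of this workaround (the pattern contents and pipe selection are illustrative, not the exact upstream example):

```python
import spacy
from spacy.pipeline import EntityRuler

nlp = spacy.load("en_core_web_sm")
entityruler = EntityRuler(nlp)
patterns = [{"label": "TEST", "pattern": str(i)} for i in range(10000)]

# Disable every other pipe while the phrase patterns are added
with nlp.select_pipes(disable=nlp.pipe_names):
    entityruler.add_patterns(patterns)
```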

@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab))

If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
well, which includes the values of
[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if
they're serializable with msgpack).
[extension attributes](/usage/processing-pipelines#custom-components-attributes)
(if they're serializable with msgpack).
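
A round-trip sketch with `DocBin`:

```python
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
doc_bin = DocBin(store_user_data=True)  # also serializes Doc.user_data
doc_bin.add(nlp("Hello world"))
data = doc_bin.to_bytes()

doc_bin = DocBin(store_user_data=True).from_bytes(data)
docs = list(doc_bin.get_docs(nlp.vocab))
```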

<Infobox title="Important note on serializing extension attributes" variant="warning">

@ -667,8 +667,8 @@ define the language data to be loaded and the

[processing pipeline](/usage/processing-pipelines) to execute.

| Setting    | Type    | Description                                                                                                                                                           |
| ---------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang`     | unicode | ID of the language class to initialize.                                                                                                                               |
| ---------- | ----    | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang`     | str     | ID of the language class to initialize.                                                                                                                               |
| `pipeline` | list    | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used.  |

The `load()` method that comes with our model package templates will take care
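
These settings live in the model's `meta.json`; a sketch of the relevant keys, shown as the equivalent Python dict with illustrative values:

```python
meta = {
    "lang": "en",                             # ID of the language class
    "pipeline": ["tagger", "parser", "ner"],  # factory IDs, applied in order
}
```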

@ -68,11 +68,11 @@ arcs.

</Infobox>

| Argument  | Type    | Description                                                 | Default     |
| --------- | ------- | ----------------------------------------------------------- | ----------- |
| --------- | ----    | ----------------------------------------------------------- | ----------- |
| `compact` | bool    | "Compact mode" with square arrows that takes up less space. | `False`     |
| `color`   | unicode | Text color (HEX, RGB or color names).                       | `"#000000"` |
| `bg`      | unicode | Background color (HEX, RGB or color names).                 | `"#ffffff"` |
| `font`    | unicode | Font name or font family for all text.                      | `"Arial"`   |
| `color`   | str     | Text color (HEX, RGB or color names).                       | `"#000000"` |
| `bg`      | str     | Background color (HEX, RGB or color names).                 | `"#ffffff"` |
| `font`    | str     | Font name or font family for all text.                      | `"Arial"`   |

For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).
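
For example, passing these options to `displacy.render`:

```python
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("This is a sentence.")
options = {"compact": True, "color": "#000000", "bg": "#ffffff", "font": "Arial"}
html = displacy.render(doc, style="dep", options=options)
```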