diff --git a/Makefile b/Makefile
index cf96d6294..9916e3cf5 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
version := $(shell "bin/get-version.sh")
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
- $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
+ $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
chmod a+rx $@
dist/pytest.pex : wheelhouse/pytest-*.whl
@@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
$(VENV)/bin/pip wheel . -w ./wheelhouse
- $(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse
+ $(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
touch $@
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
diff --git a/examples/experiments/onto-joint/defaults.cfg b/examples/experiments/onto-joint/defaults.cfg
new file mode 100644
index 000000000..fbac4ea7d
--- /dev/null
+++ b/examples/experiments/onto-joint/defaults.cfg
@@ -0,0 +1,115 @@
+# Training hyper-parameters and additional features.
+[training]
+# Whether to train on sequences with 'gold standard' sentence boundaries
+# and tokens. If you set this to true, take care to ensure your run-time
+# data is passed in sentence-by-sentence via some prior preprocessing.
+gold_preproc = false
+# Limitations on training document length or number of examples.
+max_length = 0
+limit = 0
+# Data augmentation
+orth_variant_level = 0.0
+dropout = 0.1
+# Controls early-stopping. 0 or -1 mean unlimited.
+patience = 1600
+max_epochs = 0
+max_steps = 20000
+eval_frequency = 400
+# Other settings
+seed = 0
+accumulate_gradient = 1
+use_pytorch_for_gpu_memory = false
+# Control how scores are printed and checkpoints are evaluated.
+scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
+score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
+# These settings are invalid for the transformer models.
+init_tok2vec = null
+vectors = null
+
+[training.batch_size]
+@schedules = "compounding.v1"
+start = 1000
+stop = 1000
+compound = 1.001
+
+[optimizer]
+@optimizers = "Adam.v1"
+beta1 = 0.9
+beta2 = 0.999
+L2_is_weight_decay = true
+L2 = 0.01
+grad_clip = 1.0
+use_averages = true
+eps = 1e-8
+learn_rate = 0.001
+
+#[optimizer.learn_rate]
+#@schedules = "warmup_linear.v1"
+#warmup_steps = 250
+#total_steps = 20000
+#initial_rate = 0.001
+
+[nlp]
+lang = "en"
+vectors = ${training:vectors}
+
+[nlp.pipeline.tok2vec]
+factory = "tok2vec"
+
+[nlp.pipeline.senter]
+factory = "senter"
+
+[nlp.pipeline.ner]
+factory = "ner"
+
+[nlp.pipeline.tagger]
+factory = "tagger"
+
+[nlp.pipeline.parser]
+factory = "parser"
+
+[nlp.pipeline.senter.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.senter.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tagger.model]
+@architectures = "spacy.Tagger.v1"
+
+[nlp.pipeline.tagger.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.parser.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 8
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.parser.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.ner.model]
+@architectures = "spacy.TransitionBasedParser.v1"
+nr_feature_tokens = 3
+hidden_width = 128
+maxout_pieces = 3
+use_upper = false
+
+[nlp.pipeline.ner.model.tok2vec]
+@architectures = "spacy.Tok2VecTensors.v1"
+width = ${nlp.pipeline.tok2vec.model:width}
+
+[nlp.pipeline.tok2vec.model]
+@architectures = "spacy.HashEmbedCNN.v1"
+pretrained_vectors = ${nlp:vectors}
+width = 256
+depth = 6
+window_size = 1
+embed_size = 10000
+maxout_pieces = 3
+subword_features = true
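For reference, the config above can be loaded and inspected the same way `train_from_config` does later in this diff (`util.load_config` with `create_objects=False`). A minimal sketch, assuming the file is saved at the path shown in the header:

```python
from spacy import util

# Load the raw config as nested dicts; create_objects=False leaves the
# registered functions (the @optimizers / @architectures / @schedules blocks)
# unresolved so the values can simply be inspected.
config = util.load_config(
    "examples/experiments/onto-joint/defaults.cfg", create_objects=False
)
print(config["training"]["dropout"])           # 0.1
print(list(config["nlp"]["pipeline"].keys()))  # e.g. ['tok2vec', 'senter', 'ner', 'tagger', 'parser']
```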
diff --git a/requirements.txt b/requirements.txt
index e5f1ae10b..a104b68ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,9 +13,11 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0
-# Optional dependencies
-jsonschema>=2.6.0,<3.1.0
pydantic>=1.3.0,<2.0.0
+# Official Python utilities
+setuptools
+packaging
+importlib_metadata>=0.20; python_version < "3.8"
# Development dependencies
cython>=0.25
pytest>=4.6.5
diff --git a/setup.cfg b/setup.cfg
index f0895bbbb..c19b8d857 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -47,15 +47,17 @@ install_requires =
wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0
- ml_datasets
+ ml_datasets>=0.1.1
# Third-party dependencies
tqdm>=4.38.0,<5.0.0
- setuptools
numpy>=1.15.0
plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0
- tqdm>=4.38.0,<5.0.0
+ # Official Python utilities
+ setuptools
+ packaging
+ importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require]
lookups =
diff --git a/spacy/about.py b/spacy/about.py
index 3af1b77a0..04a660ad1 100644
--- a/spacy/about.py
+++ b/spacy/about.py
@@ -1,6 +1,6 @@
# fmt: off
__title__ = "spacy"
-__version__ = "3.0.0.dev8"
+__version__ = "3.0.0.dev9"
__release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
diff --git a/spacy/cli/converters/conllu2json.py b/spacy/cli/converters/conllu2json.py
index 0b2920802..1ece755b8 100644
--- a/spacy/cli/converters/conllu2json.py
+++ b/spacy/cli/converters/conllu2json.py
@@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched.
- lines (unicode): CONLL-U lines for one sentences
- tag_pattern (unicode): Regex pattern for entity tag
+ lines (str): CONLL-U lines for one sentence
+ tag_pattern (str): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags
"""
@@ -187,8 +187,8 @@ def example_from_conllu_sentence(
"""Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required.
- lines (unicode): The non-comment lines for a CoNLL-U sentence
- ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
+ lines (str): The non-comment lines for a CoNLL-U sentence
+ ner_tag_pattern (str): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation
"""
# create a Doc with each subtoken as its own token
diff --git a/spacy/cli/download.py b/spacy/cli/download.py
index 0230e272d..3d56822a5 100644
--- a/spacy/cli/download.py
+++ b/spacy/cli/download.py
@@ -5,6 +5,7 @@ import sys
from wasabi import msg
from .. import about
+from ..util import is_package, get_base_version
def download(
@@ -17,7 +18,7 @@ def download(
flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped.
"""
- if not require_package("spacy") and "--no-deps" not in pip_args:
+ if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn(
"Skipping model package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed "
@@ -45,21 +46,6 @@ def download(
"Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')",
)
- # If a model is downloaded and then loaded within the same process, our
- # is_package check currently fails, because pkg_resources.working_set
- # is not refreshed automatically (see #3923). We're trying to work
- # around this here be requiring the package explicitly.
- require_package(model_name)
-
-
-def require_package(name):
- try:
- import pkg_resources
-
- pkg_resources.working_set.require(name)
- return True
- except: # noqa: E722
- return False
def get_json(url, desc):
@@ -77,8 +63,7 @@ def get_json(url, desc):
def get_compatibility():
- version = about.__version__
- version = version.rsplit(".dev", 1)[0]
+ version = get_base_version(about.__version__)
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"]
if version not in comp:
@@ -87,7 +72,7 @@ def get_compatibility():
def get_version(model, comp):
- model = model.rsplit(".dev", 1)[0]
+ model = get_base_version(model)
if model not in comp:
msg.fail(
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
diff --git a/spacy/cli/info.py b/spacy/cli/info.py
index 23f766368..98fd5cabf 100644
--- a/spacy/cli/info.py
+++ b/spacy/cli/info.py
@@ -48,7 +48,9 @@ def info(
"Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(),
"Python version": platform.python_version(),
- "Models": ", ".join(model["name"] for model in all_models.values()),
+ "Models": ", ".join(
+ f"{m['name']} ({m['version']})" for m in all_models.values()
+ ),
}
if not silent:
title = "Info about spaCy"
@@ -63,7 +65,7 @@ def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs.
- title (unicode or None): Title, will be rendered as headline 2.
+ title (str / None): Title, will be rendered as headline 2.
"""
markdown = []
for key, value in data.items():
diff --git a/spacy/cli/package.py b/spacy/cli/package.py
index 8e27e44d0..153e61ba3 100644
--- a/spacy/cli/package.py
+++ b/spacy/cli/package.py
@@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")),
- ("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)),
("url", "Author website", meta.get("url", False)),
- ("license", "License", meta.get("license", "CC BY-SA 3.0")),
+ ("license", "License", meta.get("license", "MIT")),
]
nlp = util.load_model_from_path(Path(model_path))
+ meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["pipeline"] = nlp.pipe_names
meta["vectors"] = {
"width": nlp.vocab.vectors_length,
@@ -168,6 +168,7 @@ def setup_package():
package_data={model_name: list_files(model_dir)},
install_requires=list_requirements(meta),
zip_safe=False,
+ entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
)
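The new `spacy_models` entry point makes installed model packages discoverable without `pkg_resources`, which the rest of this diff removes. A rough sketch of how such packages could be enumerated with the `importlib_metadata` backport added to the requirements above; this only illustrates the mechanism and is not the helper spaCy itself ships (`util.get_installed_models` is not part of this section):

```python
try:
    import importlib.metadata as importlib_metadata  # Python 3.8+
except ImportError:
    import importlib_metadata  # backport, importlib_metadata>=0.20

def installed_spacy_models():
    # Packages built by `spacy package` now register themselves under the
    # "spacy_models" entry point group, e.g. "en_core_web_sm = en_core_web_sm".
    entry_points = importlib_metadata.entry_points()
    if hasattr(entry_points, "select"):  # newer selectable API
        group = entry_points.select(group="spacy_models")
    else:  # older dict-style API
        group = entry_points.get("spacy_models", [])
    return [ep.name for ep in group]

print(installed_spacy_models())
```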
diff --git a/spacy/cli/train.py b/spacy/cli/train.py
index d4010c43b..cbe977cad 100644
--- a/spacy/cli/train.py
+++ b/spacy/cli/train.py
@@ -483,7 +483,6 @@ def train(
# Update model meta.json
meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names
- meta["spacy_version"] = f">={about.__version__}"
if beam_width == 1:
meta["speed"] = {
"nwords": nwords,
diff --git a/spacy/cli/train_from_config.py b/spacy/cli/train_from_config.py
index c75c861cc..c0e3bd169 100644
--- a/spacy/cli/train_from_config.py
+++ b/spacy/cli/train_from_config.py
@@ -7,7 +7,7 @@ from pathlib import Path
from wasabi import msg
import thinc
import thinc.schedules
-from thinc.api import Model
+from thinc.api import Model, use_pytorch_for_gpu_memory
import random
from ..gold import GoldCorpus
@@ -171,6 +171,8 @@ def train_from_config(
msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"])
+ if config["training"]["use_pytorch_for_gpu_memory"]:
+ use_pytorch_for_gpu_memory()
nlp_config = config["nlp"]
config = util.load_config(config_path, create_objects=True)
msg.info("Creating nlp from config")
@@ -213,6 +215,12 @@ def train_from_config(
if is_best_checkpoint and output_path is not None:
nlp.to_disk(output_path)
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
+ # Clean up the objects to facilitate garbage collection.
+ for eg in batch:
+ eg.doc = None
+ eg.goldparse = None
+ eg.doc_annotation = None
+ eg.token_annotation = None
finally:
if output_path is not None:
final_model_path = output_path / "model-final"
diff --git a/spacy/cli/validate.py b/spacy/cli/validate.py
index a23ce3453..080cd77e2 100644
--- a/spacy/cli/validate.py
+++ b/spacy/cli/validate.py
@@ -4,6 +4,8 @@ import requests
from wasabi import msg
from .. import about
+from ..util import get_package_version, get_installed_models, get_base_version
+from ..util import get_package_path, get_model_meta, is_compatible_version
def validate():
@@ -12,7 +14,7 @@ def validate():
with the installed models. Should be run after `pip install -U spacy`.
"""
model_pkgs, compat = get_model_pkgs()
- spacy_version = about.__version__.rsplit(".dev", 1)[0]
+ spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {})
if not current_compat:
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
@@ -25,7 +27,7 @@ def validate():
msg.info(f"spaCy installation: {spacy_dir}")
if model_pkgs:
- header = ("NAME", "VERSION", "")
+ header = ("NAME", "SPACY", "VERSION", "")
rows = []
for name, data in model_pkgs.items():
if data["compat"]:
@@ -34,7 +36,7 @@ def validate():
else:
version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
- rows.append((data["name"], version, comp))
+ rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header)
else:
msg.text("No models found in your current environment.", exits=0)
@@ -44,8 +46,9 @@ def validate():
cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models:
- msg.warn(
- f"The following models are not available for spaCy v{about.__version__}:",
+ msg.info(
+ f"The following models are custom spaCy models or not "
+ f"available for spaCy v{about.__version__}:",
", ".join(na_models),
)
if incompat_models:
@@ -53,8 +56,6 @@ def validate():
def get_model_pkgs():
- import pkg_resources
-
with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__)
if r.status_code != 200:
@@ -66,20 +67,29 @@ def get_model_pkgs():
msg.good("Loaded compatibility table")
compat = r.json()["spacy"]
all_models = set()
+ installed_models = get_installed_models()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
pkgs = {}
- for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
+ for pkg_name in installed_models:
package = pkg_name.replace("-", "_")
- if package in all_models:
- version = pkg_data.version
- pkgs[pkg_name] = {
- "name": package,
- "version": version,
- "compat": package in compat and version in compat[package],
- }
+ version = get_package_version(pkg_name)
+ if package in compat:
+ is_compat = version in compat[package]
+ spacy_version = about.__version__
+ else:
+ model_path = get_package_path(package)
+ model_meta = get_model_meta(model_path)
+ spacy_version = model_meta.get("spacy_version", "n/a")
+ is_compat = is_compatible_version(about.__version__, spacy_version)
+ pkgs[pkg_name] = {
+ "name": package,
+ "version": version,
+ "spacy": spacy_version,
+ "compat": is_compat,
+ }
return pkgs, compat
diff --git a/spacy/displacy/__init__.py b/spacy/displacy/__init__.py
index 3f84dabce..2c377a043 100644
--- a/spacy/displacy/__init__.py
+++ b/spacy/displacy/__init__.py
@@ -22,13 +22,13 @@ def render(
"""Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
- style (unicode): Visualisation style, 'dep' or 'ent'.
+ style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
- RETURNS (unicode): Rendered HTML markup.
+ RETURNS (str): Rendered HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers
@@ -73,13 +73,13 @@ def serve(
"""Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise.
- style (unicode): Visualisation style, 'dep' or 'ent'.
+ style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation.
- host (unicode): Host to serve visualisation.
+ host (str): Host to serve visualisation.
DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers
diff --git a/spacy/displacy/render.py b/spacy/displacy/render.py
index 0d4cdb77f..ef8632cbc 100644
--- a/spacy/displacy/render.py
+++ b/spacy/displacy/render.py
@@ -47,7 +47,7 @@ class DependencyRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (unicode): Rendered SVG or HTML markup.
+ RETURNS (str): Rendered SVG or HTML markup.
"""
# Create a random ID prefix to make sure parses don't receive the
# same ID, even if they're identical
@@ -78,7 +78,7 @@ class DependencyRenderer(object):
render_id (int): Unique ID, typically index of document.
words (list): Individual words and their tags.
arcs (list): Individual arcs and their start, end, direction and label.
- RETURNS (unicode): Rendered SVG markup.
+ RETURNS (str): Rendered SVG markup.
"""
self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels)
@@ -112,10 +112,10 @@ class DependencyRenderer(object):
):
"""Render individual word.
- text (unicode): Word text.
- tag (unicode): Part-of-speech tag.
+ text (str): Word text.
+ tag (str): Part-of-speech tag.
i (int): Unique ID, typically word index.
- RETURNS (unicode): Rendered SVG markup.
+ RETURNS (str): Rendered SVG markup.
"""
y = self.offset_y + self.word_spacing
x = self.offset_x + i * self.distance
@@ -131,12 +131,12 @@ class DependencyRenderer(object):
def render_arrow(self, label, start, end, direction, i):
"""Render individual arrow.
- label (unicode): Dependency label.
+ label (str): Dependency label.
start (int): Index of start word.
end (int): Index of end word.
- direction (unicode): Arrow direction, 'left' or 'right'.
+ direction (str): Arrow direction, 'left' or 'right'.
i (int): Unique ID, typically arrow index.
- RETURNS (unicode): Rendered SVG markup.
+ RETURNS (str): Rendered SVG markup.
"""
if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction)
@@ -179,7 +179,7 @@ class DependencyRenderer(object):
y (int): Y-coordinate of arrow start and end point.
y_curve (int): Y-corrdinate of Cubic Bézier y_curve point.
x_end (int): X-coordinate of arrow end point.
- RETURNS (unicode): Definition of the arc path ('d' attribute).
+ RETURNS (str): Definition of the arc path ('d' attribute).
"""
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
if self.compact:
@@ -189,11 +189,11 @@ class DependencyRenderer(object):
def get_arrowhead(self, direction, x, y, end):
"""Render individual arrow head.
- direction (unicode): Arrow direction, 'left' or 'right'.
+ direction (str): Arrow direction, 'left' or 'right'.
x (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point.
end (int): X-coordinate of arrow end point.
- RETURNS (unicode): Definition of the arrow head path ('d' attribute).
+ RETURNS (str): Definition of the arrow head path ('d' attribute).
"""
if direction == "left":
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@@ -279,7 +279,7 @@ class EntityRenderer(object):
parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup.
- RETURNS (unicode): Rendered HTML markup.
+ RETURNS (str): Rendered HTML markup.
"""
rendered = []
for i, p in enumerate(parsed):
@@ -300,9 +300,9 @@ class EntityRenderer(object):
def render_ents(self, text, spans, title):
"""Render entities in text.
- text (unicode): Original text.
+ text (str): Original text.
spans (list): Individual entity spans and their start, end and label.
- title (unicode or None): Document title set in Doc.user_data['title'].
+ title (str / None): Document title set in Doc.user_data['title'].
"""
markup = ""
offset = 0
diff --git a/spacy/errors.py b/spacy/errors.py
index 905f7d443..6184c078c 100644
--- a/spacy/errors.py
+++ b/spacy/errors.py
@@ -113,9 +113,12 @@ class Warnings(object):
"ignored during training.")
# TODO: fix numbering after merging develop into master
- W095 = ("Skipping unsupported morphological feature(s): {feature}. "
- "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
- "string \"Field1=Value1,Value2|Field2=Value3\".")
+ W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
+ "incompatible with the current version ({current}). This may lead "
+ "to unexpected results or runtime errors. To resolve this, "
+ "download a newer compatible model or retrain your custom model "
+ "with the current spaCy version. For more details and available "
+ "updates, run: python -m spacy validate")
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
"instead.")
W097 = ("No Model config was provided to create the '{name}' component, "
@@ -124,6 +127,9 @@ class Warnings(object):
"so a default configuration was used.")
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
"but got '{type}' instead, so ignoring it.")
+ W100 = ("Skipping unsupported morphological feature(s): {feature}. "
+ "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
+ "string \"Field1=Value1,Value2|Field2=Value3\".")
@add_codes
@@ -621,7 +627,7 @@ class MatchPatternError(ValueError):
def __init__(self, key, errors):
"""Custom error for validating match patterns.
- key (unicode): The name of the matcher rule.
+ key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern.
"""
diff --git a/spacy/glossary.py b/spacy/glossary.py
index 938a575cd..c4a6a5c45 100644
--- a/spacy/glossary.py
+++ b/spacy/glossary.py
@@ -1,8 +1,8 @@
def explain(term):
"""Get a description for a given POS tag, dependency label or entity type.
- term (unicode): The term to explain.
- RETURNS (unicode): The explanation, or `None` if not found in the glossary.
+ term (str): The term to explain.
+ RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE:
>>> spacy.explain(u'NORP')
diff --git a/spacy/gold.pyx b/spacy/gold.pyx
index 27f9f6553..1e58f0635 100644
--- a/spacy/gold.pyx
+++ b/spacy/gold.pyx
@@ -154,8 +154,8 @@ class GoldCorpus(object):
def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus.
- train (unicode or Path): File or directory of training data.
- dev (unicode or Path): File or directory of development data.
+ train (str / Path): File or directory of training data.
+ dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object.
"""
self.limit = limit
diff --git a/spacy/kb.pyx b/spacy/kb.pyx
index 86a8d49b8..8d8464f3c 100644
--- a/spacy/kb.pyx
+++ b/spacy/kb.pyx
@@ -38,7 +38,7 @@ cdef class Candidate:
@property
def entity_(self):
- """RETURNS (unicode): ID/name of this entity in the KB"""
+ """RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash]
@property
@@ -48,7 +48,7 @@ cdef class Candidate:
@property
def alias_(self):
- """RETURNS (unicode): ID of the original alias"""
+ """RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash]
@property
diff --git a/spacy/language.py b/spacy/language.py
index f8732b471..f281fa1ba 100644
--- a/spacy/language.py
+++ b/spacy/language.py
@@ -17,7 +17,8 @@ from .tokens.underscore import Underscore
from .vocab import Vocab
from .lemmatizer import Lemmatizer
from .lookups import Lookups
-from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
+from .pipe_analysis import count_pipeline_interdependencies
from .gold import Example
from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry
@@ -127,7 +128,7 @@ class Language(object):
Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline.
- lang (unicode): Two-letter language ID, i.e. ISO code.
+ lang (str): Two-letter language ID, i.e. ISO code.
DOCS: https://spacy.io/api/language
"""
@@ -196,13 +197,14 @@ class Language(object):
@property
def meta(self):
+ spacy_version = util.get_model_version_range(about.__version__)
if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang)
else:
self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0")
- self._meta.setdefault("spacy_version", f">={about.__version__}")
+ self._meta.setdefault("spacy_version", spacy_version)
self._meta.setdefault("description", "")
self._meta.setdefault("author", "")
self._meta.setdefault("email", "")
@@ -292,7 +294,7 @@ class Language(object):
def get_pipe(self, name):
"""Get a pipeline component for a given component name.
- name (unicode): Name of pipeline component to get.
+ name (str): Name of pipeline component to get.
RETURNS (callable): The pipeline component.
DOCS: https://spacy.io/api/language#get_pipe
@@ -305,7 +307,7 @@ class Language(object):
def create_pipe(self, name, config=dict()):
"""Create a pipeline component from a factory.
- name (unicode): Factory name to look up in `Language.factories`.
+ name (str): Factory name to look up in `Language.factories`.
config (dict): Configuration parameters to initialise component.
RETURNS (callable): Pipeline component.
@@ -348,12 +350,12 @@ class Language(object):
of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component.
- name (unicode): Name of pipeline component. Overwrites existing
+ name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline.
- before (unicode): Component name to insert component directly before.
- after (unicode): Component name to insert component directly after.
+ before (str): Component name to insert component directly before.
+ after (str): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline.
last (bool): Insert component last / not last in the pipeline.
@@ -394,7 +396,7 @@ class Language(object):
"""Check if a component name is present in the pipeline. Equivalent to
`name in nlp.pipe_names`.
- name (unicode): Name of the component.
+ name (str): Name of the component.
RETURNS (bool): Whether a component of the name exists in the pipeline.
DOCS: https://spacy.io/api/language#has_pipe
@@ -404,7 +406,7 @@ class Language(object):
def replace_pipe(self, name, component):
"""Replace a component in the pipeline.
- name (unicode): Name of the component to replace.
+ name (str): Name of the component to replace.
component (callable): Pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe
@@ -423,8 +425,8 @@ class Language(object):
def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component.
- old_name (unicode): Name of the component to rename.
- new_name (unicode): New name of the component.
+ old_name (str): Name of the component to rename.
+ new_name (str): New name of the component.
DOCS: https://spacy.io/api/language#rename_pipe
"""
@@ -438,7 +440,7 @@ class Language(object):
def remove_pipe(self, name):
"""Remove a component from the pipeline.
- name (unicode): Name of the component to remove.
+ name (str): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component.
DOCS: https://spacy.io/api/language#remove_pipe
@@ -455,7 +457,7 @@ class Language(object):
and can contain arbitrary whitespace. Alignment into the original string
is preserved.
- text (unicode): The text to be processed.
+ text (str): The text to be processed.
disable (list): Names of the pipeline components to disable.
component_cfg (dict): An optional dictionary with extra keyword arguments
for specific components.
@@ -564,13 +566,14 @@ class Language(object):
if component_cfg is None:
component_cfg = {}
+ component_deps = count_pipeline_interdependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always
# say "yes"
- for name, proc in self.pipeline:
+ for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop)
- component_cfg[name].setdefault("set_annotations", False)
+ component_cfg[name]["set_annotations"] = bool(component_deps[i])
for name, proc in self.pipeline:
if not hasattr(proc, "update"):
continue
@@ -938,7 +941,7 @@ class Language(object):
"""Save the current state to a directory. If a model is loaded, this
will include the model.
- path (unicode or Path): Path to a directory, which will be created if
+ path (str / Path): Path to a directory, which will be created if
it doesn't exist.
exclude (list): Names of components or serialization fields to exclude.
@@ -972,7 +975,7 @@ class Language(object):
returns it. If the saved `Language` object contains a model, the
model will be loaded.
- path (unicode or Path): A path to a directory.
+ path (str / Path): A path to a directory.
exclude (list): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object.
@@ -1090,7 +1093,7 @@ class component(object):
):
"""Decorate a pipeline component.
- name (unicode): Default component and factory name.
+ name (str): Default component and factory name.
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
requires (list): Attributes required by component, e.g. `["token.dep"]`.
retokenizes (bool): Whether the component changes the tokenization.
diff --git a/spacy/lemmatizer.py b/spacy/lemmatizer.py
index 517a10866..c4944407f 100644
--- a/spacy/lemmatizer.py
+++ b/spacy/lemmatizer.py
@@ -30,8 +30,8 @@ class Lemmatizer(object):
def __call__(self, string, univ_pos, morphology=None):
"""Lemmatize a string.
- string (unicode): The string to lemmatize, e.g. the token text.
- univ_pos (unicode / int): The token's universal part-of-speech tag.
+ string (str): The string to lemmatize, e.g. the token text.
+ univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
RETURNS (list): The available lemmas for the string.
@@ -69,7 +69,7 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely.
- univ_pos (unicode / int): The token's universal part-of-speech tag.
+ univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the
Universal Dependencies scheme.
"""
@@ -126,10 +126,10 @@ class Lemmatizer(object):
"""Look up a lemma in the table, if available. If no lemma is found,
the original string is returned.
- string (unicode): The original string.
+ string (str): The original string.
orth (int): Optional hash of the string to look up. If not set, the
string will be used and hashed.
- RETURNS (unicode): The lemma if the string was found, otherwise the
+ RETURNS (str): The lemma if the string was found, otherwise the
original string.
"""
lookup_table = self.lookups.get_table("lemma_lookup", {})
diff --git a/spacy/lexeme.pyx b/spacy/lexeme.pyx
index 40aab697e..fc3b30a6d 100644
--- a/spacy/lexeme.pyx
+++ b/spacy/lexeme.pyx
@@ -164,7 +164,7 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector)
property rank:
- """RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
+ """RETURNS (str): Sequential ID of the lexemes's lexical type, used
to index into tables, e.g. for word vectors."""
def __get__(self):
return self.c.id
@@ -187,18 +187,18 @@ cdef class Lexeme:
@property
def orth_(self):
- """RETURNS (unicode): The original verbatim text of the lexeme
+ """RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes."""
return self.vocab.strings[self.c.orth]
@property
def text(self):
- """RETURNS (unicode): The original verbatim text of the lexeme."""
+ """RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_
property lower:
- """RETURNS (unicode): Lowercase form of the lexeme."""
+ """RETURNS (str): Lowercase form of the lexeme."""
def __get__(self):
return self.c.lower
@@ -281,7 +281,7 @@ cdef class Lexeme:
prob_table[self.c.orth] = x
property lower_:
- """RETURNS (unicode): Lowercase form of the word."""
+ """RETURNS (str): Lowercase form of the word."""
def __get__(self):
return self.vocab.strings[self.c.lower]
@@ -289,7 +289,7 @@ cdef class Lexeme:
self.c.lower = self.vocab.strings.add(x)
property norm_:
- """RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
+ """RETURNS (str): The lexemes's norm, i.e. a normalised form of the
lexeme text.
"""
def __get__(self):
@@ -299,7 +299,7 @@ cdef class Lexeme:
self.norm = self.vocab.strings.add(x)
property shape_:
- """RETURNS (unicode): Transform of the word's string, to show
+ """RETURNS (str): Transform of the word's string, to show
orthographic features.
"""
def __get__(self):
@@ -309,7 +309,7 @@ cdef class Lexeme:
self.c.shape = self.vocab.strings.add(x)
property prefix_:
- """RETURNS (unicode): Length-N substring from the start of the word.
+ """RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`.
"""
def __get__(self):
@@ -319,7 +319,7 @@ cdef class Lexeme:
self.c.prefix = self.vocab.strings.add(x)
property suffix_:
- """RETURNS (unicode): Length-N substring from the end of the word.
+ """RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`.
"""
def __get__(self):
@@ -329,7 +329,7 @@ cdef class Lexeme:
self.c.suffix = self.vocab.strings.add(x)
property lang_:
- """RETURNS (unicode): Language of the parent vocabulary."""
+ """RETURNS (str): Language of the parent vocabulary."""
def __get__(self):
return self.vocab.strings[self.c.lang]
diff --git a/spacy/lookups.py b/spacy/lookups.py
index 7e49f4dca..d6aa5f9a0 100644
--- a/spacy/lookups.py
+++ b/spacy/lookups.py
@@ -31,7 +31,7 @@ class Lookups(object):
"""Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table.
- name (unicode): Name of the table.
+ name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups.
"""
return self.has_table(name)
@@ -48,7 +48,7 @@ class Lookups(object):
def add_table(self, name, data=SimpleFrozenDict()):
"""Add a new table to the lookups. Raises an error if the table exists.
- name (unicode): Unique name of table.
+ name (str): Unique name of table.
data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table.
@@ -64,7 +64,7 @@ class Lookups(object):
"""Get a table. Raises an error if the table doesn't exist and no
default value is provided.
- name (unicode): Name of the table.
+ name (str): Name of the table.
default: Optional default value to return if table doesn't exist.
RETURNS (Table): The table.
@@ -79,7 +79,7 @@ class Lookups(object):
def remove_table(self, name):
"""Remove a table. Raises an error if the table doesn't exist.
- name (unicode): Name of the table to remove.
+ name (str): Name of the table to remove.
RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table
@@ -91,7 +91,7 @@ class Lookups(object):
def has_table(self, name):
"""Check if the lookups contain a table of a given name.
- name (unicode): Name of the table.
+ name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table
@@ -125,7 +125,7 @@ class Lookups(object):
"""Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist.
- path (unicode / Path): The file path.
+ path (str / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk
"""
@@ -141,7 +141,7 @@ class Lookups(object):
"""Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist.
- path (unicode / Path): The directory path.
+ path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk
@@ -167,7 +167,7 @@ class Table(OrderedDict):
"""Initialize a new table from a dict.
data (dict): The dictionary.
- name (unicode): Optional table name for reference.
+ name (str): Optional table name for reference.
RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.from_dict
@@ -179,7 +179,7 @@ class Table(OrderedDict):
def __init__(self, name=None, data=None):
"""Initialize a new table.
- name (unicode): Optional table name for reference.
+ name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object.
@@ -197,7 +197,7 @@ class Table(OrderedDict):
def __setitem__(self, key, value):
"""Set new key/value pair. String keys will be hashed.
- key (unicode / int): The key to set.
+ key (str / int): The key to set.
value: The value to set.
"""
key = get_string_id(key)
@@ -208,7 +208,7 @@ class Table(OrderedDict):
"""Set new key/value pair. String keys will be hashed.
Same as table[key] = value.
- key (unicode / int): The key to set.
+ key (str / int): The key to set.
value: The value to set.
"""
self[key] = value
@@ -216,7 +216,7 @@ class Table(OrderedDict):
def __getitem__(self, key):
"""Get the value for a given key. String keys will be hashed.
- key (unicode / int): The key to get.
+ key (str / int): The key to get.
RETURNS: The value.
"""
key = get_string_id(key)
@@ -225,7 +225,7 @@ class Table(OrderedDict):
def get(self, key, default=None):
"""Get the value for a given key. String keys will be hashed.
- key (unicode / int): The key to get.
+ key (str / int): The key to get.
default: The default value to return.
RETURNS: The value.
"""
@@ -235,7 +235,7 @@ class Table(OrderedDict):
def __contains__(self, key):
"""Check whether a key is in the table. String keys will be hashed.
- key (unicode / int): The key to check.
+ key (str / int): The key to check.
RETURNS (bool): Whether the key is in the table.
"""
key = get_string_id(key)
diff --git a/spacy/matcher/dependencymatcher.pyx b/spacy/matcher/dependencymatcher.pyx
index ff707a71c..ddeeedd06 100644
--- a/spacy/matcher/dependencymatcher.pyx
+++ b/spacy/matcher/dependencymatcher.pyx
@@ -66,7 +66,7 @@ cdef class DependencyMatcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
- key (unicode): The match ID.
+ key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
@@ -194,7 +194,7 @@ cdef class DependencyMatcher:
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
- key (unicode or int): The key to retrieve.
+ key (str / int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
key = self._normalize_key(key)
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
index 8bd66cbca..158730e60 100644
--- a/spacy/matcher/matcher.pyx
+++ b/spacy/matcher/matcher.pyx
@@ -64,7 +64,7 @@ cdef class Matcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
- key (unicode): The match ID.
+ key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
"""
return self._normalize_key(key) in self._patterns
@@ -98,7 +98,7 @@ cdef class Matcher:
number of arguments). The on_match callback becomes an optional keyword
argument.
- key (unicode): The match ID.
+ key (str): The match ID.
patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match.
*_patterns (list): For backwards compatibility: list of patterns to add
@@ -139,7 +139,7 @@ cdef class Matcher:
"""Remove a rule from the matcher. A KeyError is raised if the key does
not exist.
- key (unicode): The ID of the match rule.
+ key (str): The ID of the match rule.
"""
norm_key = self._normalize_key(key)
if not norm_key in self._patterns:
@@ -166,7 +166,7 @@ cdef class Matcher:
def get(self, key, default=None):
"""Retrieve the pattern stored for a key.
- key (unicode or int): The key to retrieve.
+ key (str / int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
"""
key = self._normalize_key(key)
diff --git a/spacy/matcher/phrasematcher.pyx b/spacy/matcher/phrasematcher.pyx
index 14cc39787..aa4534296 100644
--- a/spacy/matcher/phrasematcher.pyx
+++ b/spacy/matcher/phrasematcher.pyx
@@ -30,7 +30,7 @@ cdef class PhraseMatcher:
"""Initialize the PhraseMatcher.
vocab (Vocab): The shared vocabulary.
- attr (int / unicode): Token attribute to match on.
+ attr (int / str): Token attribute to match on.
validate (bool): Perform additional validation when patterns are added.
RETURNS (PhraseMatcher): The newly constructed object.
@@ -70,7 +70,7 @@ cdef class PhraseMatcher:
def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID.
- key (unicode): The match ID.
+ key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID.
DOCS: https://spacy.io/api/phrasematcher#contains
@@ -85,7 +85,7 @@ cdef class PhraseMatcher:
"""Remove a rule from the matcher by match ID. A KeyError is raised if
the key does not exist.
- key (unicode): The match ID.
+ key (str): The match ID.
DOCS: https://spacy.io/api/phrasematcher#remove
"""
@@ -159,7 +159,7 @@ cdef class PhraseMatcher:
number of arguments). The on_match callback becomes an optional keyword
argument.
- key (unicode): The match ID.
+ key (str): The match ID.
docs (list): List of `Doc` objects representing match patterns.
on_match (callable): Callback executed on match.
*_docs (Doc): For backwards compatibility: list of patterns to add
diff --git a/spacy/ml/models/parser.py b/spacy/ml/models/parser.py
index 710d36a1d..bdcd709b1 100644
--- a/spacy/ml/models/parser.py
+++ b/spacy/ml/models/parser.py
@@ -15,10 +15,10 @@ def build_tb_parser_model(
use_upper=True,
nO=None,
):
- token_vector_width = tok2vec.get_dim("nO")
+ t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain(
tok2vec,
- with_array(Linear(hidden_width, token_vector_width)),
+ with_array(Linear(hidden_width, t2v_width)),
list2array(),
)
tok2vec.set_dim("nO", hidden_width)
diff --git a/spacy/ml/models/tagger.py b/spacy/ml/models/tagger.py
index 683c8b518..00e268ede 100644
--- a/spacy/ml/models/tagger.py
+++ b/spacy/ml/models/tagger.py
@@ -6,9 +6,9 @@ from ...util import registry
@registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model:
- token_vector_width = tok2vec.get_dim("nO")
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
- output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init)
+ t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
+ output_layer = Softmax(nO, t2v_width, init_W=zero_init)
softmax = with_array(output_layer)
model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec)
diff --git a/spacy/ml/tb_framework.py b/spacy/ml/tb_framework.py
index e4301a644..251189389 100644
--- a/spacy/ml/tb_framework.py
+++ b/spacy/ml/tb_framework.py
@@ -38,8 +38,8 @@ def forward(model, X, is_train):
def init(model, X=None, Y=None):
- tok2vec = model.get_ref("tok2vec").initialize()
- lower = model.get_ref("lower").initialize(X=X)
+ tok2vec = model.get_ref("tok2vec").initialize(X=X)
+ lower = model.get_ref("lower").initialize()
if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
model.get_ref("upper").initialize(X=statevecs)
diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
index f7e38bbea..31d83244c 100644
--- a/spacy/morphology.pyx
+++ b/spacy/morphology.pyx
@@ -198,8 +198,8 @@ cdef class Morphology:
"""Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties.
- tag (unicode): The part-of-speech tag to key the exception.
- orth (unicode): The word-form to key the exception.
+ tag (str): The part-of-speech tag to key the exception.
+ orth (str): The word-form to key the exception.
"""
attrs = dict(attrs)
attrs = _normalize_props(attrs)
diff --git a/spacy/analysis.py b/spacy/pipe_analysis.py
similarity index 90%
rename from spacy/analysis.py
rename to spacy/pipe_analysis.py
index c2600048f..971ebe518 100644
--- a/spacy/analysis.py
+++ b/spacy/pipe_analysis.py
@@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
fulfilled (e.g. if previous components assign the attributes).
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
- name (unicode): The name of the pipeline component to analyze.
+ name (str): The name of the pipeline component to analyze.
pipe (callable): The pipeline component function to analyze.
index (int): The index of the component in the pipeline.
warn (bool): Show user warning if problem is found.
@@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
- attr (unicode): The attribute to check.
+ attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
"""
return _get_feature_for_attr(pipeline, attr, "assigns")
@@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
- attr (unicode): The attribute to check.
+ attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that require the attr.
"""
return _get_feature_for_attr(pipeline, attr, "requires")
@@ -173,3 +173,22 @@ def print_summary(nlp, pretty=True, no_print=False):
msg.good("No problems found.")
if no_print:
return {"overview": overview, "problems": problems}
+
+
+def count_pipeline_interdependencies(pipeline):
+ """Count how many subsequent components require an annotation set by each
+ component in the pipeline.
+ """
+ pipe_assigns = []
+ pipe_requires = []
+ for name, pipe in pipeline:
+ pipe_assigns.append(set(getattr(pipe, "assigns", [])))
+ pipe_requires.append(set(getattr(pipe, "requires", [])))
+ counts = []
+ for i, assigns in enumerate(pipe_assigns):
+ count = 0
+ for requires in pipe_requires[i + 1 :]:
+ if assigns.intersection(requires):
+ count += 1
+ counts.append(count)
+ return counts
diff --git a/spacy/pipeline/entityruler.py b/spacy/pipeline/entityruler.py
index 58160c2e9..bdc009192 100644
--- a/spacy/pipeline/entityruler.py
+++ b/spacy/pipeline/entityruler.py
@@ -30,7 +30,7 @@ class EntityRuler(object):
nlp (Language): The shared nlp object to pass the vocab to the matchers
and process phrase patterns.
- phrase_matcher_attr (int / unicode): Token attribute to match on, passed
+ phrase_matcher_attr (int / str): Token attribute to match on, passed
to the internal PhraseMatcher as `attr`
validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate`
@@ -315,7 +315,7 @@ class EntityRuler(object):
"""Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line.
- path (unicode / Path): The JSONL file to load.
+ path (str / Path): The JSONL file to load.
**kwargs: Other config paramters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler.
@@ -351,7 +351,7 @@ class EntityRuler(object):
"""Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL).
- path (unicode / Path): The JSONL file to save.
+ path (str / Path): The JSONL file to save.
**kwargs: Other config paramters, mostly for consistency.
DOCS: https://spacy.io/api/entityruler#to_disk
diff --git a/spacy/pipeline/functions.py b/spacy/pipeline/functions.py
index 6e9d4197c..622791512 100644
--- a/spacy/pipeline/functions.py
+++ b/spacy/pipeline/functions.py
@@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
"""Merge subtokens into a single token.
doc (Doc): The Doc object.
- label (unicode): The subtoken dependency label.
+ label (str): The subtoken dependency label.
RETURNS (Doc): The Doc object with merged subtokens.
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
diff --git a/spacy/pipeline/pipes.pyx b/spacy/pipeline/pipes.pyx
index 6804a98c3..42110efb0 100644
--- a/spacy/pipeline/pipes.pyx
+++ b/spacy/pipeline/pipes.pyx
@@ -531,7 +531,16 @@ class Tagger(Pipe):
vocab.morphology.lemmatizer,
exc=vocab.morphology.exc)
self.set_output(len(self.labels))
- self.model.initialize()
+ doc_sample = [Doc(self.vocab, words=["hello", "world"])]
+ if pipeline is not None:
+ for name, component in pipeline:
+ if component is self:
+ break
+ if hasattr(component, "pipe"):
+ doc_sample = list(component.pipe(doc_sample))
+ else:
+ doc_sample = [component(doc) for doc in doc_sample]
+ self.model.initialize(X=doc_sample)
# Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes.
link_vectors_to_models(self.vocab)
diff --git a/spacy/strings.pyx b/spacy/strings.pyx
index a30f11729..9e584ce8a 100644
--- a/spacy/strings.pyx
+++ b/spacy/strings.pyx
@@ -109,7 +109,7 @@ cdef class StringStore:
"""Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode.
- Returns (unicode or uint64): The value to be retrieved.
+ Returns (str / uint64): The value to be retrieved.
"""
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0
@@ -152,7 +152,7 @@ cdef class StringStore:
def add(self, string):
"""Add a string to the StringStore.
- string (unicode): The string to add.
+ string (str): The string to add.
RETURNS (uint64): The string's hash value.
"""
if isinstance(string, unicode):
@@ -179,7 +179,7 @@ cdef class StringStore:
def __contains__(self, string not None):
"""Check whether a string is in the store.
- string (unicode): The string to check.
+ string (str): The string to check.
RETURNS (bool): Whether the store contains the string.
"""
cdef hash_t key
@@ -205,7 +205,7 @@ cdef class StringStore:
def __iter__(self):
"""Iterate over the strings in the store, in order.
- YIELDS (unicode): A string in the store.
+ YIELDS (str): A string in the store.
"""
cdef int i
cdef hash_t key
@@ -223,7 +223,7 @@ cdef class StringStore:
def to_disk(self, path):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
"""
path = util.ensure_path(path)
@@ -234,7 +234,7 @@ cdef class StringStore:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory. Paths may be either
+ path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object.
"""
diff --git a/spacy/syntax/nn_parser.pyx b/spacy/syntax/nn_parser.pyx
index 1437bdd98..fcaff444e 100644
--- a/spacy/syntax/nn_parser.pyx
+++ b/spacy/syntax/nn_parser.pyx
@@ -624,12 +624,25 @@ cdef class Parser:
sgd = self.create_optimizer()
doc_sample = []
gold_sample = []
- for example in islice(get_examples(), 1000):
+ for example in islice(get_examples(), 10):
parses = example.get_gold_parses(merge=False, vocab=self.vocab)
for doc, gold in parses:
- doc_sample.append(doc)
- gold_sample.append(gold)
- self.model.initialize(doc_sample, gold_sample)
+ if len(doc):
+ doc_sample.append(doc)
+ gold_sample.append(gold)
+
+ if pipeline is not None:
+ for name, component in pipeline:
+ if component is self:
+ break
+ if hasattr(component, "pipe"):
+ doc_sample = list(component.pipe(doc_sample))
+ else:
+ doc_sample = [component(doc) for doc in doc_sample]
+ if doc_sample:
+ self.model.initialize(doc_sample)
+ else:
+ self.model.initialize()
if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab)
diff --git a/spacy/tests/package/test_requirements.py b/spacy/tests/package/test_requirements.py
index 59a8569ee..0dc0f9d6c 100644
--- a/spacy/tests/package/test_requirements.py
+++ b/spacy/tests/package/test_requirements.py
@@ -9,7 +9,6 @@ def test_build_dependencies():
"pytest-timeout",
"mock",
"flake8",
- "jsonschema",
]
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
diff --git a/spacy/tests/pipeline/test_analysis.py b/spacy/tests/pipeline/test_analysis.py
index cda39f6ee..b826438f5 100644
--- a/spacy/tests/pipeline/test_analysis.py
+++ b/spacy/tests/pipeline/test_analysis.py
@@ -1,7 +1,8 @@
import spacy.language
from spacy.language import Language, component
-from spacy.analysis import print_summary, validate_attrs
-from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.pipe_analysis import print_summary, validate_attrs
+from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
+from spacy.pipe_analysis import count_pipeline_interdependencies
from mock import Mock, ANY
import pytest
@@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe():
with pytest.warns(None) as record:
nlp.remove_pipe("c2")
assert not record.list
+
+
+def test_pipe_interdependencies():
+ class Fancifier:
+ name = "fancifier"
+ assigns = ("doc._.fancy",)
+ requires = tuple()
+
+ class FancyNeeder:
+ name = "needer"
+ assigns = tuple()
+ requires = ("doc._.fancy",)
+
+ pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
+ counts = count_pipeline_interdependencies(pipeline)
+ assert counts == [1, 0]
diff --git a/spacy/tests/test_misc.py b/spacy/tests/test_misc.py
index c320b19c0..e4b4e570c 100644
--- a/spacy/tests/test_misc.py
+++ b/spacy/tests/test_misc.py
@@ -2,9 +2,11 @@ import pytest
import os
import ctypes
from pathlib import Path
+from spacy.about import __version__ as spacy_version
from spacy import util
from spacy import prefer_gpu, require_gpu
-from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
+from spacy.ml._precomputable_affine import PrecomputableAffine
+from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
@pytest.fixture
@@ -24,10 +26,12 @@ def test_util_ensure_path_succeeds(text):
assert isinstance(path, Path)
-@pytest.mark.parametrize("package", ["numpy"])
-def test_util_is_package(package):
+@pytest.mark.parametrize(
+ "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
+)
+def test_util_is_package(package, result):
"""Test that an installed package via pip is recognised by util.is_package."""
- assert util.is_package(package)
+ assert util.is_package(package) is result
@pytest.mark.parametrize("package", ["thinc"])
@@ -87,3 +91,21 @@ def test_ascii_filenames():
root = Path(__file__).parent.parent
for path in root.glob("**/*"):
assert all(ord(c) < 128 for c in path.name), path.name
+
+
+@pytest.mark.parametrize(
+ "version,constraint,compatible",
+ [
+ (spacy_version, spacy_version, True),
+ (spacy_version, f">={spacy_version}", True),
+ ("3.0.0", "2.0.0", False),
+ ("3.2.1", ">=2.0.0", True),
+ ("2.2.10a1", ">=1.0.0,<2.1.1", False),
+ ("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
+ ("n/a", ">=1.2.3,<4.5.6", None),
+ ("1.2.3", "n/a", None),
+ ("n/a", "n/a", None),
+ ],
+)
+def test_is_compatible_version(version, constraint, compatible):
+ assert util.is_compatible_version(version, constraint) is compatible
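The helpers exercised here (`get_base_version`, `is_compatible_version`) live in `spacy/util.py`, which is not shown in this section of the diff. Going by the `packaging` dependency added to requirements.txt/setup.cfg and the behaviour pinned down by the parametrized test above, a plausible sketch of their semantics is the following (an assumption, not the actual implementation):

```python
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion

def get_base_version(version):
    # "3.0.0.dev9" -> "3.0.0": drop pre-/dev-release suffixes for compat lookups.
    return Version(version).base_version

def is_compatible_version(version, constraint, prereleases=True):
    # A bare version like "2.0.0" is treated as an exact pin ("==2.0.0").
    if constraint and constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        ver = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None  # e.g. "n/a" on either side
    spec.prereleases = prereleases
    return ver in spec
```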
diff --git a/spacy/tests/test_util.py b/spacy/tests/test_util.py
new file mode 100644
index 000000000..1410755db
--- /dev/null
+++ b/spacy/tests/test_util.py
@@ -0,0 +1,59 @@
+import pytest
+from spacy.gold import Example
+
+from .util import get_random_doc
+
+from spacy.util import minibatch_by_words
+
+
+@pytest.mark.parametrize(
+ "doc_sizes, expected_batches",
+ [
+ ([400, 400, 199], [3]),
+ ([400, 400, 199, 3], [4]),
+ ([400, 400, 199, 3, 200], [3, 2]),
+ ([400, 400, 199, 3, 1], [5]),
+ ([400, 400, 199, 3, 1, 1500], [5]), # 1500 will be discarded
+ ([400, 400, 199, 3, 1, 200], [3, 3]),
+ ([400, 400, 199, 3, 1, 999], [3, 3]),
+ ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
+ ([1, 2, 999], [3]),
+ ([1, 2, 999, 1], [4]),
+ ([1, 200, 999, 1], [2, 2]),
+ ([1, 999, 200, 1], [2, 2]),
+ ],
+)
+def test_util_minibatch(doc_sizes, expected_batches):
+ docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+ examples = [Example(doc=doc) for doc in docs]
+ tol = 0.2
+ batch_size = 1000
+ batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=True))
+ assert [len(batch) for batch in batches] == expected_batches
+
+ max_size = batch_size + batch_size * tol
+ for batch in batches:
+ assert sum([len(example.doc) for example in batch]) < max_size
+
+
+@pytest.mark.parametrize(
+ "doc_sizes, expected_batches",
+ [
+ ([400, 4000, 199], [1, 2]),
+ ([400, 400, 199, 3000, 200], [1, 4]),
+ ([400, 400, 199, 3, 1, 1500], [1, 5]),
+ ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
+ ([1, 2, 9999], [1, 2]),
+ ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
+ ],
+)
+def test_util_minibatch_oversize(doc_sizes, expected_batches):
+    """Test that oversized documents are returned in their own batch."""
+ docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
+ examples = [Example(doc=doc) for doc in docs]
+ tol = 0.2
+ batch_size = 1000
+ batches = list(minibatch_by_words(examples=examples, size=batch_size, tolerance=tol, discard_oversize=False))
+ assert [len(batch) for batch in batches] == expected_batches
+
+
diff --git a/spacy/tests/util.py b/spacy/tests/util.py
index e29342268..3d0a023c9 100644
--- a/spacy/tests/util.py
+++ b/spacy/tests/util.py
@@ -92,6 +92,13 @@ def get_batch(batch_size):
return docs
+def get_random_doc(n_words):
+ vocab = Vocab()
+ # Make the words numbers, so that they're easy to track.
+ numbers = [str(i) for i in range(0, n_words)]
+ return Doc(vocab, words=numbers)
+
+
def apply_transition_sequence(parser, doc, sequence):
"""Perform a series of pre-specified transitions, to put the parser in a
desired state."""
diff --git a/spacy/tokenizer.pyx b/spacy/tokenizer.pyx
index 7e75052f7..538bf60e9 100644
--- a/spacy/tokenizer.pyx
+++ b/spacy/tokenizer.pyx
@@ -134,7 +134,7 @@ cdef class Tokenizer:
def __call__(self, unicode string):
"""Tokenize a string.
- string (unicode): The string to tokenize.
+ string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
DOCS: https://spacy.io/api/tokenizer#call
@@ -147,7 +147,7 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
"""Tokenize according to affix and token_match settings.
- string (unicode): The string to tokenize.
+ string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
@@ -527,7 +527,7 @@ cdef class Tokenizer:
def find_infix(self, unicode string):
"""Find internal split points of the string, such as hyphens.
- string (unicode): The string to segment.
+ string (str): The string to segment.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens.
@@ -542,7 +542,7 @@ cdef class Tokenizer:
"""Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match.
- string (unicode): The string to segment.
+ string (str): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_prefix
@@ -556,7 +556,7 @@ cdef class Tokenizer:
"""Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match.
- string (unicode): The string to segment.
+ string (str): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_suffix
@@ -576,7 +576,7 @@ cdef class Tokenizer:
def _validate_special_case(self, chunk, substrings):
"""Check whether the `ORTH` fields match the string.
- string (unicode): The string to specially tokenize.
+ string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes.
"""
@@ -588,7 +588,7 @@ cdef class Tokenizer:
def add_special_case(self, unicode string, substrings):
"""Add a special-case tokenization rule.
- string (unicode): The string to specially tokenize.
+ string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated.
@@ -629,7 +629,7 @@ cdef class Tokenizer:
produced are identical to `nlp.tokenizer()` except for whitespace
tokens.
- string (unicode): The string to tokenize.
+ string (str): The string to tokenize.
RETURNS (list): A list of (pattern_string, token_string) tuples
DOCS: https://spacy.io/api/tokenizer#explain
@@ -693,7 +693,7 @@ cdef class Tokenizer:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist.
exclude (list): String names of serialization fields to exclude.
@@ -707,7 +707,7 @@ cdef class Tokenizer:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory.
+ path (str / Path): A path to a directory.
exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object.
diff --git a/spacy/tokens/doc.pyx b/spacy/tokens/doc.pyx
index e6841eb80..debab6aeb 100644
--- a/spacy/tokens/doc.pyx
+++ b/spacy/tokens/doc.pyx
@@ -117,7 +117,7 @@ cdef class Doc:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Doc._`.
- name (unicode): Name of the attribute to set.
+ name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@@ -135,7 +135,7 @@ cdef class Doc:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/doc#get_extension
@@ -146,7 +146,7 @@ cdef class Doc:
def has_extension(cls, name):
"""Check whether an extension has been registered.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/doc#has_extension
@@ -157,7 +157,7 @@ cdef class Doc:
def remove_extension(cls, name):
"""Remove a previously registered extension.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@@ -483,7 +483,7 @@ cdef class Doc:
def text(self):
"""A unicode representation of the document text.
- RETURNS (unicode): The original verbatim text of the document.
+ RETURNS (str): The original verbatim text of the document.
"""
return "".join(t.text_with_ws for t in self)
@@ -492,7 +492,7 @@ cdef class Doc:
"""An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`.
- RETURNS (unicode): The original verbatim text of the document.
+ RETURNS (str): The original verbatim text of the document.
"""
return self.text
@@ -637,7 +637,7 @@ cdef class Doc:
@property
def lang_(self):
- """RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
+ """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
return self.vocab.lang
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
@@ -852,7 +852,7 @@ cdef class Doc:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
- path (unicode or Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects.
exclude (list): String names of serialization fields to exclude.
@@ -866,7 +866,7 @@ cdef class Doc:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode or Path): A path to a directory. Paths may be either
+ path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects.
exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): The modified `Doc` object.
diff --git a/spacy/tokens/span.pyx b/spacy/tokens/span.pyx
index e9b151985..b8f79f8a6 100644
--- a/spacy/tokens/span.pyx
+++ b/spacy/tokens/span.pyx
@@ -33,7 +33,7 @@ cdef class Span:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Span._`.
- name (unicode): Name of the attribute to set.
+ name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@@ -51,7 +51,7 @@ cdef class Span:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/span#get_extension
@@ -62,7 +62,7 @@ cdef class Span:
def has_extension(cls, name):
"""Check whether an extension has been registered.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/span#has_extension
@@ -73,7 +73,7 @@ cdef class Span:
def remove_extension(cls, name):
"""Remove a previously registered extension.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@@ -491,7 +491,7 @@ cdef class Span:
@property
def text(self):
- """RETURNS (unicode): The original verbatim text of the span."""
+ """RETURNS (str): The original verbatim text of the span."""
text = self.text_with_ws
if self[-1].whitespace_:
text = text[:-1]
@@ -502,7 +502,7 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if
the last token has one.
- RETURNS (unicode): The text content of the span (with trailing
+ RETURNS (str): The text content of the span (with trailing
whitespace).
"""
return "".join([t.text_with_ws for t in self])
@@ -678,7 +678,7 @@ cdef class Span:
raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
property ent_id_:
- """RETURNS (unicode): The (string) entity ID."""
+ """RETURNS (str): The (string) entity ID."""
def __get__(self):
return self.root.ent_id_
@@ -690,12 +690,12 @@ cdef class Span:
"""Verbatim text content (identical to `Span.text`). Exists mostly for
consistency with other attributes.
- RETURNS (unicode): The span's text."""
+ RETURNS (str): The span's text."""
return self.text
@property
def lemma_(self):
- """RETURNS (unicode): The span's lemma."""
+ """RETURNS (str): The span's lemma."""
return " ".join([t.lemma_ for t in self]).strip()
@property
@@ -714,7 +714,7 @@ cdef class Span:
return "".join([t.text_with_ws for t in self])
property label_:
- """RETURNS (unicode): The span's label."""
+ """RETURNS (str): The span's label."""
def __get__(self):
return self.doc.vocab.strings[self.label]
@@ -724,7 +724,7 @@ cdef class Span:
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
property kb_id_:
- """RETURNS (unicode): The named entity's KB ID."""
+ """RETURNS (str): The named entity's KB ID."""
def __get__(self):
return self.doc.vocab.strings[self.kb_id]
diff --git a/spacy/tokens/token.pyx b/spacy/tokens/token.pyx
index 58e9196ea..320cfaad5 100644
--- a/spacy/tokens/token.pyx
+++ b/spacy/tokens/token.pyx
@@ -36,7 +36,7 @@ cdef class Token:
def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Token._`.
- name (unicode): Name of the attribute to set.
+ name (str): Name of the attribute to set.
default: Optional default value of the attribute.
getter (callable): Optional getter function.
setter (callable): Optional setter function.
@@ -54,7 +54,7 @@ cdef class Token:
def get_extension(cls, name):
"""Look up a previously registered extension by name.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/token#get_extension
@@ -65,7 +65,7 @@ cdef class Token:
def has_extension(cls, name):
"""Check whether an extension has been registered.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/token#has_extension
@@ -76,7 +76,7 @@ cdef class Token:
def remove_extension(cls, name):
"""Remove a previously registered extension.
- name (unicode): Name of the extension.
+ name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension.
@@ -244,12 +244,12 @@ cdef class Token:
@property
def text(self):
- """RETURNS (unicode): The original verbatim text of the token."""
+ """RETURNS (str): The original verbatim text of the token."""
return self.orth_
@property
def text_with_ws(self):
- """RETURNS (unicode): The text content of the span (with trailing
+ """RETURNS (str): The text content of the span (with trailing
whitespace).
"""
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@@ -762,7 +762,7 @@ cdef class Token:
self.c.ent_type = ent_type
property ent_type_:
- """RETURNS (unicode): Named entity type."""
+ """RETURNS (str): Named entity type."""
def __get__(self):
return self.vocab.strings[self.c.ent_type]
@@ -785,7 +785,7 @@ cdef class Token:
and "" means no entity tag is set. "B" with an empty ent_type
means that the token is blocked from further processing by NER.
- RETURNS (unicode): IOB code of named entity tag.
+ RETURNS (str): IOB code of named entity tag.
"""
iob_strings = ("", "I", "O", "B")
return iob_strings[self.c.ent_iob]
@@ -801,7 +801,7 @@ cdef class Token:
self.c.ent_id = key
property ent_id_:
- """RETURNS (unicode): ID of the entity the token is an instance of,
+ """RETURNS (str): ID of the entity the token is an instance of,
if any.
"""
def __get__(self):
@@ -819,7 +819,7 @@ cdef class Token:
self.c.ent_kb_id = ent_kb_id
property ent_kb_id_:
- """RETURNS (unicode): Named entity KB ID."""
+ """RETURNS (str): Named entity KB ID."""
def __get__(self):
return self.vocab.strings[self.c.ent_kb_id]
@@ -828,12 +828,12 @@ cdef class Token:
@property
def whitespace_(self):
- """RETURNS (unicode): The trailing whitespace character, if present."""
+ """RETURNS (str): The trailing whitespace character, if present."""
return " " if self.c.spacy else ""
@property
def orth_(self):
- """RETURNS (unicode): Verbatim text content (identical to
+ """RETURNS (str): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other
attributes.
"""
@@ -841,13 +841,13 @@ cdef class Token:
@property
def lower_(self):
- """RETURNS (unicode): The lowercase token text. Equivalent to
+ """RETURNS (str): The lowercase token text. Equivalent to
`Token.text.lower()`.
"""
return self.vocab.strings[self.c.lex.lower]
property norm_:
- """RETURNS (unicode): The token's norm, i.e. a normalised form of the
+ """RETURNS (str): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or
norm exceptions.
"""
@@ -859,34 +859,34 @@ cdef class Token:
@property
def shape_(self):
- """RETURNS (unicode): Transform of the tokens's string, to show
+        """RETURNS (str): Transform of the token's string, to show
orthographic features. For example, "Xxxx" or "dd".
"""
return self.vocab.strings[self.c.lex.shape]
@property
def prefix_(self):
- """RETURNS (unicode): A length-N substring from the start of the token.
+ """RETURNS (str): A length-N substring from the start of the token.
Defaults to `N=1`.
"""
return self.vocab.strings[self.c.lex.prefix]
@property
def suffix_(self):
- """RETURNS (unicode): A length-N substring from the end of the token.
+ """RETURNS (str): A length-N substring from the end of the token.
Defaults to `N=3`.
"""
return self.vocab.strings[self.c.lex.suffix]
@property
def lang_(self):
- """RETURNS (unicode): Language of the parent document's vocabulary,
+ """RETURNS (str): Language of the parent document's vocabulary,
e.g. 'en'.
"""
return self.vocab.strings[self.c.lex.lang]
property lemma_:
- """RETURNS (unicode): The token lemma, i.e. the base form of the word,
+ """RETURNS (str): The token lemma, i.e. the base form of the word,
with no inflectional suffixes.
"""
def __get__(self):
@@ -899,7 +899,7 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_)
property pos_:
- """RETURNS (unicode): Coarse-grained part-of-speech tag."""
+ """RETURNS (str): Coarse-grained part-of-speech tag."""
def __get__(self):
return parts_of_speech.NAMES[self.c.pos]
@@ -907,7 +907,7 @@ cdef class Token:
self.c.pos = parts_of_speech.IDS[pos_name]
property tag_:
- """RETURNS (unicode): Fine-grained part-of-speech tag."""
+ """RETURNS (str): Fine-grained part-of-speech tag."""
def __get__(self):
return self.vocab.strings[self.c.tag]
@@ -915,7 +915,7 @@ cdef class Token:
self.tag = self.vocab.strings.add(tag)
property dep_:
- """RETURNS (unicode): The syntactic dependency label."""
+ """RETURNS (str): The syntactic dependency label."""
def __get__(self):
return self.vocab.strings[self.c.dep]
diff --git a/spacy/util.py b/spacy/util.py
index a6ccae075..97cc5a8d7 100644
--- a/spacy/util.py
+++ b/spacy/util.py
@@ -15,6 +15,8 @@ import srsly
import catalogue
import sys
import warnings
+from packaging.specifiers import SpecifierSet, InvalidSpecifier
+from packaging.version import Version, InvalidVersion
try:
@@ -22,9 +24,16 @@ try:
except ImportError:
cupy = None
+try: # Python 3.8
+ import importlib.metadata as importlib_metadata
+except ImportError:
+ import importlib_metadata
+
from .symbols import ORTH
from .compat import cupy, CudaStream
from .errors import Errors, Warnings
+from . import about
+
_PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max
@@ -37,6 +46,10 @@ class registry(thinc.registry):
factories = catalogue.create("spacy", "factories", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
assets = catalogue.create("spacy", "assets", entry_points=True)
+ # This is mostly used to get a list of all installed models in the current
+ # environment. spaCy models packaged with `spacy package` will "advertise"
+ # themselves via entry points.
+ models = catalogue.create("spacy", "models", entry_points=True)
def set_env_log(value):
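
Note on the new `models` registry: it is populated from entry points, so a model package has to declare one at install time. Below is a rough sketch of what that could look like in a model package's setup.py; the `spacy_models` group name and the package name are assumptions for illustration, not taken from this diff.

```python
from setuptools import setup

setup(
    name="en_example_model",          # hypothetical model package
    version="0.0.1",
    packages=["en_example_model"],
    entry_points={
        # Advertise the package so registry.models / get_installed_models()
        # can discover it in the current environment.
        "spacy_models": ["en_example_model = en_example_model"],
    },
)
```

With such a package installed, the `get_installed_models()` helper added further down would be expected to include `"en_example_model"` in its result.
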
@@ -49,7 +62,7 @@ def lang_class_is_loaded(lang):
loaded lazily, to avoid expensive setup code associated with the language
data.
- lang (unicode): Two-letter language code, e.g. 'en'.
+ lang (str): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded.
"""
return lang in registry.languages
@@ -58,7 +71,7 @@ def lang_class_is_loaded(lang):
def get_lang_class(lang):
"""Import and load a Language class.
- lang (unicode): Two-letter language code, e.g. 'en'.
+ lang (str): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class.
"""
# Check if language is registered / entry point is available
@@ -76,7 +89,7 @@ def get_lang_class(lang):
def set_lang_class(name, cls):
"""Set a custom Language class name that can be loaded via get_lang_class.
- name (unicode): Name of Language class.
+ name (str): Name of Language class.
cls (Language): Language class.
"""
registry.languages.register(name, func=cls)
@@ -98,7 +111,7 @@ def load_language_data(path):
"""Load JSON language data using the given path as a base. If the provided
path isn't present, will attempt to load a gzipped version before giving up.
- path (unicode / Path): The data to load.
+ path (str / Path): The data to load.
RETURNS: The loaded data.
"""
path = ensure_path(path)
@@ -119,7 +132,7 @@ def get_module_path(module):
def load_model(name, **overrides):
"""Load a model from a package or data path.
- name (unicode): Package name or model path.
+ name (str): Package name or model path.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with the loaded model.
"""
@@ -193,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
"""Helper function to use in the `load()` method of a model package's
__init__.py.
- init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
+ init_file (str): Path to model's __init__.py, i.e. `__file__`.
**overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with loaded model.
"""
@@ -206,11 +219,74 @@ def load_model_from_init_py(init_file, **overrides):
return load_model_from_path(data_path, meta, **overrides)
+def get_installed_models():
+ """List all model packages currently installed in the environment.
+
+ RETURNS (list): The string names of the models.
+ """
+ return list(registry.models.get_all().keys())
+
+
+def get_package_version(name):
+ """Get the version of an installed package. Typically used to get model
+ package versions.
+
+ name (str): The name of the installed Python package.
+ RETURNS (str / None): The version or None if package not installed.
+ """
+ try:
+ return importlib_metadata.version(name)
+ except importlib_metadata.PackageNotFoundError:
+ return None
+
+
+def is_compatible_version(version, constraint, prereleases=True):
+ """Check if a version (e.g. "2.0.0") is compatible given a version
+ constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
+ it's interpreted as =={version}.
+
+ version (str): The version to check.
+ constraint (str): The constraint string.
+ prereleases (bool): Whether to allow prereleases. If set to False,
+ prerelease versions will be considered incompatible.
+ RETURNS (bool / None): Whether the version is compatible, or None if the
+ version or constraint are invalid.
+ """
+ # Handle cases where exact version is provided as constraint
+ if constraint[0].isdigit():
+ constraint = f"=={constraint}"
+ try:
+ spec = SpecifierSet(constraint)
+ version = Version(version)
+ except (InvalidSpecifier, InvalidVersion):
+ return None
+ spec.prereleases = prereleases
+ return version in spec
+
+
+def get_model_version_range(spacy_version):
+ """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
+ version. Models are always compatible across patch versions but not
+ across minor or major versions.
+ """
+ release = Version(spacy_version).release
+ return f">={spacy_version},<{release[0]}.{release[1] + 1}.0"
+
+
+def get_base_version(version):
+ """Generate the base version without any prerelease identifiers.
+
+ version (str): The version, e.g. "3.0.0.dev1".
+ RETURNS (str): The base version, e.g. "3.0.0".
+ """
+ return Version(version).base_version
+
+
def load_config(path, create_objects=False):
"""Load a Thinc-formatted config file, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.
- path (unicode or Path): Path to the config file
+ path (str / Path): Path to the config file
create_objects (bool): Whether to automatically create objects when the config
references registry entries. Defaults to False.
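
A quick usage sketch for the version helpers added in this hunk, derived from the function bodies above rather than from separate documentation; the uninstalled package name below is made up:

```python
from spacy import util

# An exact version used as a constraint is treated as ==2.2.0
assert util.is_compatible_version("2.2.1", "2.2.0") is False
assert util.is_compatible_version("2.2.1", ">=2.2.0,<2.3.0") is True
# Invalid versions or constraints return None instead of raising
assert util.is_compatible_version("n/a", ">=2.2.0") is None

# Missing packages also return None rather than raising
assert util.get_package_version("definitely_not_installed_xyz") is None

# Model compatibility ranges pin the major.minor series
assert util.get_model_version_range("2.2.4") == ">=2.2.4,<2.3.0"
# Prerelease suffixes are stripped from the base version
assert util.get_base_version("3.0.0.dev3") == "3.0.0"
```
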
@@ -227,7 +303,7 @@ def load_config_from_str(string, create_objects=False):
"""Load a Thinc-formatted config, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details.
- string (unicode or Path): Text contents of the config file.
+ string (str / Path): Text contents of the config file.
create_objects (bool): Whether to automatically create objects when the config
references registry entries. Defaults to False.
@@ -243,7 +319,7 @@ def load_config_from_str(string, create_objects=False):
def get_model_meta(path):
"""Get model meta.json from a directory path and validate its contents.
- path (unicode or Path): Path to model directory.
+ path (str / Path): Path to model directory.
RETURNS (dict): The model's meta data.
"""
model_path = ensure_path(path)
@@ -256,13 +332,23 @@ def get_model_meta(path):
for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting))
+ if "spacy_version" in meta:
+ if not is_compatible_version(about.__version__, meta["spacy_version"]):
+ warnings.warn(
+ Warnings.W095.format(
+ model=f"{meta['lang']}_{meta['name']}",
+ model_version=meta["version"],
+ version=meta["spacy_version"],
+ current=about.__version__,
+ )
+ )
return meta
def get_model_config(path):
"""Get the model's config from a directory path.
- path (unicode or Path): Path to model directory.
+ path (str / Path): Path to model directory.
RETURNS (Config): The model's config data.
"""
model_path = ensure_path(path)
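
For context on the new `spacy_version` check in `get_model_meta`, here is a hedged sketch of a model meta dict that would trigger the W095 warning; the model name and versions are invented for illustration:

```python
# Hypothetical meta.json contents for an installed model package
meta = {
    "lang": "en",
    "name": "example_model",          # made-up model name
    "version": "0.0.1",
    # Typically produced by get_model_version_range() at packaging time:
    "spacy_version": ">=2.3.0,<2.4.0",
}

# With spaCy 3.0.0 installed, is_compatible_version("3.0.0", ">=2.3.0,<2.4.0")
# is False, so get_model_meta() would emit the W095 compatibility warning.
```
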
@@ -279,23 +365,20 @@ def get_model_config(path):
def is_package(name):
"""Check if string maps to a package installed via pip.
- name (unicode): Name of package.
+ name (str): Name of package.
RETURNS (bool): True if installed package, False if not.
"""
- import pkg_resources
-
- name = name.lower() # compare package name against lowercase name
- packages = pkg_resources.working_set.by_key.keys()
- for package in packages:
- if package.lower().replace("-", "_") == name:
- return True
- return False
+ try:
+ importlib_metadata.distribution(name)
+ return True
+ except: # noqa: E722
+ return False
def get_package_path(name):
"""Get the path to an installed package.
- name (unicode): Package name.
+ name (str): Package name.
RETURNS (Path): Path to installed package.
"""
name = name.lower() # use lowercase version to be safe
@@ -470,8 +553,8 @@ def expand_exc(excs, search, replace):
For example, to add additional versions with typographic apostrophes.
excs (dict): Tokenizer exceptions.
- search (unicode): String to find and replace.
- replace (unicode): Replacement.
+ search (str): String to find and replace.
+ replace (str): Replacement.
RETURNS (dict): Combined tokenizer exceptions.
"""
@@ -575,42 +658,74 @@ def decaying(start, stop, decay):
curr -= decay
-def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2):
+def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
"""Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by
- themselves."""
+ themselves, or be discarded if discard_oversize=True."""
if isinstance(size, int):
size_ = itertools.repeat(size)
elif isinstance(size, List):
size_ = iter(size)
else:
size_ = size
- examples = iter(examples)
- oversize = []
- while True:
- batch_size = next(size_)
- tol_size = batch_size * 0.2
- batch = []
- if oversize:
- example = oversize.pop(0)
- n_words = count_words(example.doc)
+
+ target_size = next(size_)
+ tol_size = target_size * tolerance
+ batch = []
+ overflow = []
+ batch_size = 0
+ overflow_size = 0
+
+ for example in examples:
+ n_words = count_words(example.doc)
+ # if the current example exceeds the maximum batch size, it is returned separately
+ # but only if discard_oversize=False.
+ if n_words > target_size + tol_size:
+ if not discard_oversize:
+ yield [example]
+
+ # add the example to the current batch if there's no overflow yet and it still fits
+ elif overflow_size == 0 and (batch_size + n_words) <= target_size:
batch.append(example)
- batch_size -= n_words
- while batch_size >= 1:
- try:
- example = next(examples)
- except StopIteration:
- if batch:
- yield batch
- return
- n_words = count_words(example.doc)
- if n_words < (batch_size + tol_size):
- batch_size -= n_words
- batch.append(example)
- else:
- oversize.append(example)
- if batch:
+ batch_size += n_words
+
+ # add the example to the overflow buffer if it fits in the tolerance margin
+ elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
+ overflow.append(example)
+ overflow_size += n_words
+
+ # yield the previous batch and start a new one. The new one gets the overflow examples.
+ else:
yield batch
+ target_size = next(size_)
+ tol_size = target_size * tolerance
+ batch = overflow
+ batch_size = overflow_size
+ overflow = []
+ overflow_size = 0
+
+ # this example still fits
+ if (batch_size + n_words) <= target_size:
+ batch.append(example)
+ batch_size += n_words
+
+ # this example fits in overflow
+ elif (batch_size + n_words) <= (target_size + tol_size):
+ overflow.append(example)
+ overflow_size += n_words
+
+ # this example does not fit with the previous overflow: start another new batch
+ else:
+ yield batch
+ target_size = next(size_)
+ tol_size = target_size * tolerance
+ batch = [example]
+ batch_size = n_words
+
+ # yield the final batch
+ if batch:
+ batch.extend(overflow)
+ yield batch
def itershuffle(iterable, bufsize=1000):
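
The reworked batching logic is easiest to follow with a concrete trace. This sketch mirrors one of the parametrized cases in `spacy/tests/test_util.py` (batch size 1000, tolerance 0.2, doc sizes in words):

```python
from spacy.gold import Example
from spacy.tests.util import get_random_doc
from spacy.util import minibatch_by_words

# Docs of 400, 400, 199, 3, 1 and 200 words
examples = [Example(doc=get_random_doc(n)) for n in [400, 400, 199, 3, 1, 200]]
batches = list(
    minibatch_by_words(examples=examples, size=1000, tolerance=0.2, discard_oversize=True)
)
# 400 + 400 + 199 = 999 fills the first batch; the 3- and 1-word docs land in
# the overflow buffer and start the next batch together with the 200-word doc.
assert [len(batch) for batch in batches] == [3, 3]
```
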
@@ -705,8 +820,8 @@ def from_disk(path, readers, exclude):
def import_file(name, loc):
"""Import module from a file. Used to load models from a directory.
- name (unicode): Name of module to load.
- loc (unicode / Path): Path to the file.
+ name (str): Name of module to load.
+ loc (str / Path): Path to the file.
RETURNS: The loaded module.
"""
loc = str(loc)
@@ -721,8 +836,8 @@ def minify_html(html):
Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines.
- html (unicode): Markup to minify.
- RETURNS (unicode): "Minified" HTML.
+ html (str): Markup to minify.
+ RETURNS (str): "Minified" HTML.
"""
return html.strip().replace(" ", "").replace("\n", "")
@@ -731,8 +846,8 @@ def escape_html(text):
"""Replace <, >, &, " with their HTML encoded representation. Intended to
prevent HTML errors in rendered displaCy markup.
- text (unicode): The original text.
- RETURNS (unicode): Equivalent text to be safely used within HTML.
+ text (str): The original text.
+ RETURNS (str): Equivalent text to be safely used within HTML.
"""
text = text.replace("&", "&")
text = text.replace("<", "<")
diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
index 471c6463f..4537d612d 100644
--- a/spacy/vectors.pyx
+++ b/spacy/vectors.pyx
@@ -57,7 +57,7 @@ cdef class Vectors:
shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data.
- name (unicode): A name to identify the vectors table.
+ name (str): A name to identify the vectors table.
RETURNS (Vectors): The newly created object.
DOCS: https://spacy.io/api/vectors#init
@@ -244,7 +244,7 @@ cdef class Vectors:
def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa.
- key (unicode / int): Find the row that the given key points to.
+ key (str / int): Find the row that the given key points to.
Returns int, -1 if missing.
keys (iterable): Find rows that the keys point to.
Returns ndarray.
@@ -366,7 +366,7 @@ cdef class Vectors:
def to_disk(self, path, **kwargs):
"""Save the current state to a directory.
- path (unicode / Path): A path to a directory, which will be created if
+ path (str / Path): A path to a directory, which will be created if
it doesn't exists.
DOCS: https://spacy.io/api/vectors#to_disk
@@ -386,7 +386,7 @@ cdef class Vectors:
"""Loads state from a directory. Modifies the object in place and
returns it.
- path (unicode / Path): Directory path, string or Path-like object.
+ path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object.
DOCS: https://spacy.io/api/vectors#from_disk
diff --git a/website/docs/api/cli.md b/website/docs/api/cli.md
index 505977be9..aacfb414c 100644
--- a/website/docs/api/cli.md
+++ b/website/docs/api/cli.md
@@ -504,10 +504,10 @@ tokenization can be provided.
> srsly.write_jsonl("/path/to/text.jsonl", data)
> ```
-| Key | Type | Description |
-| -------- | ------- | ---------------------------------------------------------- |
-| `text` | unicode | The raw input text. Is not required if `tokens` available. |
-| `tokens` | list | Optional tokenization, one string per token. |
+| Key | Type | Description |
+| -------- | ---- | ---------------------------------------------------------- |
+| `text` | str | The raw input text. Is not required if `tokens` available. |
+| `tokens` | list | Optional tokenization, one string per token. |
```json
### Example
diff --git a/website/docs/api/cython-classes.md b/website/docs/api/cython-classes.md
index 77d6fdd10..9dea04284 100644
--- a/website/docs/api/cython-classes.md
+++ b/website/docs/api/cython-classes.md
@@ -170,7 +170,7 @@ vocabulary.
| Name | Type | Description |
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
-| `string` | unicode | The string of the word to look up. |
+| `string` | str | The string of the word to look up. |
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. |
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
diff --git a/website/docs/api/dependencyparser.md b/website/docs/api/dependencyparser.md
index df0df3e38..0980dc2e0 100644
--- a/website/docs/api/dependencyparser.md
+++ b/website/docs/api/dependencyparser.md
@@ -229,9 +229,9 @@ Add a new label to the pipe.
> parser.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| Name | Type | Description |
+| ------- | ---- | ----------------- |
+| `label` | str | The label to add. |
## DependencyParser.to_disk {#to_disk tag="method"}
@@ -244,10 +244,10 @@ Serialize the pipe to disk.
> parser.to_disk("/path/to/parser")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## DependencyParser.from_disk {#from_disk tag="method"}
@@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
diff --git a/website/docs/api/doc.md b/website/docs/api/doc.md
index 7decc2278..50fb10756 100644
--- a/website/docs/api/doc.md
+++ b/website/docs/api/doc.md
@@ -123,7 +123,7 @@ details, see the documentation on
| Name | Type | Description |
| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
+| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@@ -145,10 +145,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None)
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Doc.has_extension {#has_extension tag="classmethod" new="2"}
@@ -162,10 +162,10 @@ Check whether an extension has been registered on the `Doc` class.
> assert Doc.has_extension('has_city')
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------ |
-| `name` | unicode | Name of the extension to check. |
-| **RETURNS** | bool | Whether the extension has been registered. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| `name` | str | Name of the extension to check. |
+| **RETURNS** | bool | Whether the extension has been registered. |
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@@ -180,10 +180,10 @@ Remove a previously registered extension.
> assert not Doc.has_extension('has_city')
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Doc.char_span {#char_span tag="method" new="2"}
@@ -368,10 +368,10 @@ Save the current state to a directory.
> doc.to_disk("/path/to/doc")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Doc.from_disk {#from_disk tag="method" new="2"}
@@ -385,11 +385,11 @@ Loads state from a directory. Modifies the object in place and returns it.
> doc = Doc(Vocab()).from_disk("/path/to/doc")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Doc` | The modified `Doc` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Doc` | The modified `Doc` object. |
## Doc.to_bytes {#to_bytes tag="method"}
@@ -648,15 +648,15 @@ The L2 norm of the document's vector representation.
| Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `text` | unicode | A unicode representation of the document text. |
-| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
+| `text` | str | A unicode representation of the document text. |
+| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. |
| `tensor` 2 | `ndarray` | Container for dense vector representations. |
| `cats` 2 | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. |
| `lang` 2.1 | int | Language of the document's vocabulary. |
-| `lang_` 2.1 | unicode | Language of the document's vocabulary. |
+| `lang_` 2.1 | str | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
diff --git a/website/docs/api/entitylinker.md b/website/docs/api/entitylinker.md
index a9d6a31a5..d7f25ed56 100644
--- a/website/docs/api/entitylinker.md
+++ b/website/docs/api/entitylinker.md
@@ -258,10 +258,10 @@ Serialize the pipe to disk.
> entity_linker.to_disk("/path/to/entity_linker")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityLinker.from_disk {#from_disk tag="method"}
@@ -274,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
> entity_linker.from_disk("/path/to/entity_linker")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
+| Name | Type | Description |
+| ----------- | -------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
## Serialization fields {#serialization-fields}
diff --git a/website/docs/api/entityrecognizer.md b/website/docs/api/entityrecognizer.md
index 9345ee249..c9a81f6f1 100644
--- a/website/docs/api/entityrecognizer.md
+++ b/website/docs/api/entityrecognizer.md
@@ -230,9 +230,9 @@ Add a new label to the pipe.
> ner.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| Name | Type | Description |
+| ------- | ---- | ----------------- |
+| `label` | str | The label to add. |
## EntityRecognizer.to_disk {#to_disk tag="method"}
@@ -245,10 +245,10 @@ Serialize the pipe to disk.
> ner.to_disk("/path/to/ner")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityRecognizer.from_disk {#from_disk tag="method"}
@@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
diff --git a/website/docs/api/entityruler.md b/website/docs/api/entityruler.md
index 0fd24897d..7bee3a77a 100644
--- a/website/docs/api/entityruler.md
+++ b/website/docs/api/entityruler.md
@@ -72,10 +72,10 @@ Whether a label is present in the patterns.
> assert not "PERSON" in ruler
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------- |
-| `label` | unicode | The label to check. |
-| **RETURNS** | bool | Whether the entity ruler contains the label. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------- |
+| `label` | str | The label to check. |
+| **RETURNS** | bool | Whether the entity ruler contains the label. |
## EntityRuler.\_\_call\_\_ {#call tag="method"}
@@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap
-with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
-patterns over shorter, and if equal the match occuring first in the Doc is chosen.
+with the matches. When matches overlap in a Doc, the entity ruler prioritizes
+longer patterns over shorter, and if equal the match occurring first in the Doc
+is chosen.
> #### Example
>
@@ -139,9 +140,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
> ```
-| Name | Type | Description |
-| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## EntityRuler.from_disk {#from_disk tag="method"}
@@ -158,10 +159,10 @@ configuration.
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | ---------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
+| Name | Type | Description |
+| ----------- | ------------- | ---------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
## EntityRuler.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/goldcorpus.md b/website/docs/api/goldcorpus.md
index a18ef4d32..7767b28bd 100644
--- a/website/docs/api/goldcorpus.md
+++ b/website/docs/api/goldcorpus.md
@@ -17,8 +17,8 @@ Create a `GoldCorpus`. IF the input data is an iterable, each item should be a
[`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx)
for further details.
-| Name | Type | Description |
-| ----------- | --------------------------- | ------------------------------------------------------------ |
-| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
-| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
-| **RETURNS** | `GoldCorpus` | The newly constructed object. |
+| Name | Type | Description |
+| ----------- | ----------------------- | ------------------------------------------------------------ |
+| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
+| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
+| **RETURNS** | `GoldCorpus` | The newly constructed object. |
diff --git a/website/docs/api/goldparse.md b/website/docs/api/goldparse.md
index 443913311..379913ba2 100644
--- a/website/docs/api/goldparse.md
+++ b/website/docs/api/goldparse.md
@@ -60,7 +60,8 @@ Whether the provided syntactic annotations form a projective dependency tree.
Convert a list of Doc objects into the
[JSON-serializable format](/api/annotation#json-input) used by the
-[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc.
+[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
+'paragraph' in the output doc.
> #### Example
>
@@ -158,7 +159,7 @@ single-token entity.
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. |
| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
-| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. |
+| **RETURNS** | list     | Strings describing the [BILUO](/api/annotation#biluo) tags. |
### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
diff --git a/website/docs/api/kb.md b/website/docs/api/kb.md
index eeba85e84..f088815fd 100644
--- a/website/docs/api/kb.md
+++ b/website/docs/api/kb.md
@@ -1,16 +1,19 @@
---
title: KnowledgeBase
-teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
+teaser:
+ A storage class for entities and aliases of a specific knowledge base
+ (ontology)
tag: class
source: spacy/kb.pyx
new: 2.2
---
-The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
-objects, which are plausible external identifiers given a certain textual mention.
-Each such `Candidate` holds information from the relevant KB entities,
-such as its frequency in text and possible aliases.
-Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
+The `KnowledgeBase` object provides a method to generate
+[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
+identifiers given a certain textual mention. Each such `Candidate` holds
+information from the relevant KB entities, such as its frequency in text and
+possible aliases. Each entity in the knowledge base also has a pretrained entity
+vector of a fixed size.
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
@@ -24,25 +27,25 @@ Create the knowledge base.
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
> ```
-| Name | Type | Description |
-| ----------------------- | ---------------- | ----------------------------------------- |
-| `vocab` | `Vocab` | A `Vocab` object. |
-| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
-| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
-
+| Name | Type | Description |
+| ---------------------- | --------------- | ---------------------------------------- |
+| `vocab` | `Vocab` | A `Vocab` object. |
+| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
+| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
The length of the fixed-size entity vectors in the knowledge base.
-| Name | Type | Description |
-| ----------- | ---- | ----------------------------------------- |
-| **RETURNS** | int | Length of the fixed-size entity vectors. |
+| Name | Type | Description |
+| ----------- | ---- | ---------------------------------------- |
+| **RETURNS** | int | Length of the fixed-size entity vectors. |
## KnowledgeBase.add_entity {#add_entity tag="method"}
-Add an entity to the knowledge base, specifying its corpus frequency
-and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
+Add an entity to the knowledge base, specifying its corpus frequency and entity
+vector, which should be of length
+[`entity_vector_length`](/api/kb#entity_vector_length).
> #### Example
>
@@ -51,16 +54,16 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
> ```
-| Name | Type | Description |
-| --------------- | ------------- | ------------------------------------------------- |
-| `entity` | unicode | The unique entity identifier |
-| `freq` | float | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pretrained vector of the entity |
+| Name | Type | Description |
+| --------------- | ------ | ----------------------------------------------- |
+| `entity` | str | The unique entity identifier |
+| `freq` | float | The frequency of the entity in a typical corpus |
+| `entity_vector` | vector | The pretrained vector of the entity |
## KnowledgeBase.set_entities {#set_entities tag="method"}
-Define the full list of entities in the knowledge base, specifying the corpus frequency
-and entity vector for each entity.
+Define the full list of entities in the knowledge base, specifying the corpus
+frequency and entity vector for each entity.
> #### Example
>
@@ -68,18 +71,19 @@ and entity vector for each entity.
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
> ```
-| Name | Type | Description |
-| ------------- | ------------- | ------------------------------------------------- |
-| `entity_list` | iterable | List of unique entity identifiers |
-| `freq_list` | iterable | List of entity frequencies |
-| `vector_list` | iterable | List of entity vectors |
+| Name | Type | Description |
+| ------------- | -------- | --------------------------------- |
+| `entity_list` | iterable | List of unique entity identifiers |
+| `freq_list` | iterable | List of entity frequencies |
+| `vector_list` | iterable | List of entity vectors |
## KnowledgeBase.add_alias {#add_alias tag="method"}
-Add an alias or mention to the knowledge base, specifying its potential KB identifiers
-and their prior probabilities. The entity identifiers should refer to entities previously
-added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
-The sum of the prior probabilities should not exceed 1.
+Add an alias or mention to the knowledge base, specifying its potential KB
+identifiers and their prior probabilities. The entity identifiers should refer
+to entities previously added with [`add_entity`](/api/kb#add_entity) or
+[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
+should not exceed 1.
> #### Example
>
@@ -87,11 +91,11 @@ The sum of the prior probabilities should not exceed 1.
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
> ```
-| Name | Type | Description |
-| -------------- | ------------- | -------------------------------------------------- |
-| `alias` | unicode | The textual mention or alias |
-| `entities` | iterable | The potential entities that the alias may refer to |
-| `probabilities`| iterable | The prior probabilities of each entity |
+| Name | Type | Description |
+| --------------- | -------- | -------------------------------------------------- |
+| `alias` | str | The textual mention or alias |
+| `entities` | iterable | The potential entities that the alias may refer to |
+| `probabilities` | iterable | The prior probabilities of each entity |
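
For orientation, a minimal end-to-end sketch that combines `add_entity` and
`add_alias`. The QIDs, frequencies and 3-dimensional vectors are illustrative
placeholders, not real pretrained values.

```python
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase

vocab = Vocab()
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)

# Register two entities with their corpus frequencies and entity vectors.
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 2.0, 3.0])
kb.add_entity(entity="Q463035", freq=111, entity_vector=[4.0, 5.0, 6.0])

# Link the mention "Douglas" to both entities; the priors sum to at most 1.
kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
```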
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
@@ -117,9 +121,9 @@ Get a list of all entity IDs in the knowledge base.
> all_entities = kb.get_entity_strings()
> ```
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | list | The list of entities in the knowledge base. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------- |
+| **RETURNS** | list | The list of entities in the knowledge base. |
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
@@ -131,9 +135,9 @@ Get the total number of aliases in the knowledge base.
> total_aliases = kb.get_size_aliases()
> ```
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | int | The number of aliases in the knowledge base. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------- |
+| **RETURNS** | int | The number of aliases in the knowledge base. |
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
@@ -145,9 +149,9 @@ Get a list of all aliases in the knowledge base.
> all_aliases = kb.get_alias_strings()
> ```
-| Name | Type | Description |
-| ----------- | ---- | --------------------------------------------- |
-| **RETURNS** | list | The list of aliases in the knowledge base. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| **RETURNS** | list | The list of aliases in the knowledge base. |
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
@@ -160,10 +164,10 @@ of type [`Candidate`](/api/kb/#candidate_init).
> candidates = kb.get_candidates("Douglas")
> ```
-| Name | Type | Description |
-| ------------- | ------------- | -------------------------------------------------- |
-| `alias` | unicode | The textual mention or alias |
-| **RETURNS** | iterable | The list of relevant `Candidate` objects |
+| Name | Type | Description |
+| ----------- | -------- | ---------------------------------------- |
+| `alias` | str | The textual mention or alias |
+| **RETURNS** | iterable | The list of relevant `Candidate` objects |
## KnowledgeBase.get_vector {#get_vector tag="method"}
@@ -175,15 +179,15 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> vector = kb.get_vector("Q42")
> ```
-| Name | Type | Description |
-| ------------- | ------------- | -------------------------------------------------- |
-| `entity` | unicode | The entity ID |
-| **RETURNS** | vector | The entity vector |
+| Name | Type | Description |
+| ----------- | ------ | ----------------- |
+| `entity` | str | The entity ID |
+| **RETURNS** | vector | The entity vector |
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
-Given a certain entity ID and a certain textual mention, retrieve
-the prior probability of the fact that the mention links to the entity ID.
+Given a certain entity ID and a certain textual mention, retrieve the prior
+probability of the fact that the mention links to the entity ID.
> #### Example
>
@@ -191,11 +195,11 @@ the prior probability of the fact that the mention links to the entity ID.
> probability = kb.get_prior_prob("Q42", "Douglas")
> ```
-| Name | Type | Description |
-| ------------- | ------------- | --------------------------------------------------------------- |
-| `entity` | unicode | The entity ID |
-| `alias` | unicode | The textual mention or alias |
-| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
+| Name | Type | Description |
+| ----------- | ----- | -------------------------------------------------------------- |
+| `entity` | str | The entity ID |
+| `alias` | str | The textual mention or alias |
+| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
## KnowledgeBase.dump {#dump tag="method"}
@@ -207,14 +211,14 @@ Save the current state of the knowledge base to a directory.
> kb.dump(loc)
> ```
-| Name | Type | Description |
-| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
-| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
-Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
-should also be the same as the one used to create the KB.
+Restore the state of the knowledge base from a given directory. Note that the
+[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
> #### Example
>
@@ -226,18 +230,16 @@ should also be the same as the one used to create the KB.
> kb.load_bulk("/path/to/kb")
> ```
-
-| Name | Type | Description |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
-
+| Name | Type | Description |
+| ----------- | --------------- | -------------------------------------------------------------------------- |
+| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
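
A sketch of the save/load round trip. The paths are placeholders, and as noted
above, the restoring `KnowledgeBase` must be created with the same `Vocab` and
vector length that were used to build the original KB.

```python
from spacy.vocab import Vocab
from spacy.kb import KnowledgeBase

vocab = Vocab()
kb = KnowledgeBase(vocab=vocab, entity_vector_length=3)
kb.add_entity(entity="Q42", freq=32, entity_vector=[1.0, 2.0, 3.0])

# Save both the KB and the vocabulary it was built with.
kb.dump("/path/to/kb")
vocab.to_disk("/path/to/vocab")

# Restore into a fresh KB that uses the same vocab and vector length.
vocab2 = Vocab().from_disk("/path/to/vocab")
kb2 = KnowledgeBase(vocab=vocab2, entity_vector_length=3)
kb2.load_bulk("/path/to/kb")
```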
## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly,
-but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
-of a `KnowledgeBase`.
+but instead these objects are returned by the
+[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`.
> #### Example
>
@@ -257,12 +259,12 @@ of a `KnowledgeBase`.
## Candidate attributes {#candidate_attributes}
-| Name | Type | Description |
-| ---------------------- | ------------ | ------------------------------------------------------------------ |
-| `entity` | int | The entity's unique KB identifier |
-| `entity_` | unicode | The entity's unique KB identifier |
-| `alias` | int | The alias or textual mention |
-| `alias_` | unicode | The alias or textual mention |
-| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
-| `entity_freq` | long | The frequency of the entity in a typical corpus |
-| `entity_vector` | vector | The pretrained vector of the entity |
+| Name | Type | Description |
+| --------------- | ------ | -------------------------------------------------------------- |
+| `entity` | int | The entity's unique KB identifier |
+| `entity_` | str | The entity's unique KB identifier |
+| `alias` | int | The alias or textual mention |
+| `alias_` | str | The alias or textual mention |
+| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
+| `entity_freq` | long | The frequency of the entity in a typical corpus |
+| `entity_vector` | vector | The pretrained vector of the entity |
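
To tie these attributes back to [`get_candidates`](/api/kb#get_candidates), a
small sketch, assuming a `kb` that already contains the alias `"Douglas"` as in
the examples above:

```python
for candidate in kb.get_candidates("Douglas"):
    # The underscored attributes are the human-readable string versions.
    print(candidate.entity_, candidate.alias_)
    print(candidate.prior_prob, candidate.entity_freq)
    print(candidate.entity_vector)
```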
diff --git a/website/docs/api/language.md b/website/docs/api/language.md
index 50689a7ef..e1991f260 100644
--- a/website/docs/api/language.md
+++ b/website/docs/api/language.md
@@ -49,11 +49,11 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------------------- |
-| `text` | unicode | The text to be processed. |
-| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| **RETURNS** | `Doc` | A container for accessing the annotations. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------------------- |
+| `text` | str | The text to be processed. |
+| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
+| **RETURNS** | `Doc` | A container for accessing the annotations. |
@@ -201,7 +201,7 @@ Create a pipeline component from a factory.
| Name | Type | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------- |
-| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
+| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
| `config` | dict | Configuration parameters to initialize component. |
| **RETURNS** | callable | The pipeline component. |
@@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`,
| Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `component` | callable | The pipeline component. |
-| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
-| `before` | unicode | Component name to insert component directly before. |
-| `after` | unicode | Component name to insert component directly after: |
+| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
+| `before` | str | Component name to insert component directly before. |
+| `after` | str | Component name to insert component directly after. |
| `first` | bool | Insert component first / not first in the pipeline. |
| `last` | bool | Insert component last / not last in the pipeline. |
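
As an illustration of the `name` and positional arguments, a minimal sketch
using a blank `English` pipeline and a no-op component (both are placeholders):

```python
from spacy.lang.en import English

def my_component(doc):
    # No-op component: receive the Doc and return it unchanged.
    return doc

nlp = English()
nlp.add_pipe(my_component, name="my_component", first=True)
print(nlp.pipe_names)  # ['my_component']
```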
@@ -243,10 +243,10 @@ Check whether a component is present in the pipeline. Equivalent to
> assert nlp.has_pipe("component")
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------------- |
-| `name` | unicode | Name of the pipeline component to check. |
-| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------------- |
+| `name` | str | Name of the pipeline component to check. |
+| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
## Language.get_pipe {#get_pipe tag="method" new="2"}
@@ -261,7 +261,7 @@ Get a pipeline component for a given component name.
| Name | Type | Description |
| ----------- | -------- | -------------------------------------- |
-| `name` | unicode | Name of the pipeline component to get. |
+| `name` | str | Name of the pipeline component to get. |
| **RETURNS** | callable | The pipeline component. |
## Language.replace_pipe {#replace_pipe tag="method" new="2"}
@@ -276,7 +276,7 @@ Replace a component in the pipeline.
| Name | Type | Description |
| ----------- | -------- | --------------------------------- |
-| `name` | unicode | Name of the component to replace. |
+| `name` | str | Name of the component to replace. |
| `component` | callable | The pipeline component to insert. |
## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@@ -292,10 +292,10 @@ added to the pipeline, you can also use the `name` argument on
> nlp.rename_pipe("parser", "spacy_parser")
> ```
-| Name | Type | Description |
-| ---------- | ------- | -------------------------------- |
-| `old_name` | unicode | Name of the component to rename. |
-| `new_name` | unicode | New name of the component. |
+| Name | Type | Description |
+| ---------- | ---- | -------------------------------- |
+| `old_name` | str | Name of the component to rename. |
+| `new_name` | str | New name of the component. |
## Language.remove_pipe {#remove_pipe tag="method" new="2"}
@@ -309,10 +309,10 @@ component function.
> assert name == "parser"
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------- |
-| `name` | unicode | Name of the component to remove. |
-| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
+| Name | Type | Description |
+| ----------- | ----- | ----------------------------------------------------- |
+| `name` | str | Name of the component to remove. |
+| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
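
Continuing the sketch under `Language.add_pipe` above, the pipe management
methods compose as follows:

```python
assert nlp.has_pipe("my_component")
component = nlp.get_pipe("my_component")

nlp.rename_pipe("my_component", "noop")
name, removed = nlp.remove_pipe("noop")
assert name == "noop" and removed is component
```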
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
@@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled.
| Name | Type | Description |
| ----------- | --------------- | ------------------------------------------------------------------------------------ |
| `disable` | list | Names of pipeline components to disable. |
-| `disable` | unicode | Name of pipeline component to disable. |
+| `disable` | str | Name of pipeline component to disable. |
| `enable` | list | Names of pipeline components that will not be disabled. |
-| `enable` | unicode | Name of pipeline component that will not be disabled. |
+| `enable` | str | Name of pipeline component that will not be disabled. |
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
-
As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
@@ -370,10 +369,10 @@ the model**.
> nlp.to_disk("/path/to/models")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
## Language.from_disk {#from_disk tag="method" new="2"}
@@ -395,11 +394,11 @@ loaded object.
> nlp = English().from_disk("/path/to/en_model")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Language` | The modified `Language` object. |
+| Name | Type | Description |
+| ----------- | ------------ | ----------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Language` | The modified `Language` object. |
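
A sketch of the disk round trip for a blank pipeline (the path is a
placeholder):

```python
from spacy.lang.en import English

nlp = English()
nlp.to_disk("/path/to/en_model")
nlp_restored = English().from_disk("/path/to/en_model")
```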
@@ -480,11 +479,11 @@ per component.
## Class attributes {#class-attributes}
-| Name | Type | Description |
-| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- |
-| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
-| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
-| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
+| Name | Type | Description |
+| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- |
+| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
+| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
+| `factories` 2 | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
## Serialization fields {#serialization-fields}
diff --git a/website/docs/api/lemmatizer.md b/website/docs/api/lemmatizer.md
index f43e17fd3..16cd624f5 100644
--- a/website/docs/api/lemmatizer.md
+++ b/website/docs/api/lemmatizer.md
@@ -63,8 +63,8 @@ Lemmatize a string.
| Name | Type | Description |
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to lemmatize, e.g. the token text. |
-| `univ_pos` | unicode / int | The token's universal part-of-speech tag. |
+| `string` | str | The string to lemmatize, e.g. the token text. |
+| `univ_pos` | str / int | The token's universal part-of-speech tag. |
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
| **RETURNS** | list | The available lemmas for the string. |
@@ -82,11 +82,11 @@ original string is returned. Languages can provide a
> assert lemmatizer.lookup("going") == "go"
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to look up. |
-| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
-| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
+| `string` | str | The string to look up. |
+| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
+| **RETURNS** | str | The lemma if the string was found, otherwise the original string. |
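
A minimal sketch of a lookup-only lemmatizer, assuming a hand-rolled
`lemma_lookup` table instead of the tables shipped in `spacy-lookups-data`:

```python
from spacy.lookups import Lookups
from spacy.lemmatizer import Lemmatizer

lookups = Lookups()
lookups.add_table("lemma_lookup", {"going": "go"})
lemmatizer = Lemmatizer(lookups)

assert lemmatizer.lookup("going") == "go"
assert lemmatizer.lookup("missing") == "missing"  # falls back to the input
```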
## Lemmatizer.is_base_form {#is_base_form tag="method"}
@@ -102,11 +102,11 @@ lemmatization entirely.
> assert is_base_form == True
> ```
-| Name | Type | Description |
-| ------------ | ------------- | --------------------------------------------------------------------------------------- |
-| `univ_pos` | unicode / int | The token's universal part-of-speech tag. |
-| `morphology` | dict | The token's morphological features. |
-| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
+| Name | Type | Description |
+| ------------ | --------- | --------------------------------------------------------------------------------------- |
+| `univ_pos` | str / int | The token's universal part-of-speech tag. |
+| `morphology` | dict | The token's morphological features. |
+| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
## Attributes {#attributes}
diff --git a/website/docs/api/lexeme.md b/website/docs/api/lexeme.md
index feb167a9d..39148e476 100644
--- a/website/docs/api/lexeme.md
+++ b/website/docs/api/lexeme.md
@@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
| Name | Type | Description |
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | The lexeme's vocabulary. |
-| `text` | unicode | Verbatim text content. |
+| `text` | str | Verbatim text content. |
| `orth` | int | ID of the verbatim text content. |
-| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
+| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
| `flags` | int | Container of the lexeme's binary flags. |
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
-| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. |
+| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. |
| `lower` | int | Lowercase form of the word. |
-| `lower_` | unicode | Lowercase form of the word. |
+| `lower_` | str | Lowercase form of the word. |
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
-| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
+| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
-| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. |
+| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
-| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. |
+| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. |
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `all(ord(c) < 128 for c in lexeme.text)`. |
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
@@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
| `lang` | int | Language of the parent vocabulary. |
-| `lang_` | unicode | Language of the parent vocabulary. |
+| `lang_` | str | Language of the parent vocabulary. |
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `cluster` | int | Brown cluster ID. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
diff --git a/website/docs/api/lookups.md b/website/docs/api/lookups.md
index bd3b38303..b91d92646 100644
--- a/website/docs/api/lookups.md
+++ b/website/docs/api/lookups.md
@@ -56,10 +56,10 @@ Check if the lookups contain a table of a given name. Delegates to
> assert "some_table" in lookups
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------- |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | bool | Whether a table of that name is in the lookups. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------- |
+| `name` | str | Name of the table. |
+| **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.tables {#tables tag="property"}
@@ -91,7 +91,7 @@ exists.
| Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------------- |
-| `name` | unicode | Unique name of the table. |
+| `name` | str | Unique name of the table. |
| `data` | dict | Optional data to add to the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
@@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
| Name | Type | Description |
| ----------- | ----------------------------- | ------------------ |
-| `name` | unicode | Name of the table. |
+| `name` | str | Name of the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The table. |
## Lookups.remove_table {#remove_table tag="method"}
@@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
| Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------- |
-| `name` | unicode | Name of the table to remove. |
+| `name` | str | Name of the table to remove. |
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
## Lookups.has_table {#has_table tag="method"}
@@ -144,10 +144,10 @@ Check if the lookups contain a table of a given name. Equivalent to
> assert lookups.has_table("some_table")
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------- |
-| `name` | unicode | Name of the table. |
-| **RETURNS** | bool | Whether a table of that name is in the lookups. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------- |
+| `name` | str | Name of the table. |
+| **RETURNS** | bool | Whether a table of that name is in the lookups. |
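
A short sketch tying the table methods together (the table name and data are
arbitrary):

```python
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("some_table", {"foo": "bar"})

assert "some_table" in lookups
assert lookups.has_table("some_table")

table = lookups.get_table("some_table")
assert table["foo"] == "bar"  # string keys are hashed internally

lookups.remove_table("some_table")
assert not lookups.has_table("some_table")
```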
## Lookups.to_bytes {#to_bytes tag="method"}
@@ -191,9 +191,9 @@ which will be created if it doesn't exist.
> lookups.to_disk("/path/to/lookups")
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Lookups.from_disk {#from_disk tag="method"}
@@ -208,10 +208,10 @@ the file doesn't exist.
> lookups.from_disk("/path/to/lookups")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Lookups` | The loaded lookups. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `Lookups` | The loaded lookups. |
## Table {#table tag="class, ordereddict"}
@@ -238,7 +238,7 @@ Initialize a new table.
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
-| `name` | unicode | Optional table name for reference. |
+| `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.from_dict {#table.from_dict tag="classmethod"}
@@ -256,7 +256,7 @@ Initialize a new table from a dict.
| Name | Type | Description |
| ----------- | ------- | ---------------------------------- |
| `data` | dict | The dictionary. |
-| `name` | unicode | Optional table name for reference. |
+| `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. |
### Table.set {#table.set tag="method"}
@@ -273,10 +273,10 @@ Set a new key / value pair. String keys will be hashed. Same as
> assert table["foo"] == "bar"
> ```
-| Name | Type | Description |
-| ------- | ------------- | ----------- |
-| `key` | unicode / int | The key. |
-| `value` | - | The value. |
+| Name | Type | Description |
+| ------- | --------- | ----------- |
+| `key` | str / int | The key. |
+| `value` | - | The value. |
### Table.to_bytes {#table.to_bytes tag="method"}
@@ -313,6 +313,6 @@ Load a table from a bytestring.
| Name | Type | Description |
| -------------- | --------------------------- | ----------------------------------------------------- |
-| `name` | unicode | Table name. |
+| `name` | str | Table name. |
| `default_size` | int | Default size of bloom filters if no data is provided. |
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |
diff --git a/website/docs/api/matcher.md b/website/docs/api/matcher.md
index bfd4fb0ec..8a872558c 100644
--- a/website/docs/api/matcher.md
+++ b/website/docs/api/matcher.md
@@ -125,10 +125,10 @@ Check whether the matcher contains rules for a match ID.
> assert 'Rule' in matcher
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------- |
-| `key` | unicode | The match ID. |
-| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## Matcher.add {#add tag="method" new="2"}
@@ -153,7 +153,7 @@ overwritten.
| Name | Type | Description |
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | unicode | An ID for the thing you're matching. |
+| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
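
For reference, a compact sketch of the add-then-match flow, using a blank
`English` pipeline and a toy pattern (both are placeholders):

```python
from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", None, pattern)  # no on_match callback

doc = nlp("Hello, world! Hello world!")
for match_id, start, end in matcher(doc):
    print(nlp.vocab.strings[match_id], doc[start:end].text)
```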
@@ -188,9 +188,9 @@ exist.
> assert "Rule" not in matcher
> ```
-| Name | Type | Description |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
## Matcher.get {#get tag="method" new="2"}
@@ -204,7 +204,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
> on_match, patterns = matcher.get("Rule")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------- |
-| `key` | unicode | The ID of the match rule. |
-| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------- |
+| `key` | str | The ID of the match rule. |
+| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
diff --git a/website/docs/api/phrasematcher.md b/website/docs/api/phrasematcher.md
index a72277420..fa6729f41 100644
--- a/website/docs/api/phrasematcher.md
+++ b/website/docs/api/phrasematcher.md
@@ -133,10 +133,10 @@ Check whether the matcher contains rules for a match ID.
> assert "OBAMA" in matcher
> ```
-| Name | Type | Description |
-| ----------- | ------- | ----------------------------------------------------- |
-| `key` | unicode | The match ID. |
-| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
+| Name | Type | Description |
+| ----------- | ---- | ----------------------------------------------------- |
+| `key` | str | The match ID. |
+| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## PhraseMatcher.add {#add tag="method"}
@@ -162,7 +162,7 @@ overwritten.
| Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
-| `match_id` | unicode | An ID for the thing you're matching. |
+| `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
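
A sketch of the equivalent flow for the `PhraseMatcher`, where the patterns are
`Doc` objects (the blank `English` pipeline and phrases are placeholders):

```python
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher

nlp = English()
matcher = PhraseMatcher(nlp.vocab)
matcher.add("OBAMA", None, nlp("Barack Obama"))

doc = nlp("Barack Obama urges Congress to find courage to defend his legacy")
matches = matcher(doc)  # list of (match_id, start, end) tuples
```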
@@ -198,6 +198,6 @@ does not exist.
> assert "OBAMA" not in matcher
> ```
-| Name | Type | Description |
-| ----- | ------- | ------------------------- |
-| `key` | unicode | The ID of the match rule. |
+| Name | Type | Description |
+| ----- | ---- | ------------------------- |
+| `key` | str | The ID of the match rule. |
diff --git a/website/docs/api/pipeline-functions.md b/website/docs/api/pipeline-functions.md
index 6e2b473b1..fc417845c 100644
--- a/website/docs/api/pipeline-functions.md
+++ b/website/docs/api/pipeline-functions.md
@@ -112,8 +112,8 @@ end of the pipeline and after all other components.
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------ |
-| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
-| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. |
-| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------ |
+| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
+| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. |
+| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
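
Assuming the function is importable from `spacy.pipeline` (as in spaCy v2.x)
and that a dependency parser predicting `"subtok"` labels runs earlier in the
pipeline, usage would look roughly like this sketch (the model name is a
placeholder):

```python
import spacy
from spacy.pipeline import merge_subtokens

nlp = spacy.load("en_core_web_sm")  # assumes a model with a parser
nlp.add_pipe(merge_subtokens, after="parser")

# Tokens connected by the "subtok" dependency label are merged on the fly.
doc = nlp("Some text to process")
```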
diff --git a/website/docs/api/sentencizer.md b/website/docs/api/sentencizer.md
index c9b935f22..03e843fcc 100644
--- a/website/docs/api/sentencizer.md
+++ b/website/docs/api/sentencizer.md
@@ -81,9 +81,9 @@ a file `sentencizer.json`. This also happens automatically when you save an
> sentencizer.to_disk("/path/to/sentencizer.jsonl")
> ```
-| Name | Type | Description |
-| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Sentencizer.from_disk {#from_disk tag="method"}
@@ -98,10 +98,10 @@ added to its pipeline.
> sentencizer.from_disk("/path/to/sentencizer.json")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
+| Name | Type | Description |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
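
A sketch of the disk round trip (the path is a placeholder; note that the
sentencizer is saved to a single JSON file, not a directory):

```python
from spacy.pipeline import Sentencizer

sentencizer = Sentencizer()
sentencizer.to_disk("/path/to/sentencizer.json")

sentencizer = Sentencizer().from_disk("/path/to/sentencizer.json")
```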
## Sentencizer.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/span.md b/website/docs/api/span.md
index 3833bbca9..c41d9aa03 100644
--- a/website/docs/api/span.md
+++ b/website/docs/api/span.md
@@ -110,7 +110,7 @@ For details, see the documentation on
| Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
-| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
+| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@@ -132,10 +132,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
> assert extension == (False, None, None, None)
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
+| Name | Type | Description |
+| ----------- | ----- | ------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Span.has_extension {#has_extension tag="classmethod" new="2"}
@@ -149,10 +149,10 @@ Check whether an extension has been registered on the `Span` class.
> assert Span.has_extension("is_city")
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------ |
-| `name` | unicode | Name of the extension to check. |
-| **RETURNS** | bool | Whether the extension has been registered. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------ |
+| `name` | str | Name of the extension to check. |
+| **RETURNS** | bool | Whether the extension has been registered. |
## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@@ -167,10 +167,10 @@ Remove a previously registered extension.
> assert not Span.has_extension("is_city")
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------------------------------------- |
-| `name` | unicode | Name of the extension. |
-| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------------------------------------- |
+| `name` | str | Name of the extension. |
+| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
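
The extension classmethods compose as in this sketch (the extension name is
arbitrary):

```python
from spacy.tokens import Span

Span.set_extension("is_city", default=False)
assert Span.has_extension("is_city")

default, method, getter, setter = Span.get_extension("is_city")
assert default is False

Span.remove_extension("is_city")
assert not Span.has_extension("is_city")
```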
## Span.char_span {#char_span tag="method" new="2.2.4"}
@@ -497,16 +497,16 @@ The L2 norm of the span's vector representation.
| `end` | int | The token offset for the end of the span. |
| `start_char` | int | The character offset for the start of the span. |
| `end_char` | int | The character offset for the end of the span. |
-| `text` | unicode | A unicode representation of the span text. |
-| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. |
+| `text` | str | A unicode representation of the span text. |
+| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. |
| `orth` | int | ID of the verbatim text content. |
-| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
+| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
| `label` | int | The hash value of the span's label. |
-| `label_` | unicode | The span's label. |
-| `lemma_` | unicode | The span's lemma. |
+| `label_` | str | The span's label. |
+| `lemma_` | str | The span's lemma. |
| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. |
-| `kb_id_` | unicode | The knowledge base ID referred to by the span. |
+| `kb_id_` | str | The knowledge base ID referred to by the span. |
| `ent_id` | int | The hash value of the named entity the token is an instance of. |
-| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. |
+| `ent_id_` | str | The string ID of the named entity the token is an instance of. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
diff --git a/website/docs/api/stringstore.md b/website/docs/api/stringstore.md
index 268f19125..922174c78 100644
--- a/website/docs/api/stringstore.md
+++ b/website/docs/api/stringstore.md
@@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa.
| Name | Type | Description |
| -------------- | ------------------------ | -------------------------- |
| `string_or_id` | bytes, unicode or uint64 | The value to encode. |
-| **RETURNS** | unicode or int | The value to be retrieved. |
+| **RETURNS** | str or int | The value to be retrieved. |
## StringStore.\_\_contains\_\_ {#contains tag="method"}
@@ -69,10 +69,10 @@ Check whether a string is in the store.
> assert not "cherry" in stringstore
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------- |
-| `string` | unicode | The string to check. |
-| **RETURNS** | bool | Whether the store contains the string. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------- |
+| `string` | str | The string to check. |
+| **RETURNS** | bool | Whether the store contains the string. |
## StringStore.\_\_iter\_\_ {#iter tag="method"}
@@ -87,9 +87,9 @@ store will always include an empty string `''` at position `0`.
> assert all_strings == ["apple", "orange"]
> ```
-| Name | Type | Description |
-| ---------- | ------- | ---------------------- |
-| **YIELDS** | unicode | A string in the store. |
+| Name | Type | Description |
+| ---------- | ---- | ---------------------- |
+| **YIELDS** | str | A string in the store. |
## StringStore.add {#add tag="method" new="2"}
@@ -106,10 +106,10 @@ Add a string to the `StringStore`.
> assert stringstore["banana"] == banana_hash
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------------ |
-| `string` | unicode | The string to add. |
-| **RETURNS** | uint64 | The string's hash value. |
+| Name | Type | Description |
+| ----------- | ------ | ------------------------ |
+| `string` | str | The string to add. |
+| **RETURNS** | uint64 | The string's hash value. |
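
A short sketch of adding and retrieving strings:

```python
from spacy.strings import StringStore

stringstore = StringStore(["apple", "orange"])
banana_hash = stringstore.add("banana")

assert stringstore[banana_hash] == "banana"  # hash -> string
assert stringstore["banana"] == banana_hash  # string -> hash
assert "banana" in stringstore
```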
## StringStore.to_disk {#to_disk tag="method" new="2"}
@@ -121,9 +121,9 @@ Save the current state to a directory.
> stringstore.to_disk("/path/to/strings")
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## StringStore.from_disk {#from_disk tag="method" new="2"}
@@ -136,10 +136,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> stringstore = StringStore().from_disk("/path/to/strings")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `StringStore` | The modified `StringStore` object. |
+| Name | Type | Description |
+| ----------- | ------------- | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `StringStore` | The modified `StringStore` object. |
## StringStore.to_bytes {#to_bytes tag="method"}
@@ -185,7 +185,7 @@ Get a 64-bit hash for a given string.
> assert hash_string("apple") == 8566208034543834098
> ```
-| Name | Type | Description |
-| ----------- | ------- | ------------------- |
-| `string` | unicode | The string to hash. |
-| **RETURNS** | uint64 | The hash. |
+| Name | Type | Description |
+| ----------- | ------ | ------------------- |
+| `string` | str | The string to hash. |
+| **RETURNS** | uint64 | The hash. |
diff --git a/website/docs/api/tagger.md b/website/docs/api/tagger.md
index bd3382f89..f14da3ac5 100644
--- a/website/docs/api/tagger.md
+++ b/website/docs/api/tagger.md
@@ -229,10 +229,10 @@ Add a new label to the pipe.
> tagger.add_label("MY_LABEL", {POS: 'NOUN'})
> ```
-| Name | Type | Description |
-| -------- | ------- | --------------------------------------------------------------- |
-| `label` | unicode | The label to add. |
-| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
+| Name | Type | Description |
+| -------- | ---- | --------------------------------------------------------------- |
+| `label` | str | The label to add. |
+| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
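
A minimal sketch of adding a label to a freshly created tagger before training;
the label and tag map value are placeholders:

```python
from spacy.lang.en import English
from spacy.symbols import POS

nlp = English()
tagger = nlp.create_pipe("tagger")
tagger.add_label("MY_LABEL", {POS: "NOUN"})
nlp.add_pipe(tagger)
```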
## Tagger.to_disk {#to_disk tag="method"}
@@ -245,10 +245,10 @@ Serialize the pipe to disk.
> tagger.to_disk("/path/to/tagger")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tagger.from_disk {#from_disk tag="method"}
@@ -261,11 +261,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
> tagger.from_disk("/path/to/tagger")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Tagger` | The modified `Tagger` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Tagger` | The modified `Tagger` object. |
## Tagger.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/textcategorizer.md b/website/docs/api/textcategorizer.md
index 1a0280265..dc1c083ac 100644
--- a/website/docs/api/textcategorizer.md
+++ b/website/docs/api/textcategorizer.md
@@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and
| `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. |
-| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
+| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
| **RETURNS** | `TextCategorizer` | The newly constructed object. |
### Architectures {#architectures new="2.1"}
@@ -247,9 +247,9 @@ Add a new label to the pipe.
> textcat.add_label("MY_LABEL")
> ```
-| Name | Type | Description |
-| ------- | ------- | ----------------- |
-| `label` | unicode | The label to add. |
+| Name | Type | Description |
+| ------- | ---- | ----------------- |
+| `label` | str | The label to add. |
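
Analogously for the text categorizer, a small sketch using a blank pipeline
(the labels are placeholders):

```python
from spacy.lang.en import English

nlp = English()
textcat = nlp.create_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
nlp.add_pipe(textcat)
```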
## TextCategorizer.to_disk {#to_disk tag="method"}
@@ -262,10 +262,10 @@ Serialize the pipe to disk.
> textcat.to_disk("/path/to/textcat")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## TextCategorizer.from_disk {#from_disk tag="method"}
@@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description |
| ----------- | ----------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
diff --git a/website/docs/api/tokenizer.md b/website/docs/api/tokenizer.md
index 7462af739..c71f849ad 100644
--- a/website/docs/api/tokenizer.md
+++ b/website/docs/api/tokenizer.md
@@ -34,15 +34,15 @@ the
> tokenizer = nlp.Defaults.create_tokenizer(nlp)
> ```
-| Name | Type | Description |
-| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | A storage container for lexical types. |
-| `rules` | dict | Exceptions and special-cases for the tokenizer. |
-| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
-| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
-| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
-| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. |
-| **RETURNS** | `Tokenizer` | The newly constructed object. |
+| Name | Type | Description |
+| ---------------- | ----------- | ------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | A storage container for lexical types. |
+| `rules` | dict | Exceptions and special-cases for the tokenizer. |
+| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
+| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
+| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
+| `token_match` | callable | A function matching the signature of `re.compile(string).match` to find token matches. |
+| **RETURNS** | `Tokenizer` | The newly constructed object. |
## Tokenizer.\_\_call\_\_ {#call tag="method"}
@@ -55,10 +55,10 @@ Tokenize a string.
> assert len(tokens) == 4
> ```
-| Name | Type | Description |
-| ----------- | ------- | --------------------------------------- |
-| `string` | unicode | The string to tokenize. |
-| **RETURNS** | `Doc` | A container for linguistic annotations. |
+| Name | Type | Description |
+| ----------- | ----- | --------------------------------------- |
+| `string` | str | The string to tokenize. |
+| **RETURNS** | `Doc` | A container for linguistic annotations. |
## Tokenizer.pipe {#pipe tag="method"}
@@ -82,20 +82,20 @@ Tokenize a stream of texts.
Find internal split points of the string.
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `string` | unicode | The string to split. |
-| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `string` | str | The string to split. |
+| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
## Tokenizer.find_prefix {#find_prefix tag="method"}
Find the length of a prefix that should be segmented from the string, or `None`
if no prefix rules match.
-| Name | Type | Description |
-| ----------- | ------- | ------------------------------------------------------ |
-| `string` | unicode | The string to segment. |
-| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
+| Name | Type | Description |
+| ----------- | ---- | ------------------------------------------------------ |
+| `string` | str | The string to segment. |
+| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
## Tokenizer.find_suffix {#find_suffix tag="method"}
@@ -104,7 +104,7 @@ if no suffix rules match.
| Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------ |
-| `string` | unicode | The string to segment. |
+| `string` | str | The string to segment. |
| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |
## Tokenizer.add_special_case {#add_special_case tag="method"}
@@ -125,7 +125,7 @@ and examples.
| Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| `string` | unicode | The string to specially tokenize. |
+| `string` | str | The string to specially tokenize. |
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
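For instance, a special case that splits "don't" into two tokens might look like the following sketch (the `NORM` override is optional, and the `ORTH` values concatenate back to exactly the original string):

```python
from spacy.attrs import ORTH, NORM

special_case = [{ORTH: "do"}, {ORTH: "n't", NORM: "not"}]
nlp.tokenizer.add_special_case("don't", special_case)
```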
## Tokenizer.explain {#explain tag="method"}
@@ -142,10 +142,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
> ```
-| Name | Type | Description |
-| ------------| -------- | --------------------------------------------------- |
-| `string` | unicode | The string to tokenize with the debugging tokenizer |
-| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
+| Name | Type | Description |
+| ----------- | ---- | --------------------------------------------------- |
+| `string` | str | The string to tokenize with the debugging tokenizer |
+| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
## Tokenizer.to_disk {#to_disk tag="method"}
@@ -158,10 +158,10 @@ Serialize the tokenizer to disk.
> tokenizer.to_disk("/path/to/tokenizer")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tokenizer.from_disk {#from_disk tag="method"}
@@ -174,11 +174,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
> tokenizer.from_disk("/path/to/tokenizer")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
## Tokenizer.to_bytes {#to_bytes tag="method"}
@@ -217,14 +217,14 @@ it.
## Attributes {#attributes}
-| Name | Type | Description |
-| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
-| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
-| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
-| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
-| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
-| `token_match` | - | A function matching the signature of `re.compile(string).match to find token matches. Returns an `re.MatchObject` or `None. |
-| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
+| Name | Type | Description |
+| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
+| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
+| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
+| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
+| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
+| `token_match`    | -       | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
+| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
## Serialization fields {#serialization-fields}
diff --git a/website/docs/api/top-level.md b/website/docs/api/top-level.md
index 2360ad472..bdd094021 100644
--- a/website/docs/api/top-level.md
+++ b/website/docs/api/top-level.md
@@ -32,11 +32,11 @@ class. The data will be loaded in via
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | --------------------------------------------------------------------------------- |
-| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. |
-| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
-| **RETURNS** | `Language` | A `Language` object with the loaded model. |
+| Name | Type | Description |
+| ----------- | ------------ | --------------------------------------------------------------------------------- |
+| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. |
+| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
+| **RETURNS** | `Language` | A `Language` object with the loaded model. |
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
and pipeline components from a model's `meta.json`, initializes the `Language`
@@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of
| Name | Type | Description |
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
-| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
+| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
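A quick sketch of the difference from `spacy.load()`: `spacy.blank()` needs no installed model package, only the language code:

```python
import spacy

nlp = spacy.blank("en")           # blank English pipeline, no trained components
doc = nlp("This is a sentence.")  # tokenization still works out of the box
```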
@@ -98,10 +98,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
> spacy.info("de", markdown=True)
> ```
-| Name | Type | Description |
-| ---------- | ------- | ------------------------------------------------------------- |
-| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). |
-| `markdown` | bool | Print information as Markdown. |
+| Name | Type | Description |
+| ---------- | ---- | ------------------------------------------------------------- |
+| `model` | str | A model, i.e. shortcut link, package name or path (optional). |
+| `markdown` | bool | Print information as Markdown. |
### spacy.explain {#spacy.explain tag="function"}
@@ -122,10 +122,10 @@ list of available terms, see
> # world NN noun, singular or mass
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------------- |
-| `term` | unicode | Term to explain. |
-| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------------- |
+| `term` | str | Term to explain. |
+| **RETURNS** | str | The explanation, or `None` if not found in the glossary. |
### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
@@ -189,13 +189,13 @@ browser. Will run a simple web server.
| Name | Type | Description | Default |
| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
-| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
+| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `True` |
| `minify` | bool | Minify HTML markup. | `False` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| `port` | int | Port to serve visualization. | `5000` |
-| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
+| `host` | str | Host to serve visualization. | `'0.0.0.0'` |
### displacy.render {#displacy.render tag="method" new="2"}
@@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization.
| Name | Type | Description | Default |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
-| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
+| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `False` |
| `minify` | bool | Minify HTML markup. | `False` |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
-| **RETURNS** | unicode | Rendered HTML markup. |
+| **RETURNS** | str | Rendered HTML markup. |
### Visualizer options {#displacy_options}
@@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="dep", options=options)
> ```
-| Name | Type | Description | Default |
-| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
-| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
-| `add_lemma` 2.2.4 | bool | Print the lemma's in a separate row below the token texts. | `False` |
-| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
-| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
-| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
-| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` |
-| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` |
-| `font` | unicode | Font name or font family for all text. | `'Arial'` |
-| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
-| `arrow_stroke` | int | Width of arrow path in px. | `2` |
-| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
-| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
-| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
-| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
+| Name | Type | Description | Default |
+| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
+| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
+| `add_lemma` 2.2.4 | bool | Print the lemmas in a separate row below the token texts.                                         | `False`                 |
+| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
+| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
+| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
+| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` |
+| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` |
+| `font` | str | Font name or font family for all text. | `'Arial'` |
+| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
+| `arrow_stroke` | int | Width of arrow path in px. | `2` |
+| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
+| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
+| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
+| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
#### Named Entity Visualizer options {#displacy_options-ent}
@@ -263,11 +263,11 @@ If a setting is not present in the options, the default value will be used.
> displacy.serve(doc, style="ent", options=options)
> ```
-| Name | Type | Description | Default |
-| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
-| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
-| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
-| `template` 2.2 | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
+| Name | Type | Description | Default |
+| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
+| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
+| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
+| `template` 2.2 | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
By default, displaCy comes with colors for all
[entity types supported by spaCy](/api/annotation#named-entities). If you're
@@ -308,9 +308,9 @@ Set custom path to the data directory where spaCy looks for models.
> # PosixPath('/custom/path')
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------- |
-| `path` | unicode / `Path` | Path to new data directory. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------- |
+| `path` | str / `Path` | Path to new data directory. |
### util.get_lang_class {#util.get_lang_class tag="function"}
@@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
| Name | Type | Description |
| ----------- | ---------- | -------------------------------------- |
-| `lang` | unicode | Two-letter language code, e.g. `'en'`. |
+| `lang` | str | Two-letter language code, e.g. `'en'`. |
| **RETURNS** | `Language` | Language class. |
### util.set_lang_class {#util.set_lang_class tag="function"}
@@ -352,7 +352,7 @@ the two-letter language code.
| Name | Type | Description |
| ------ | ---------- | -------------------------------------- |
-| `name` | unicode | Two-letter language code, e.g. `'en'`. |
+| `name` | str | Two-letter language code, e.g. `'en'`. |
| `cls` | `Language` | The language class, e.g. `English`. |
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
@@ -368,10 +368,10 @@ loaded lazily, to avoid expensive setup code associated with the language data.
> assert util.lang_class_is_loaded("de") is False
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------- |
-| `name` | unicode | Two-letter language code, e.g. `'en'`. |
-| **RETURNS** | bool | Whether the class has been loaded. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------- |
+| `name` | str | Two-letter language code, e.g. `'en'`. |
+| **RETURNS** | bool | Whether the class has been loaded. |
### util.load_model {#util.load_model tag="function" new="2"}
@@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk).
| Name | Type | Description |
| ------------- | ---------- | -------------------------------------------------------- |
-| `name` | unicode | Package name, shortcut link or model path. |
+| `name` | str | Package name, shortcut link or model path. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
@@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet.
| Name | Type | Description |
| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
-| `model_path` | unicode | Path to model data directory. |
+| `model_path` | str | Path to model data directory. |
| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
@@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's
| Name | Type | Description |
| ------------- | ---------- | -------------------------------------------------------- |
-| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. |
+| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. |
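In a model package's `__init__.py`, this typically reduces to a tiny wrapper along these lines (a sketch following the package template):

```python
# __init__.py of a model package (illustrative)
from spacy.util import load_model_from_init_py

def load(**overrides):
    return load_model_from_init_py(__file__, **overrides)
```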
@@ -446,10 +446,10 @@ Get a model's meta.json from a directory path and validate its contents.
> meta = util.get_model_meta("/path/to/model")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | ------------------------ |
-| `path` | unicode / `Path` | Path to model directory. |
-| **RETURNS** | dict | The model's meta data. |
+| Name | Type | Description |
+| ----------- | ------------ | ------------------------ |
+| `path` | str / `Path` | Path to model directory. |
+| **RETURNS** | dict | The model's meta data. |
### util.is_package {#util.is_package tag="function"}
@@ -463,10 +463,10 @@ Check if string maps to a package installed via pip. Mainly used to validate
> util.is_package("xyz") # False
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------- |
-| `name` | unicode | Name of package. |
-| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
+| Name | Type | Description |
+| ----------- | ------ | -------------------------------------------- |
+| `name` | str | Name of package. |
+| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
### util.get_package_path {#util.get_package_path tag="function" new="2"}
@@ -480,10 +480,10 @@ Get path to an installed package. Mainly used to resolve the location of
> # /usr/lib/python3.6/site-packages/en_core_web_sm
> ```
-| Name | Type | Description |
-| -------------- | ------- | -------------------------------- |
-| `package_name` | unicode | Name of installed package. |
-| **RETURNS** | `Path` | Path to model package directory. |
+| Name | Type | Description |
+| -------------- | ------ | -------------------------------- |
+| `package_name` | str | Name of installed package. |
+| **RETURNS** | `Path` | Path to model package directory. |
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
diff --git a/website/docs/api/vectors.md b/website/docs/api/vectors.md
index a4c36f8cd..939cc8655 100644
--- a/website/docs/api/vectors.md
+++ b/website/docs/api/vectors.md
@@ -35,7 +35,7 @@ you can add vectors to later.
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
| `keys` | iterable | A sequence of keys aligned with the data. |
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
-| `name` | unicode | A name to identify the vectors table. |
+| `name` | str | A name to identify the vectors table. |
| **RETURNS** | `Vectors` | The newly created object. |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the
| Name | Type | Description |
| ----------- | ---------------------------------- | ----------------------------------------------------- |
-| `key` | unicode / int | The key to add. |
+| `key` | str / int | The key to add. |
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
| `row` | int | An optional row number of a vector to map the key to. |
| **RETURNS** | int | The row the vector was added to. |
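As a sketch, adding a 300-dimensional vector for a new key might look like this, assuming an `nlp` object is already loaded (the random vector is a stand-in for real data):

```python
import numpy

vector = numpy.random.uniform(-1, 1, (300,))
key = nlp.vocab.strings["cat"]             # string keys are accepted as well
row = nlp.vocab.vectors.add(key, vector=vector)
```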
@@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa.
| Name | Type | Description |
| ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
-| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. |
+| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
| `row` | int | Find the first key that points to the row. Returns int. |
| `rows` | iterable | Find the keys that point to the rows. Returns ndarray. |
@@ -337,9 +337,9 @@ Save the current state to a directory.
>
> ```
-| Name | Type | Description |
-| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| Name | Type | Description |
+| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Vectors.from_disk {#from_disk tag="method"}
@@ -352,10 +352,10 @@ Loads state from a directory. Modifies the object in place and returns it.
> vectors.from_disk("/path/to/vectors")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| **RETURNS** | `Vectors` | The modified `Vectors` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| **RETURNS** | `Vectors` | The modified `Vectors` object. |
## Vectors.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/api/vocab.md b/website/docs/api/vocab.md
index e024ab54a..b851f6882 100644
--- a/website/docs/api/vocab.md
+++ b/website/docs/api/vocab.md
@@ -27,7 +27,7 @@ Create the vocabulary.
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
-| `vectors_name` 2.2 | unicode | A name to identify the vectors table. |
+| `vectors_name` 2.2 | str | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"}
@@ -91,10 +91,10 @@ given string, you need to look it up in
> assert oov not in nlp.vocab
> ```
-| Name | Type | Description |
-| ----------- | ------- | -------------------------------------------------- |
-| `string` | unicode | The ID string. |
-| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
+| Name | Type | Description |
+| ----------- | ---- | -------------------------------------------------- |
+| `string` | str | The ID string. |
+| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
## Vocab.add_flag {#add_flag tag="method"}
@@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.
| Name | Type | Description |
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
-| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. |
+| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value.                                                                                               |
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
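A minimal sketch of a custom flag, assuming an `nlp` object is already loaded; the product list is purely illustrative:

```python
def is_my_product(text):
    products = ["spaCy", "Thinc", "displaCy"]
    return text in products

MY_PRODUCT = nlp.vocab.add_flag(is_my_product)
doc = nlp("I like spaCy")
assert doc[2].check_flag(MY_PRODUCT) is True
```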
@@ -227,10 +227,10 @@ Save the current state to a directory.
> nlp.vocab.to_disk("/path/to/vocab")
> ```
-| Name | Type | Description |
-| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| Name | Type | Description |
+| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Vocab.from_disk {#from_disk tag="method" new="2"}
@@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it.
> vocab = Vocab().from_disk("/path/to/vocab")
> ```
-| Name | Type | Description |
-| ----------- | ---------------- | -------------------------------------------------------------------------- |
-| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
-| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
-| **RETURNS** | `Vocab` | The modified `Vocab` object. |
+| Name | Type | Description |
+| ----------- | ------------ | -------------------------------------------------------------------------- |
+| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
+| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
+| **RETURNS** | `Vocab` | The modified `Vocab` object. |
## Vocab.to_bytes {#to_bytes tag="method"}
diff --git a/website/docs/usage/linguistic-features.md b/website/docs/usage/linguistic-features.md
index d17e5a661..4b3c61b9d 100644
--- a/website/docs/usage/linguistic-features.md
+++ b/website/docs/usage/linguistic-features.md
@@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
### Disabling the parser {#disabling}
In the [default models](/models), the parser is loaded and enabled as part of
-the [standard processing pipeline](/usage/processing-pipelines). If you don't need
-any of the syntactic information, you should disable the parser. Disabling the
-parser will make spaCy load and run much faster. If you want to load the parser,
-but need to disable it for specific documents, you can also control its use on
-the `nlp` object.
+the [standard processing pipeline](/usage/processing-pipelines). If you don't
+need any of the syntactic information, you should disable the parser. Disabling
+the parser will make spaCy load and run much faster. If you want to load the
+parser, but need to disable it for specific documents, you can also control its
+use on the `nlp` object.
```python
nlp = spacy.load("en_core_web_sm", disable=["parser"])
@@ -988,10 +988,10 @@ nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = my_tokenizer
```
-| Argument | Type | Description |
-| ----------- | ------- | ------------------------- |
-| `text` | unicode | The raw text to tokenize. |
-| **RETURNS** | `Doc` | The tokenized document. |
+| Argument | Type | Description |
+| ----------- | ----- | ------------------------- |
+| `text` | str | The raw text to tokenize. |
+| **RETURNS** | `Doc` | The tokenized document. |
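A minimal sketch of such a tokenizer, here a naive whitespace splitter, assuming `nlp` is already created:

```python
from spacy.tokens import Doc

class WhitespaceTokenizer:
    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # Assume every token is followed by a single space
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)

nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
doc = nlp("What's happened to me? he thought.")
```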
diff --git a/website/docs/usage/processing-pipelines.md b/website/docs/usage/processing-pipelines.md
index 696e11106..e7aca3981 100644
--- a/website/docs/usage/processing-pipelines.md
+++ b/website/docs/usage/processing-pipelines.md
@@ -272,16 +272,16 @@ doc = nlp("I won't have named entities")
disabled.restore()
```
-If you want to disable all pipes except for one or a few, you can use the `enable`
-keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string
-defining just one pipe.
+If you want to disable all pipes except for one or a few, you can use the
+`enable` keyword. Just like the `disable` keyword, it takes a list of pipe
+names, or a string defining just one pipe.
+
```python
# Enable only the parser
with nlp.select_pipes(enable="parser"):
doc = nlp("I will only be parsed")
```
-
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
@@ -349,12 +349,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
> nlp.add_pipe(my_component, before="parser")
> ```
-| Argument | Type | Description |
-| -------- | ------- | ------------------------------------------------------------------------ |
-| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
-| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
-| `before` | unicode | String name of component to add the new component **before**. |
-| `after` | unicode | String name of component to add the new component **after**. |
+| Argument | Type | Description |
+| -------- | ---- | ------------------------------------------------------------------------ |
+| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
+| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
+| `before` | str | String name of component to add the new component **before**. |
+| `after` | str | String name of component to add the new component **after**. |
### Example: A simple pipeline component {#custom-components-simple}
diff --git a/website/docs/usage/rule-based-matching.md b/website/docs/usage/rule-based-matching.md
index 5f47bd2e3..a84399312 100644
--- a/website/docs/usage/rule-based-matching.md
+++ b/website/docs/usage/rule-based-matching.md
@@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of
[`Token` attributes](/api/token#attributes). The supported attributes for
rule-based matching are:
-| Attribute | Type | Description |
-| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
-| `ORTH` | unicode | The exact verbatim text of a token. |
-| `TEXT` 2.1 | unicode | The exact verbatim text of a token. |
-| `LOWER` | unicode | The lowercase form of the token text. |
-| `LENGTH` | int | The length of the token text. |
-| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
-| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
-| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
-| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
-| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
-| `ENT_TYPE` | unicode | The token's entity label. |
-| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
+| Attribute | Type | Description |
+| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
+| `ORTH` | str | The exact verbatim text of a token. |
+| `TEXT` 2.1 | str | The exact verbatim text of a token. |
+| `LOWER` | str | The lowercase form of the token text. |
+| `LENGTH` | int | The length of the token text. |
+| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
+| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
+| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
+| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
+| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
+| `ENT_TYPE` | str | The token's entity label. |
+| `_` 2.1 | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
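To make the table concrete, here is a hedged sketch of a pattern combining a few of these attributes with the `Matcher`, assuming an `nlp` pipeline is already loaded (the pattern and label are made up):

```python
from spacy.matcher import Matcher

matcher = Matcher(nlp.vocab)
# Match "hello" (any casing), a punctuation token, then "world"
pattern = [{"LOWER": "hello"}, {"IS_PUNCT": True}, {"LOWER": "world"}]
matcher.add("HelloWorld", [pattern])  # older versions: matcher.add("HelloWorld", None, pattern)

doc = nlp("Hello, world!")
matches = matcher(doc)
```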
@@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
-When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
-the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
-to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
-extract matches based on the pattern's POS signature.
+When using a large number of **phrase patterns** (roughly > 10000) it's useful
+to understand how the `add_patterns` function of the `EntityRuler` works. For
+each **phrase pattern**, the `EntityRuler` calls the `nlp` object to construct
+a `Doc` object. This is necessary if, for example, you add the `EntityRuler` at
+the end of an existing pipeline with a POS tagger and want to extract matches
+based on the pattern's POS signature.
-In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
+In this case you would pass a config value of `phrase_matcher_attr="POS"` for
+the EntityRuler.
-Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
+Running the full language pipeline across every pattern in a large list scales
+linearly and can therefore take a long time with large numbers of phrase
+patterns.
-As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
+As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
+`nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with
+5,000-100,000 phrase patterns respectively.
-Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
+Even with this speedup (but especially if you're using an older version) the
+`add_patterns` function can still take a long time.
-An easy workaround to make this function run faster is disabling the other language pipes
-while adding the phrase patterns.
+An easy workaround to make this function run faster is disabling the other
+language pipes while adding the phrase patterns.
```python
entityruler = EntityRuler(nlp)
diff --git a/website/docs/usage/saving-loading.md b/website/docs/usage/saving-loading.md
index c94c79360..c0dbfc732 100644
--- a/website/docs/usage/saving-loading.md
+++ b/website/docs/usage/saving-loading.md
@@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab))
If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
well, which includes the values of
-[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if
-they're serializable with msgpack).
+[extension attributes](/usage/processing-pipelines#custom-components-attributes)
+(if they're serializable with msgpack).
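For example, a sketch of round-tripping docs with user data enabled (the attribute list and texts are illustrative):

```python
from spacy.tokens import DocBin

doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"], store_user_data=True)
for doc in nlp.pipe(["Some text", "Lots of texts..."]):
    doc_bin.add(doc)
bytes_data = doc_bin.to_bytes()

# Deserialize later, e.g. in a new process
new_doc_bin = DocBin().from_bytes(bytes_data)
docs = list(new_doc_bin.get_docs(nlp.vocab))
```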
@@ -666,10 +666,10 @@ and lets you customize how the model should be initialized and loaded. You can
define the language data to be loaded and the
[processing pipeline](/usage/processing-pipelines) to execute.
-| Setting | Type | Description |
-| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `lang` | unicode | ID of the language class to initialize. |
-| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
+| Setting | Type | Description |
+| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `lang` | str | ID of the language class to initialize. |
+| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
The `load()` method that comes with our model package templates will take care
of putting all this together and returning a `Language` object with the loaded
diff --git a/website/docs/usage/visualizers.md b/website/docs/usage/visualizers.md
index dd0b0eb50..9733e09c2 100644
--- a/website/docs/usage/visualizers.md
+++ b/website/docs/usage/visualizers.md
@@ -67,12 +67,12 @@ arcs.
-| Argument | Type | Description | Default |
-| --------- | ------- | ----------------------------------------------------------- | ----------- |
-| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
-| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` |
-| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` |
-| `font` | unicode | Font name or font family for all text. | `"Arial"` |
+| Argument | Type | Description | Default |
+| --------- | ---- | ----------------------------------------------------------- | ----------- |
+| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
+| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` |
+| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` |
+| `font` | str | Font name or font family for all text. | `"Arial"` |
For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options).