Merge branch 'develop' into master-tmp

This commit is contained in:
Ines Montani 2020-06-03 14:36:59 +02:00
commit 810fce3bb1
77 changed files with 1211 additions and 808 deletions

View File

@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
version := $(shell "bin/get-version.sh") version := $(shell "bin/get-version.sh")
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data $(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
chmod a+rx $@ chmod a+rx $@
dist/pytest.pex : wheelhouse/pytest-*.whl dist/pytest.pex : wheelhouse/pytest-*.whl
@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py* wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
$(VENV)/bin/pip wheel . -w ./wheelhouse $(VENV)/bin/pip wheel . -w ./wheelhouse
$(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse $(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
touch $@ touch $@
wheelhouse/pytest-%.whl : $(VENV)/bin/pex wheelhouse/pytest-%.whl : $(VENV)/bin/pex

View File

@ -0,0 +1,115 @@
# Training hyper-parameters and additional features.
[training]
# Whether to train on sequences with 'gold standard' sentence boundaries
# and tokens. If you set this to true, take care to ensure your run-time
# data is passed in sentence-by-sentence via some prior preprocessing.
gold_preproc = false
# Limitations on training document length or number of examples.
# NOTE(review): 0 presumably means "no limit" for both settings -- confirm
# against the training code.
max_length = 0
limit = 0
# Data augmentation
orth_variant_level = 0.0
dropout = 0.1
# Controls early-stopping. 0 or -1 mean unlimited.
patience = 1600
max_epochs = 0
max_steps = 20000
# The training loop uses this value as the total of its progress bar.
# NOTE(review): presumably the number of steps between evaluations -- confirm.
eval_frequency = 400
# Other settings
# The seed is handed to util.fix_random_seed() when training starts.
seed = 0
accumulate_gradient = 1
# When true, the training command calls thinc's use_pytorch_for_gpu_memory().
use_pytorch_for_gpu_memory = false
# Control how scores are printed and checkpoints are evaluated.
# The weights below sum to 1.0.
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
# These settings are invalid for the transformer models.
# `vectors` is re-used by the [nlp] section through ${training:vectors}.
init_tok2vec = null
vectors = null
# Batch size, produced by the registered "compounding.v1" schedule.
# NOTE(review): start and stop are both 1000, so the schedule presumably
# yields a constant batch size and `compound` has no effect -- confirm
# against thinc's compounding schedule.
[training.batch_size]
@schedules = "compounding.v1"
start = 1000
stop = 1000
compound = 1.001
# Optimizer, created from the registered "Adam.v1" function with the
# arguments below. The learning rate is a fixed constant here.
[optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 1e-8
learn_rate = 0.001
# Disabled alternative: a "warmup_linear.v1" schedule for the learning rate
# instead of the constant above. Its total_steps matches max_steps in
# [training].
# NOTE(review): enabling it presumably requires removing the constant
# `learn_rate` key above -- confirm.
#[optimizer.learn_rate]
#@schedules = "warmup_linear.v1"
#warmup_steps = 250
#total_steps = 20000
#initial_rate = 0.001
# Language and pipeline definition.
[nlp]
lang = "en"
# Interpolated from the `vectors` key of the [training] section.
vectors = ${training:vectors}
# Each [nlp.pipeline.<name>] section declares one pipeline component and the
# factory used to create it. The corresponding [nlp.pipeline.<name>.model]
# sections configure each component's model.
[nlp.pipeline.tok2vec]
factory = "tok2vec"
[nlp.pipeline.senter]
factory = "senter"
[nlp.pipeline.ner]
factory = "ner"
[nlp.pipeline.tagger]
factory = "tagger"
[nlp.pipeline.parser]
factory = "parser"
# The senter and tagger components use the same model architecture
# ("spacy.Tagger.v1"). Each model's tok2vec sublayer is a
# "spacy.Tok2VecTensors.v1" layer whose width is interpolated from the
# `width` key of [nlp.pipeline.tok2vec.model], so it only needs changing there.
[nlp.pipeline.senter.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.senter.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.tagger.model]
@architectures = "spacy.Tagger.v1"
[nlp.pipeline.tagger.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
# The parser and ner components both use "spacy.TransitionBasedParser.v1".
# Their settings differ only in nr_feature_tokens (8 for the parser, 3 for
# ner). As with the tagger models, the tok2vec sublayer width is interpolated
# from [nlp.pipeline.tok2vec.model].
[nlp.pipeline.parser.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 8
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.parser.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
[nlp.pipeline.ner.model]
@architectures = "spacy.TransitionBasedParser.v1"
nr_feature_tokens = 3
hidden_width = 128
maxout_pieces = 3
use_upper = false
[nlp.pipeline.ner.model.tok2vec]
@architectures = "spacy.Tok2VecTensors.v1"
width = ${nlp.pipeline.tok2vec.model:width}
# Model of the tok2vec component. Its `width` is the value that the
# senter, tagger, parser and ner tok2vec sublayers interpolate.
[nlp.pipeline.tok2vec.model]
@architectures = "spacy.HashEmbedCNN.v1"
# Resolves to the `vectors` key of [nlp], which is itself interpolated from
# [training] (null in this config).
pretrained_vectors = ${nlp:vectors}
width = 256
depth = 6
window_size = 1
embed_size = 10000
maxout_pieces = 3
subword_features = true

View File

@ -13,9 +13,11 @@ numpy>=1.15.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
plac>=0.9.6,<1.2.0 plac>=0.9.6,<1.2.0
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
# Optional dependencies
jsonschema>=2.6.0,<3.1.0
pydantic>=1.3.0,<2.0.0 pydantic>=1.3.0,<2.0.0
# Official Python utilities
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
# Development dependencies # Development dependencies
cython>=0.25 cython>=0.25
pytest>=4.6.5 pytest>=4.6.5

View File

@ -47,15 +47,17 @@ install_requires =
wasabi>=0.4.0,<1.1.0 wasabi>=0.4.0,<1.1.0
srsly>=2.0.0,<3.0.0 srsly>=2.0.0,<3.0.0
catalogue>=0.0.7,<1.1.0 catalogue>=0.0.7,<1.1.0
ml_datasets ml_datasets>=0.1.1
# Third-party dependencies # Third-party dependencies
tqdm>=4.38.0,<5.0.0 tqdm>=4.38.0,<5.0.0
setuptools
numpy>=1.15.0 numpy>=1.15.0
plac>=0.9.6,<1.2.0 plac>=0.9.6,<1.2.0
requests>=2.13.0,<3.0.0 requests>=2.13.0,<3.0.0
pydantic>=1.3.0,<2.0.0 pydantic>=1.3.0,<2.0.0
tqdm>=4.38.0,<5.0.0 # Official Python utilities
setuptools
packaging
importlib_metadata>=0.20; python_version < "3.8"
[options.extras_require] [options.extras_require]
lookups = lookups =

View File

@ -1,6 +1,6 @@
# fmt: off # fmt: off
__title__ = "spacy" __title__ = "spacy"
__version__ = "3.0.0.dev8" __version__ = "3.0.0.dev9"
__release__ = True __release__ = True
__download_url__ = "https://github.com/explosion/spacy-models/releases/download" __download_url__ = "https://github.com/explosion/spacy-models/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json" __compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"

View File

@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
final entity type with `ner_map` if mapping present. Entity tag is 'O' if final entity type with `ner_map` if mapping present. Entity tag is 'O' if
the pattern is not matched. the pattern is not matched.
lines (unicode): CONLL-U lines for one sentences lines (str): CONLL-U lines for one sentences
tag_pattern (unicode): Regex pattern for entity tag tag_pattern (str): Regex pattern for entity tag
ner_map (dict): Map old NER tag names to new ones, '' maps to O. ner_map (dict): Map old NER tag names to new ones, '' maps to O.
RETURNS (list): List of BILUO entity tags RETURNS (list): List of BILUO entity tags
""" """
@ -187,8 +187,8 @@ def example_from_conllu_sentence(
"""Create an Example from the lines for one CoNLL-U sentence, merging """Create an Example from the lines for one CoNLL-U sentence, merging
subtokens and appending morphology to tags if required. subtokens and appending morphology to tags if required.
lines (unicode): The non-comment lines for a CoNLL-U sentence lines (str): The non-comment lines for a CoNLL-U sentence
ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col ner_tag_pattern (str): The regex pattern for matching NER in MISC col
RETURNS (Example): An example containing the annotation RETURNS (Example): An example containing the annotation
""" """
# create a Doc with each subtoken as its own token # create a Doc with each subtoken as its own token

View File

@ -5,6 +5,7 @@ import sys
from wasabi import msg from wasabi import msg
from .. import about from .. import about
from ..util import is_package, get_base_version
def download( def download(
@ -17,7 +18,7 @@ def download(
flag is set, the command expects the full model name with version. flag is set, the command expects the full model name with version.
For direct downloads, the compatibility check will be skipped. For direct downloads, the compatibility check will be skipped.
""" """
if not require_package("spacy") and "--no-deps" not in pip_args: if not is_package("spacy") and "--no-deps" not in pip_args:
msg.warn( msg.warn(
"Skipping model package dependencies and setting `--no-deps`. " "Skipping model package dependencies and setting `--no-deps`. "
"You don't seem to have the spaCy package itself installed " "You don't seem to have the spaCy package itself installed "
@ -45,21 +46,6 @@ def download(
"Download and installation successful", "Download and installation successful",
f"You can now load the model via spacy.load('{model_name}')", f"You can now load the model via spacy.load('{model_name}')",
) )
# If a model is downloaded and then loaded within the same process, our
# is_package check currently fails, because pkg_resources.working_set
# is not refreshed automatically (see #3923). We're trying to work
# around this here be requiring the package explicitly.
require_package(model_name)
def require_package(name):
try:
import pkg_resources
pkg_resources.working_set.require(name)
return True
except: # noqa: E722
return False
def get_json(url, desc): def get_json(url, desc):
@ -77,8 +63,7 @@ def get_json(url, desc):
def get_compatibility(): def get_compatibility():
version = about.__version__ version = get_base_version(about.__version__)
version = version.rsplit(".dev", 1)[0]
comp_table = get_json(about.__compatibility__, "compatibility table") comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table["spacy"] comp = comp_table["spacy"]
if version not in comp: if version not in comp:
@ -87,7 +72,7 @@ def get_compatibility():
def get_version(model, comp): def get_version(model, comp):
model = model.rsplit(".dev", 1)[0] model = get_base_version(model)
if model not in comp: if model not in comp:
msg.fail( msg.fail(
f"No compatible model found for '{model}' (spaCy v{about.__version__})", f"No compatible model found for '{model}' (spaCy v{about.__version__})",

View File

@ -48,7 +48,9 @@ def info(
"Location": str(Path(__file__).parent.parent), "Location": str(Path(__file__).parent.parent),
"Platform": platform.platform(), "Platform": platform.platform(),
"Python version": platform.python_version(), "Python version": platform.python_version(),
"Models": ", ".join(model["name"] for model in all_models.values()), "Models": ", ".join(
f"{m['name']} ({m['version']})" for m in all_models.values()
),
} }
if not silent: if not silent:
title = "Info about spaCy" title = "Info about spaCy"
@ -63,7 +65,7 @@ def print_markdown(data, title=None):
"""Print data in GitHub-flavoured Markdown format for issues etc. """Print data in GitHub-flavoured Markdown format for issues etc.
data (dict or list of tuples): Label/value pairs. data (dict or list of tuples): Label/value pairs.
title (unicode or None): Title, will be rendered as headline 2. title (str / None): Title, will be rendered as headline 2.
""" """
markdown = [] markdown = []
for key, value in data.items(): for key, value in data.items():

View File

@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
("lang", "Model language", meta.get("lang", "en")), ("lang", "Model language", meta.get("lang", "en")),
("name", "Model name", meta.get("name", "model")), ("name", "Model name", meta.get("name", "model")),
("version", "Model version", meta.get("version", "0.0.0")), ("version", "Model version", meta.get("version", "0.0.0")),
("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
("description", "Model description", meta.get("description", False)), ("description", "Model description", meta.get("description", False)),
("author", "Author", meta.get("author", False)), ("author", "Author", meta.get("author", False)),
("email", "Author email", meta.get("email", False)), ("email", "Author email", meta.get("email", False)),
("url", "Author website", meta.get("url", False)), ("url", "Author website", meta.get("url", False)),
("license", "License", meta.get("license", "CC BY-SA 3.0")), ("license", "License", meta.get("license", "MIT")),
] ]
nlp = util.load_model_from_path(Path(model_path)) nlp = util.load_model_from_path(Path(model_path))
meta["spacy_version"] = util.get_model_version_range(about.__version__)
meta["pipeline"] = nlp.pipe_names meta["pipeline"] = nlp.pipe_names
meta["vectors"] = { meta["vectors"] = {
"width": nlp.vocab.vectors_length, "width": nlp.vocab.vectors_length,
@ -168,6 +168,7 @@ def setup_package():
package_data={model_name: list_files(model_dir)}, package_data={model_name: list_files(model_dir)},
install_requires=list_requirements(meta), install_requires=list_requirements(meta),
zip_safe=False, zip_safe=False,
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
) )

View File

@ -483,7 +483,6 @@ def train(
# Update model meta.json # Update model meta.json
meta["lang"] = nlp.lang meta["lang"] = nlp.lang
meta["pipeline"] = nlp.pipe_names meta["pipeline"] = nlp.pipe_names
meta["spacy_version"] = f">={about.__version__}"
if beam_width == 1: if beam_width == 1:
meta["speed"] = { meta["speed"] = {
"nwords": nwords, "nwords": nwords,

View File

@ -7,7 +7,7 @@ from pathlib import Path
from wasabi import msg from wasabi import msg
import thinc import thinc
import thinc.schedules import thinc.schedules
from thinc.api import Model from thinc.api import Model, use_pytorch_for_gpu_memory
import random import random
from ..gold import GoldCorpus from ..gold import GoldCorpus
@ -171,6 +171,8 @@ def train_from_config(
msg.info(f"Loading config from: {config_path}") msg.info(f"Loading config from: {config_path}")
config = util.load_config(config_path, create_objects=False) config = util.load_config(config_path, create_objects=False)
util.fix_random_seed(config["training"]["seed"]) util.fix_random_seed(config["training"]["seed"])
if config["training"]["use_pytorch_for_gpu_memory"]:
use_pytorch_for_gpu_memory()
nlp_config = config["nlp"] nlp_config = config["nlp"]
config = util.load_config(config_path, create_objects=True) config = util.load_config(config_path, create_objects=True)
msg.info("Creating nlp from config") msg.info("Creating nlp from config")
@ -213,6 +215,12 @@ def train_from_config(
if is_best_checkpoint and output_path is not None: if is_best_checkpoint and output_path is not None:
nlp.to_disk(output_path) nlp.to_disk(output_path)
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False) progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
# Clean up the objects to facilitate garbage collection.
for eg in batch:
eg.doc = None
eg.goldparse = None
eg.doc_annotation = None
eg.token_annotation = None
finally: finally:
if output_path is not None: if output_path is not None:
final_model_path = output_path / "model-final" final_model_path = output_path / "model-final"

View File

@ -4,6 +4,8 @@ import requests
from wasabi import msg from wasabi import msg
from .. import about from .. import about
from ..util import get_package_version, get_installed_models, get_base_version
from ..util import get_package_path, get_model_meta, is_compatible_version
def validate(): def validate():
@ -12,7 +14,7 @@ def validate():
with the installed models. Should be run after `pip install -U spacy`. with the installed models. Should be run after `pip install -U spacy`.
""" """
model_pkgs, compat = get_model_pkgs() model_pkgs, compat = get_model_pkgs()
spacy_version = about.__version__.rsplit(".dev", 1)[0] spacy_version = get_base_version(about.__version__)
current_compat = compat.get(spacy_version, {}) current_compat = compat.get(spacy_version, {})
if not current_compat: if not current_compat:
msg.warn(f"No compatible models found for v{spacy_version} of spaCy") msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
@ -25,7 +27,7 @@ def validate():
msg.info(f"spaCy installation: {spacy_dir}") msg.info(f"spaCy installation: {spacy_dir}")
if model_pkgs: if model_pkgs:
header = ("NAME", "VERSION", "") header = ("NAME", "SPACY", "VERSION", "")
rows = [] rows = []
for name, data in model_pkgs.items(): for name, data in model_pkgs.items():
if data["compat"]: if data["compat"]:
@ -34,7 +36,7 @@ def validate():
else: else:
version = msg.text(data["version"], color="red", no_print=True) version = msg.text(data["version"], color="red", no_print=True)
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}" comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
rows.append((data["name"], version, comp)) rows.append((data["name"], data["spacy"], version, comp))
msg.table(rows, header=header) msg.table(rows, header=header)
else: else:
msg.text("No models found in your current environment.", exits=0) msg.text("No models found in your current environment.", exits=0)
@ -44,8 +46,9 @@ def validate():
cmd = "python -m spacy download {}" cmd = "python -m spacy download {}"
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n") print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
if na_models: if na_models:
msg.warn( msg.info(
f"The following models are not available for spaCy v{about.__version__}:", f"The following models are custom spaCy models or not "
f"available for spaCy v{about.__version__}:",
", ".join(na_models), ", ".join(na_models),
) )
if incompat_models: if incompat_models:
@ -53,8 +56,6 @@ def validate():
def get_model_pkgs(): def get_model_pkgs():
import pkg_resources
with msg.loading("Loading compatibility table..."): with msg.loading("Loading compatibility table..."):
r = requests.get(about.__compatibility__) r = requests.get(about.__compatibility__)
if r.status_code != 200: if r.status_code != 200:
@ -66,19 +67,28 @@ def get_model_pkgs():
msg.good("Loaded compatibility table") msg.good("Loaded compatibility table")
compat = r.json()["spacy"] compat = r.json()["spacy"]
all_models = set() all_models = set()
installed_models = get_installed_models()
for spacy_v, models in dict(compat).items(): for spacy_v, models in dict(compat).items():
all_models.update(models.keys()) all_models.update(models.keys())
for model, model_vs in models.items(): for model, model_vs in models.items():
compat[spacy_v][model] = [reformat_version(v) for v in model_vs] compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
pkgs = {} pkgs = {}
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items(): for pkg_name in installed_models:
package = pkg_name.replace("-", "_") package = pkg_name.replace("-", "_")
if package in all_models: version = get_package_version(pkg_name)
version = pkg_data.version if package in compat:
is_compat = version in compat[package]
spacy_version = about.__version__
else:
model_path = get_package_path(package)
model_meta = get_model_meta(model_path)
spacy_version = model_meta.get("spacy_version", "n/a")
is_compat = is_compatible_version(about.__version__, spacy_version)
pkgs[pkg_name] = { pkgs[pkg_name] = {
"name": package, "name": package,
"version": version, "version": version,
"compat": package in compat and version in compat[package], "spacy": spacy_version,
"compat": is_compat,
} }
return pkgs, compat return pkgs, compat

View File

@ -22,13 +22,13 @@ def render(
"""Render displaCy visualisation. """Render displaCy visualisation.
docs (list or Doc): Document(s) to visualise. docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'. style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page. page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
jupyter (bool): Override Jupyter auto-detection. jupyter (bool): Override Jupyter auto-detection.
options (dict): Visualiser-specific options, e.g. colors. options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
RETURNS (unicode): Rendered HTML markup. RETURNS (str): Rendered HTML markup.
DOCS: https://spacy.io/api/top-level#displacy.render DOCS: https://spacy.io/api/top-level#displacy.render
USAGE: https://spacy.io/usage/visualizers USAGE: https://spacy.io/usage/visualizers
@ -73,13 +73,13 @@ def serve(
"""Serve displaCy visualisation. """Serve displaCy visualisation.
docs (list or Doc): Document(s) to visualise. docs (list or Doc): Document(s) to visualise.
style (unicode): Visualisation style, 'dep' or 'ent'. style (str): Visualisation style, 'dep' or 'ent'.
page (bool): Render markup as full HTML page. page (bool): Render markup as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
options (dict): Visualiser-specific options, e.g. colors. options (dict): Visualiser-specific options, e.g. colors.
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts. manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
port (int): Port to serve visualisation. port (int): Port to serve visualisation.
host (unicode): Host to serve visualisation. host (str): Host to serve visualisation.
DOCS: https://spacy.io/api/top-level#displacy.serve DOCS: https://spacy.io/api/top-level#displacy.serve
USAGE: https://spacy.io/usage/visualizers USAGE: https://spacy.io/usage/visualizers

View File

@ -47,7 +47,7 @@ class DependencyRenderer(object):
parsed (list): Dependency parses to render. parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page. page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered SVG or HTML markup. RETURNS (str): Rendered SVG or HTML markup.
""" """
# Create a random ID prefix to make sure parses don't receive the # Create a random ID prefix to make sure parses don't receive the
# same ID, even if they're identical # same ID, even if they're identical
@ -78,7 +78,7 @@ class DependencyRenderer(object):
render_id (int): Unique ID, typically index of document. render_id (int): Unique ID, typically index of document.
words (list): Individual words and their tags. words (list): Individual words and their tags.
arcs (list): Individual arcs and their start, end, direction and label. arcs (list): Individual arcs and their start, end, direction and label.
RETURNS (unicode): Rendered SVG markup. RETURNS (str): Rendered SVG markup.
""" """
self.levels = self.get_levels(arcs) self.levels = self.get_levels(arcs)
self.highest_level = len(self.levels) self.highest_level = len(self.levels)
@ -112,10 +112,10 @@ class DependencyRenderer(object):
): ):
"""Render individual word. """Render individual word.
text (unicode): Word text. text (str): Word text.
tag (unicode): Part-of-speech tag. tag (str): Part-of-speech tag.
i (int): Unique ID, typically word index. i (int): Unique ID, typically word index.
RETURNS (unicode): Rendered SVG markup. RETURNS (str): Rendered SVG markup.
""" """
y = self.offset_y + self.word_spacing y = self.offset_y + self.word_spacing
x = self.offset_x + i * self.distance x = self.offset_x + i * self.distance
@ -131,12 +131,12 @@ class DependencyRenderer(object):
def render_arrow(self, label, start, end, direction, i): def render_arrow(self, label, start, end, direction, i):
"""Render individual arrow. """Render individual arrow.
label (unicode): Dependency label. label (str): Dependency label.
start (int): Index of start word. start (int): Index of start word.
end (int): Index of end word. end (int): Index of end word.
direction (unicode): Arrow direction, 'left' or 'right'. direction (str): Arrow direction, 'left' or 'right'.
i (int): Unique ID, typically arrow index. i (int): Unique ID, typically arrow index.
RETURNS (unicode): Rendered SVG markup. RETURNS (str): Rendered SVG markup.
""" """
if start < 0 or end < 0: if start < 0 or end < 0:
error_args = dict(start=start, end=end, label=label, dir=direction) error_args = dict(start=start, end=end, label=label, dir=direction)
@ -179,7 +179,7 @@ class DependencyRenderer(object):
y (int): Y-coordinate of arrow start and end point. y (int): Y-coordinate of arrow start and end point.
y_curve (int): Y-coordinate of Cubic Bézier y_curve point. y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
x_end (int): X-coordinate of arrow end point. x_end (int): X-coordinate of arrow end point.
RETURNS (unicode): Definition of the arc path ('d' attribute). RETURNS (str): Definition of the arc path ('d' attribute).
""" """
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}" template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
if self.compact: if self.compact:
@ -189,11 +189,11 @@ class DependencyRenderer(object):
def get_arrowhead(self, direction, x, y, end): def get_arrowhead(self, direction, x, y, end):
"""Render individual arrow head. """Render individual arrow head.
direction (unicode): Arrow direction, 'left' or 'right'. direction (str): Arrow direction, 'left' or 'right'.
x (int): X-coordinate of arrow start point. x (int): X-coordinate of arrow start point.
y (int): Y-coordinate of arrow start and end point. y (int): Y-coordinate of arrow start and end point.
end (int): X-coordinate of arrow end point. end (int): X-coordinate of arrow end point.
RETURNS (unicode): Definition of the arrow head path ('d' attribute). RETURNS (str): Definition of the arrow head path ('d' attribute).
""" """
if direction == "left": if direction == "left":
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2) pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
@ -279,7 +279,7 @@ class EntityRenderer(object):
parsed (list): Dependency parses to render. parsed (list): Dependency parses to render.
page (bool): Render parses wrapped as full HTML page. page (bool): Render parses wrapped as full HTML page.
minify (bool): Minify HTML markup. minify (bool): Minify HTML markup.
RETURNS (unicode): Rendered HTML markup. RETURNS (str): Rendered HTML markup.
""" """
rendered = [] rendered = []
for i, p in enumerate(parsed): for i, p in enumerate(parsed):
@ -300,9 +300,9 @@ class EntityRenderer(object):
def render_ents(self, text, spans, title): def render_ents(self, text, spans, title):
"""Render entities in text. """Render entities in text.
text (unicode): Original text. text (str): Original text.
spans (list): Individual entity spans and their start, end and label. spans (list): Individual entity spans and their start, end and label.
title (unicode or None): Document title set in Doc.user_data['title']. title (str / None): Document title set in Doc.user_data['title'].
""" """
markup = "" markup = ""
offset = 0 offset = 0

View File

@ -113,9 +113,12 @@ class Warnings(object):
"ignored during training.") "ignored during training.")
# TODO: fix numbering after merging develop into master # TODO: fix numbering after merging develop into master
W095 = ("Skipping unsupported morphological feature(s): {feature}. " W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or " "incompatible with the current version ({current}). This may lead "
"string \"Field1=Value1,Value2|Field2=Value3\".") "to unexpected results or runtime errors. To resolve this, "
"download a newer compatible model or retrain your custom model "
"with the current spaCy version. For more details and available "
"updates, run: python -m spacy validate")
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' " W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
"instead.") "instead.")
W097 = ("No Model config was provided to create the '{name}' component, " W097 = ("No Model config was provided to create the '{name}' component, "
@ -124,6 +127,9 @@ class Warnings(object):
"so a default configuration was used.") "so a default configuration was used.")
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', " W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
"but got '{type}' instead, so ignoring it.") "but got '{type}' instead, so ignoring it.")
W100 = ("Skipping unsupported morphological feature(s): {feature}. "
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
"string \"Field1=Value1,Value2|Field2=Value3\".")
@add_codes @add_codes
@ -621,7 +627,7 @@ class MatchPatternError(ValueError):
def __init__(self, key, errors): def __init__(self, key, errors):
"""Custom error for validating match patterns. """Custom error for validating match patterns.
key (unicode): The name of the matcher rule. key (str): The name of the matcher rule.
errors (dict): Validation errors (sequence of strings) mapped to pattern errors (dict): Validation errors (sequence of strings) mapped to pattern
ID, i.e. the index of the added pattern. ID, i.e. the index of the added pattern.
""" """

View File

@ -1,8 +1,8 @@
def explain(term): def explain(term):
"""Get a description for a given POS tag, dependency label or entity type. """Get a description for a given POS tag, dependency label or entity type.
term (unicode): The term to explain. term (str): The term to explain.
RETURNS (unicode): The explanation, or `None` if not found in the glossary. RETURNS (str): The explanation, or `None` if not found in the glossary.
EXAMPLE: EXAMPLE:
>>> spacy.explain(u'NORP') >>> spacy.explain(u'NORP')

View File

@ -154,8 +154,8 @@ class GoldCorpus(object):
def __init__(self, train, dev, gold_preproc=False, limit=None): def __init__(self, train, dev, gold_preproc=False, limit=None):
"""Create a GoldCorpus. """Create a GoldCorpus.
train (unicode or Path): File or directory of training data. train (str / Path): File or directory of training data.
dev (unicode or Path): File or directory of development data. dev (str / Path): File or directory of development data.
RETURNS (GoldCorpus): The newly created object. RETURNS (GoldCorpus): The newly created object.
""" """
self.limit = limit self.limit = limit

View File

@ -38,7 +38,7 @@ cdef class Candidate:
@property @property
def entity_(self): def entity_(self):
"""RETURNS (unicode): ID/name of this entity in the KB""" """RETURNS (str): ID/name of this entity in the KB"""
return self.kb.vocab.strings[self.entity_hash] return self.kb.vocab.strings[self.entity_hash]
@property @property
@ -48,7 +48,7 @@ cdef class Candidate:
@property @property
def alias_(self): def alias_(self):
"""RETURNS (unicode): ID of the original alias""" """RETURNS (str): ID of the original alias"""
return self.kb.vocab.strings[self.alias_hash] return self.kb.vocab.strings[self.alias_hash]
@property @property

View File

@ -17,7 +17,8 @@ from .tokens.underscore import Underscore
from .vocab import Vocab from .vocab import Vocab
from .lemmatizer import Lemmatizer from .lemmatizer import Lemmatizer
from .lookups import Lookups from .lookups import Lookups
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
from .pipe_analysis import count_pipeline_interdependencies
from .gold import Example from .gold import Example
from .scorer import Scorer from .scorer import Scorer
from .util import link_vectors_to_models, create_default_optimizer, registry from .util import link_vectors_to_models, create_default_optimizer, registry
@ -127,7 +128,7 @@ class Language(object):
Defaults (class): Settings, data and factory methods for creating the `nlp` Defaults (class): Settings, data and factory methods for creating the `nlp`
object and processing pipeline. object and processing pipeline.
lang (unicode): Two-letter language ID, i.e. ISO code. lang (str): Two-letter language ID, i.e. ISO code.
DOCS: https://spacy.io/api/language DOCS: https://spacy.io/api/language
""" """
@ -196,13 +197,14 @@ class Language(object):
@property @property
def meta(self): def meta(self):
spacy_version = util.get_model_version_range(about.__version__)
if self.vocab.lang: if self.vocab.lang:
self._meta.setdefault("lang", self.vocab.lang) self._meta.setdefault("lang", self.vocab.lang)
else: else:
self._meta.setdefault("lang", self.lang) self._meta.setdefault("lang", self.lang)
self._meta.setdefault("name", "model") self._meta.setdefault("name", "model")
self._meta.setdefault("version", "0.0.0") self._meta.setdefault("version", "0.0.0")
self._meta.setdefault("spacy_version", f">={about.__version__}") self._meta.setdefault("spacy_version", spacy_version)
self._meta.setdefault("description", "") self._meta.setdefault("description", "")
self._meta.setdefault("author", "") self._meta.setdefault("author", "")
self._meta.setdefault("email", "") self._meta.setdefault("email", "")
@ -292,7 +294,7 @@ class Language(object):
def get_pipe(self, name): def get_pipe(self, name):
"""Get a pipeline component for a given component name. """Get a pipeline component for a given component name.
name (unicode): Name of pipeline component to get. name (str): Name of pipeline component to get.
RETURNS (callable): The pipeline component. RETURNS (callable): The pipeline component.
DOCS: https://spacy.io/api/language#get_pipe DOCS: https://spacy.io/api/language#get_pipe
@ -305,7 +307,7 @@ class Language(object):
def create_pipe(self, name, config=dict()): def create_pipe(self, name, config=dict()):
"""Create a pipeline component from a factory. """Create a pipeline component from a factory.
name (unicode): Factory name to look up in `Language.factories`. name (str): Factory name to look up in `Language.factories`.
config (dict): Configuration parameters to initialise component. config (dict): Configuration parameters to initialise component.
RETURNS (callable): Pipeline component. RETURNS (callable): Pipeline component.
@ -348,12 +350,12 @@ class Language(object):
of before/after/first/last can be set. Default behaviour is "last". of before/after/first/last can be set. Default behaviour is "last".
component (callable): The pipeline component. component (callable): The pipeline component.
name (unicode): Name of pipeline component. Overwrites existing name (str): Name of pipeline component. Overwrites existing
component.name attribute if available. If no name is set and component.name attribute if available. If no name is set and
the component exposes no name attribute, component.__name__ is the component exposes no name attribute, component.__name__ is
used. An error is raised if a name already exists in the pipeline. used. An error is raised if a name already exists in the pipeline.
before (unicode): Component name to insert component directly before. before (str): Component name to insert component directly before.
after (unicode): Component name to insert component directly after. after (str): Component name to insert component directly after.
first (bool): Insert component first / not first in the pipeline. first (bool): Insert component first / not first in the pipeline.
last (bool): Insert component last / not last in the pipeline. last (bool): Insert component last / not last in the pipeline.
@ -394,7 +396,7 @@ class Language(object):
"""Check if a component name is present in the pipeline. Equivalent to """Check if a component name is present in the pipeline. Equivalent to
`name in nlp.pipe_names`. `name in nlp.pipe_names`.
name (unicode): Name of the component. name (str): Name of the component.
RETURNS (bool): Whether a component of the name exists in the pipeline. RETURNS (bool): Whether a component of the name exists in the pipeline.
DOCS: https://spacy.io/api/language#has_pipe DOCS: https://spacy.io/api/language#has_pipe
@ -404,7 +406,7 @@ class Language(object):
def replace_pipe(self, name, component): def replace_pipe(self, name, component):
"""Replace a component in the pipeline. """Replace a component in the pipeline.
name (unicode): Name of the component to replace. name (str): Name of the component to replace.
component (callable): Pipeline component. component (callable): Pipeline component.
DOCS: https://spacy.io/api/language#replace_pipe DOCS: https://spacy.io/api/language#replace_pipe
@ -423,8 +425,8 @@ class Language(object):
def rename_pipe(self, old_name, new_name): def rename_pipe(self, old_name, new_name):
"""Rename a pipeline component. """Rename a pipeline component.
old_name (unicode): Name of the component to rename. old_name (str): Name of the component to rename.
new_name (unicode): New name of the component. new_name (str): New name of the component.
DOCS: https://spacy.io/api/language#rename_pipe DOCS: https://spacy.io/api/language#rename_pipe
""" """
@ -438,7 +440,7 @@ class Language(object):
def remove_pipe(self, name): def remove_pipe(self, name):
"""Remove a component from the pipeline. """Remove a component from the pipeline.
name (unicode): Name of the component to remove. name (str): Name of the component to remove.
RETURNS (tuple): A `(name, component)` tuple of the removed component. RETURNS (tuple): A `(name, component)` tuple of the removed component.
DOCS: https://spacy.io/api/language#remove_pipe DOCS: https://spacy.io/api/language#remove_pipe
@ -455,7 +457,7 @@ class Language(object):
and can contain arbitrary whitespace. Alignment into the original string and can contain arbitrary whitespace. Alignment into the original string
is preserved. is preserved.
text (unicode): The text to be processed. text (str): The text to be processed.
disable (list): Names of the pipeline components to disable. disable (list): Names of the pipeline components to disable.
component_cfg (dict): An optional dictionary with extra keyword arguments component_cfg (dict): An optional dictionary with extra keyword arguments
for specific components. for specific components.
@ -564,13 +566,14 @@ class Language(object):
if component_cfg is None: if component_cfg is None:
component_cfg = {} component_cfg = {}
component_deps = count_pipeline_interdependencies(self.pipeline)
# Determine whether component should set annotations. In theory I guess # Determine whether component should set annotations. In theory I guess
# we should do this by inspecting the meta? Or we could just always # we should do this by inspecting the meta? Or we could just always
# say "yes" # say "yes"
for name, proc in self.pipeline: for i, (name, proc) in enumerate(self.pipeline):
component_cfg.setdefault(name, {}) component_cfg.setdefault(name, {})
component_cfg[name].setdefault("drop", drop) component_cfg[name].setdefault("drop", drop)
component_cfg[name].setdefault("set_annotations", False) component_cfg[name]["set_annotations"] = bool(component_deps[i])
for name, proc in self.pipeline: for name, proc in self.pipeline:
if not hasattr(proc, "update"): if not hasattr(proc, "update"):
continue continue
@ -938,7 +941,7 @@ class Language(object):
"""Save the current state to a directory. If a model is loaded, this """Save the current state to a directory. If a model is loaded, this
will include the model. will include the model.
path (unicode or Path): Path to a directory, which will be created if path (str / Path): Path to a directory, which will be created if
it doesn't exist. it doesn't exist.
exclude (list): Names of components or serialization fields to exclude. exclude (list): Names of components or serialization fields to exclude.
@ -972,7 +975,7 @@ class Language(object):
returns it. If the saved `Language` object contains a model, the returns it. If the saved `Language` object contains a model, the
model will be loaded. model will be loaded.
path (unicode or Path): A path to a directory. path (str / Path): A path to a directory.
exclude (list): Names of components or serialization fields to exclude. exclude (list): Names of components or serialization fields to exclude.
RETURNS (Language): The modified `Language` object. RETURNS (Language): The modified `Language` object.
@ -1090,7 +1093,7 @@ class component(object):
): ):
"""Decorate a pipeline component. """Decorate a pipeline component.
name (unicode): Default component and factory name. name (str): Default component and factory name.
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`. assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
requires (list): Attributes required by component, e.g. `["token.dep"]`. requires (list): Attributes required by component, e.g. `["token.dep"]`.
retokenizes (bool): Whether the component changes the tokenization. retokenizes (bool): Whether the component changes the tokenization.

View File

@ -30,8 +30,8 @@ class Lemmatizer(object):
def __call__(self, string, univ_pos, morphology=None): def __call__(self, string, univ_pos, morphology=None):
"""Lemmatize a string. """Lemmatize a string.
string (unicode): The string to lemmatize, e.g. the token text. string (str): The string to lemmatize, e.g. the token text.
univ_pos (unicode / int): The token's universal part-of-speech tag. univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the morphology (dict): The token's morphological features following the
Universal Dependencies scheme. Universal Dependencies scheme.
RETURNS (list): The available lemmas for the string. RETURNS (list): The available lemmas for the string.
@ -69,7 +69,7 @@ class Lemmatizer(object):
Check whether we're dealing with an uninflected paradigm, so we can Check whether we're dealing with an uninflected paradigm, so we can
avoid lemmatization entirely. avoid lemmatization entirely.
univ_pos (unicode / int): The token's universal part-of-speech tag. univ_pos (str / int): The token's universal part-of-speech tag.
morphology (dict): The token's morphological features following the morphology (dict): The token's morphological features following the
Universal Dependencies scheme. Universal Dependencies scheme.
""" """
@ -126,10 +126,10 @@ class Lemmatizer(object):
"""Look up a lemma in the table, if available. If no lemma is found, """Look up a lemma in the table, if available. If no lemma is found,
the original string is returned. the original string is returned.
string (unicode): The original string. string (str): The original string.
orth (int): Optional hash of the string to look up. If not set, the orth (int): Optional hash of the string to look up. If not set, the
string will be used and hashed. string will be used and hashed.
RETURNS (unicode): The lemma if the string was found, otherwise the RETURNS (str): The lemma if the string was found, otherwise the
original string. original string.
""" """
lookup_table = self.lookups.get_table("lemma_lookup", {}) lookup_table = self.lookups.get_table("lemma_lookup", {})

View File

@ -164,7 +164,7 @@ cdef class Lexeme:
self.vocab.set_vector(self.c.orth, vector) self.vocab.set_vector(self.c.orth, vector)
property rank: property rank:
"""RETURNS (unicode): Sequential ID of the lexemes's lexical type, used """RETURNS (str): Sequential ID of the lexemes's lexical type, used
to index into tables, e.g. for word vectors.""" to index into tables, e.g. for word vectors."""
def __get__(self): def __get__(self):
return self.c.id return self.c.id
@ -187,18 +187,18 @@ cdef class Lexeme:
@property @property
def orth_(self): def orth_(self):
"""RETURNS (unicode): The original verbatim text of the lexeme """RETURNS (str): The original verbatim text of the lexeme
(identical to `Lexeme.text`). Exists mostly for consistency with (identical to `Lexeme.text`). Exists mostly for consistency with
the other attributes.""" the other attributes."""
return self.vocab.strings[self.c.orth] return self.vocab.strings[self.c.orth]
@property @property
def text(self): def text(self):
"""RETURNS (unicode): The original verbatim text of the lexeme.""" """RETURNS (str): The original verbatim text of the lexeme."""
return self.orth_ return self.orth_
property lower: property lower:
"""RETURNS (unicode): Lowercase form of the lexeme.""" """RETURNS (str): Lowercase form of the lexeme."""
def __get__(self): def __get__(self):
return self.c.lower return self.c.lower
@ -281,7 +281,7 @@ cdef class Lexeme:
prob_table[self.c.orth] = x prob_table[self.c.orth] = x
property lower_: property lower_:
"""RETURNS (unicode): Lowercase form of the word.""" """RETURNS (str): Lowercase form of the word."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lower] return self.vocab.strings[self.c.lower]
@ -289,7 +289,7 @@ cdef class Lexeme:
self.c.lower = self.vocab.strings.add(x) self.c.lower = self.vocab.strings.add(x)
property norm_: property norm_:
"""RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the """RETURNS (str): The lexemes's norm, i.e. a normalised form of the
lexeme text. lexeme text.
""" """
def __get__(self): def __get__(self):
@ -299,7 +299,7 @@ cdef class Lexeme:
self.norm = self.vocab.strings.add(x) self.norm = self.vocab.strings.add(x)
property shape_: property shape_:
"""RETURNS (unicode): Transform of the word's string, to show """RETURNS (str): Transform of the word's string, to show
orthographic features. orthographic features.
""" """
def __get__(self): def __get__(self):
@ -309,7 +309,7 @@ cdef class Lexeme:
self.c.shape = self.vocab.strings.add(x) self.c.shape = self.vocab.strings.add(x)
property prefix_: property prefix_:
"""RETURNS (unicode): Length-N substring from the start of the word. """RETURNS (str): Length-N substring from the start of the word.
Defaults to `N=1`. Defaults to `N=1`.
""" """
def __get__(self): def __get__(self):
@ -319,7 +319,7 @@ cdef class Lexeme:
self.c.prefix = self.vocab.strings.add(x) self.c.prefix = self.vocab.strings.add(x)
property suffix_: property suffix_:
"""RETURNS (unicode): Length-N substring from the end of the word. """RETURNS (str): Length-N substring from the end of the word.
Defaults to `N=3`. Defaults to `N=3`.
""" """
def __get__(self): def __get__(self):
@ -329,7 +329,7 @@ cdef class Lexeme:
self.c.suffix = self.vocab.strings.add(x) self.c.suffix = self.vocab.strings.add(x)
property lang_: property lang_:
"""RETURNS (unicode): Language of the parent vocabulary.""" """RETURNS (str): Language of the parent vocabulary."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.lang] return self.vocab.strings[self.c.lang]

View File

@ -31,7 +31,7 @@ class Lookups(object):
"""Check if the lookups contain a table of a given name. Delegates to """Check if the lookups contain a table of a given name. Delegates to
Lookups.has_table. Lookups.has_table.
name (unicode): Name of the table. name (str): Name of the table.
RETURNS (bool): Whether a table of that name is in the lookups. RETURNS (bool): Whether a table of that name is in the lookups.
""" """
return self.has_table(name) return self.has_table(name)
@ -48,7 +48,7 @@ class Lookups(object):
def add_table(self, name, data=SimpleFrozenDict()): def add_table(self, name, data=SimpleFrozenDict()):
"""Add a new table to the lookups. Raises an error if the table exists. """Add a new table to the lookups. Raises an error if the table exists.
name (unicode): Unique name of table. name (str): Unique name of table.
data (dict): Optional data to add to the table. data (dict): Optional data to add to the table.
RETURNS (Table): The newly added table. RETURNS (Table): The newly added table.
@ -64,7 +64,7 @@ class Lookups(object):
"""Get a table. Raises an error if the table doesn't exist and no """Get a table. Raises an error if the table doesn't exist and no
default value is provided. default value is provided.
name (unicode): Name of the table. name (str): Name of the table.
default: Optional default value to return if table doesn't exist. default: Optional default value to return if table doesn't exist.
RETURNS (Table): The table. RETURNS (Table): The table.
@ -79,7 +79,7 @@ class Lookups(object):
def remove_table(self, name): def remove_table(self, name):
"""Remove a table. Raises an error if the table doesn't exist. """Remove a table. Raises an error if the table doesn't exist.
name (unicode): Name of the table to remove. name (str): Name of the table to remove.
RETURNS (Table): The removed table. RETURNS (Table): The removed table.
DOCS: https://spacy.io/api/lookups#remove_table DOCS: https://spacy.io/api/lookups#remove_table
@ -91,7 +91,7 @@ class Lookups(object):
def has_table(self, name): def has_table(self, name):
"""Check if the lookups contain a table of a given name. """Check if the lookups contain a table of a given name.
name (unicode): Name of the table. name (str): Name of the table.
RETURNS (bool): Whether a table of that name exists. RETURNS (bool): Whether a table of that name exists.
DOCS: https://spacy.io/api/lookups#has_table DOCS: https://spacy.io/api/lookups#has_table
@ -125,7 +125,7 @@ class Lookups(object):
"""Save the lookups to a directory as lookups.bin. Expects a path to a """Save the lookups to a directory as lookups.bin. Expects a path to a
directory, which will be created if it doesn't exist. directory, which will be created if it doesn't exist.
path (unicode / Path): The file path. path (str / Path): The file path.
DOCS: https://spacy.io/api/lookups#to_disk DOCS: https://spacy.io/api/lookups#to_disk
""" """
@ -141,7 +141,7 @@ class Lookups(object):
"""Load lookups from a directory containing a lookups.bin. Will skip """Load lookups from a directory containing a lookups.bin. Will skip
loading if the file doesn't exist. loading if the file doesn't exist.
path (unicode / Path): The directory path. path (str / Path): The directory path.
RETURNS (Lookups): The loaded lookups. RETURNS (Lookups): The loaded lookups.
DOCS: https://spacy.io/api/lookups#from_disk DOCS: https://spacy.io/api/lookups#from_disk
@ -167,7 +167,7 @@ class Table(OrderedDict):
"""Initialize a new table from a dict. """Initialize a new table from a dict.
data (dict): The dictionary. data (dict): The dictionary.
name (unicode): Optional table name for reference. name (str): Optional table name for reference.
RETURNS (Table): The newly created object. RETURNS (Table): The newly created object.
DOCS: https://spacy.io/api/lookups#table.from_dict DOCS: https://spacy.io/api/lookups#table.from_dict
@ -179,7 +179,7 @@ class Table(OrderedDict):
def __init__(self, name=None, data=None): def __init__(self, name=None, data=None):
"""Initialize a new table. """Initialize a new table.
name (unicode): Optional table name for reference. name (str): Optional table name for reference.
data (dict): Initial data, used to hint Bloom Filter. data (dict): Initial data, used to hint Bloom Filter.
RETURNS (Table): The newly created object. RETURNS (Table): The newly created object.
@ -197,7 +197,7 @@ class Table(OrderedDict):
def __setitem__(self, key, value): def __setitem__(self, key, value):
"""Set new key/value pair. String keys will be hashed. """Set new key/value pair. String keys will be hashed.
key (unicode / int): The key to set. key (str / int): The key to set.
value: The value to set. value: The value to set.
""" """
key = get_string_id(key) key = get_string_id(key)
@ -208,7 +208,7 @@ class Table(OrderedDict):
"""Set new key/value pair. String keys will be hashed. """Set new key/value pair. String keys will be hashed.
Same as table[key] = value. Same as table[key] = value.
key (unicode / int): The key to set. key (str / int): The key to set.
value: The value to set. value: The value to set.
""" """
self[key] = value self[key] = value
@ -216,7 +216,7 @@ class Table(OrderedDict):
def __getitem__(self, key): def __getitem__(self, key):
"""Get the value for a given key. String keys will be hashed. """Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get. key (str / int): The key to get.
RETURNS: The value. RETURNS: The value.
""" """
key = get_string_id(key) key = get_string_id(key)
@ -225,7 +225,7 @@ class Table(OrderedDict):
def get(self, key, default=None): def get(self, key, default=None):
"""Get the value for a given key. String keys will be hashed. """Get the value for a given key. String keys will be hashed.
key (unicode / int): The key to get. key (str / int): The key to get.
default: The default value to return. default: The default value to return.
RETURNS: The value. RETURNS: The value.
""" """
@ -235,7 +235,7 @@ class Table(OrderedDict):
def __contains__(self, key): def __contains__(self, key):
"""Check whether a key is in the table. String keys will be hashed. """Check whether a key is in the table. String keys will be hashed.
key (unicode / int): The key to check. key (str / int): The key to check.
RETURNS (bool): Whether the key is in the table. RETURNS (bool): Whether the key is in the table.
""" """
key = get_string_id(key) key = get_string_id(key)

View File

@ -66,7 +66,7 @@ cdef class DependencyMatcher:
def __contains__(self, key): def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID. """Check whether the matcher contains rules for a match ID.
key (unicode): The match ID. key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID. RETURNS (bool): Whether the matcher contains rules for this match ID.
""" """
return self._normalize_key(key) in self._patterns return self._normalize_key(key) in self._patterns
@ -194,7 +194,7 @@ cdef class DependencyMatcher:
def get(self, key, default=None): def get(self, key, default=None):
"""Retrieve the pattern stored for a key. """Retrieve the pattern stored for a key.
key (unicode or int): The key to retrieve. key (str / int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple. RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
""" """
key = self._normalize_key(key) key = self._normalize_key(key)

View File

@ -64,7 +64,7 @@ cdef class Matcher:
def __contains__(self, key): def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID. """Check whether the matcher contains rules for a match ID.
key (unicode): The match ID. key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID. RETURNS (bool): Whether the matcher contains rules for this match ID.
""" """
return self._normalize_key(key) in self._patterns return self._normalize_key(key) in self._patterns
@ -98,7 +98,7 @@ cdef class Matcher:
number of arguments). The on_match callback becomes an optional keyword number of arguments). The on_match callback becomes an optional keyword
argument. argument.
key (unicode): The match ID. key (str): The match ID.
patterns (list): The patterns to add for the given key. patterns (list): The patterns to add for the given key.
on_match (callable): Optional callback executed on match. on_match (callable): Optional callback executed on match.
*_patterns (list): For backwards compatibility: list of patterns to add *_patterns (list): For backwards compatibility: list of patterns to add
@ -139,7 +139,7 @@ cdef class Matcher:
"""Remove a rule from the matcher. A KeyError is raised if the key does """Remove a rule from the matcher. A KeyError is raised if the key does
not exist. not exist.
key (unicode): The ID of the match rule. key (str): The ID of the match rule.
""" """
norm_key = self._normalize_key(key) norm_key = self._normalize_key(key)
if not norm_key in self._patterns: if not norm_key in self._patterns:
@ -166,7 +166,7 @@ cdef class Matcher:
def get(self, key, default=None): def get(self, key, default=None):
"""Retrieve the pattern stored for a key. """Retrieve the pattern stored for a key.
key (unicode or int): The key to retrieve. key (str / int): The key to retrieve.
RETURNS (tuple): The rule, as an (on_match, patterns) tuple. RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
""" """
key = self._normalize_key(key) key = self._normalize_key(key)

View File

@ -30,7 +30,7 @@ cdef class PhraseMatcher:
"""Initialize the PhraseMatcher. """Initialize the PhraseMatcher.
vocab (Vocab): The shared vocabulary. vocab (Vocab): The shared vocabulary.
attr (int / unicode): Token attribute to match on. attr (int / str): Token attribute to match on.
validate (bool): Perform additional validation when patterns are added. validate (bool): Perform additional validation when patterns are added.
RETURNS (PhraseMatcher): The newly constructed object. RETURNS (PhraseMatcher): The newly constructed object.
@ -70,7 +70,7 @@ cdef class PhraseMatcher:
def __contains__(self, key): def __contains__(self, key):
"""Check whether the matcher contains rules for a match ID. """Check whether the matcher contains rules for a match ID.
key (unicode): The match ID. key (str): The match ID.
RETURNS (bool): Whether the matcher contains rules for this match ID. RETURNS (bool): Whether the matcher contains rules for this match ID.
DOCS: https://spacy.io/api/phrasematcher#contains DOCS: https://spacy.io/api/phrasematcher#contains
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
"""Remove a rule from the matcher by match ID. A KeyError is raised if """Remove a rule from the matcher by match ID. A KeyError is raised if
the key does not exist. the key does not exist.
key (unicode): The match ID. key (str): The match ID.
DOCS: https://spacy.io/api/phrasematcher#remove DOCS: https://spacy.io/api/phrasematcher#remove
""" """
@ -159,7 +159,7 @@ cdef class PhraseMatcher:
number of arguments). The on_match callback becomes an optional keyword number of arguments). The on_match callback becomes an optional keyword
argument. argument.
key (unicode): The match ID. key (str): The match ID.
docs (list): List of `Doc` objects representing match patterns. docs (list): List of `Doc` objects representing match patterns.
on_match (callable): Callback executed on match. on_match (callable): Callback executed on match.
*_docs (Doc): For backwards compatibility: list of patterns to add *_docs (Doc): For backwards compatibility: list of patterns to add

View File

@ -15,10 +15,10 @@ def build_tb_parser_model(
use_upper=True, use_upper=True,
nO=None, nO=None,
): ):
token_vector_width = tok2vec.get_dim("nO") t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
tok2vec = chain( tok2vec = chain(
tok2vec, tok2vec,
with_array(Linear(hidden_width, token_vector_width)), with_array(Linear(hidden_width, t2v_width)),
list2array(), list2array(),
) )
tok2vec.set_dim("nO", hidden_width) tok2vec.set_dim("nO", hidden_width)

View File

@ -6,9 +6,9 @@ from ...util import registry
@registry.architectures.register("spacy.Tagger.v1") @registry.architectures.register("spacy.Tagger.v1")
def build_tagger_model(tok2vec, nO=None) -> Model: def build_tagger_model(tok2vec, nO=None) -> Model:
token_vector_width = tok2vec.get_dim("nO")
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?! # TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init) t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
softmax = with_array(output_layer) softmax = with_array(output_layer)
model = chain(tok2vec, softmax) model = chain(tok2vec, softmax)
model.set_ref("tok2vec", tok2vec) model.set_ref("tok2vec", tok2vec)

View File

@ -38,8 +38,8 @@ def forward(model, X, is_train):
def init(model, X=None, Y=None): def init(model, X=None, Y=None):
tok2vec = model.get_ref("tok2vec").initialize() tok2vec = model.get_ref("tok2vec").initialize(X=X)
lower = model.get_ref("lower").initialize(X=X) lower = model.get_ref("lower").initialize()
if model.attrs["has_upper"]: if model.attrs["has_upper"]:
statevecs = model.ops.alloc2f(2, lower.get_dim("nO")) statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
model.get_ref("upper").initialize(X=statevecs) model.get_ref("upper").initialize(X=statevecs)

View File

@ -198,8 +198,8 @@ cdef class Morphology:
"""Add a special-case rule to the morphological analyser. Tokens whose """Add a special-case rule to the morphological analyser. Tokens whose
tag and orth match the rule will receive the specified properties. tag and orth match the rule will receive the specified properties.
tag (unicode): The part-of-speech tag to key the exception. tag (str): The part-of-speech tag to key the exception.
orth (unicode): The word-form to key the exception. orth (str): The word-form to key the exception.
""" """
attrs = dict(attrs) attrs = dict(attrs)
attrs = _normalize_props(attrs) attrs = _normalize_props(attrs)

View File

@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
fulfilled (e.g. if previous components assign the attributes). fulfilled (e.g. if previous components assign the attributes).
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
name (unicode): The name of the pipeline component to analyze. name (str): The name of the pipeline component to analyze.
pipe (callable): The pipeline component function to analyze. pipe (callable): The pipeline component function to analyze.
index (int): The index of the component in the pipeline. index (int): The index of the component in the pipeline.
warn (bool): Show user warning if problem is found. warn (bool): Show user warning if problem is found.
@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
"""Get all pipeline components that assign an attr, e.g. "doc.tensor". """Get all pipeline components that assign an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
attr (unicode): The attribute to check. attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that assign the attr. RETURNS (list): (name, pipeline) tuples of components that assign the attr.
""" """
return _get_feature_for_attr(pipeline, attr, "assigns") return _get_feature_for_attr(pipeline, attr, "assigns")
@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
"""Get all pipeline components that require an attr, e.g. "doc.tensor". """Get all pipeline components that require an attr, e.g. "doc.tensor".
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline. pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
attr (unicode): The attribute to check. attr (str): The attribute to check.
RETURNS (list): (name, pipeline) tuples of components that require the attr. RETURNS (list): (name, pipeline) tuples of components that require the attr.
""" """
return _get_feature_for_attr(pipeline, attr, "requires") return _get_feature_for_attr(pipeline, attr, "requires")
@ -173,3 +173,22 @@ def print_summary(nlp, pretty=True, no_print=False):
msg.good("No problems found.") msg.good("No problems found.")
if no_print: if no_print:
return {"overview": overview, "problems": problems} return {"overview": overview, "problems": problems}
def count_pipeline_interdependencies(pipeline):
    """For every component in the pipeline, count the later components that
    require at least one of the attributes it assigns.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    RETURNS (list): One count per component, in pipeline order.
    """
    # Gather the declared attributes in a single pass over the pipeline. A
    # component without "assigns" / "requires" contributes an empty set.
    declared = [
        (set(getattr(proc, "assigns", [])), set(getattr(proc, "requires", [])))
        for _, proc in pipeline
    ]
    # Only components further down the pipeline count as dependents, hence
    # the slice starting right after the current position.
    return [
        sum(
            1
            for _, needed in declared[position + 1 :]
            if not provided.isdisjoint(needed)
        )
        for position, (provided, _) in enumerate(declared)
    ]

View File

@ -30,7 +30,7 @@ class EntityRuler(object):
nlp (Language): The shared nlp object to pass the vocab to the matchers nlp (Language): The shared nlp object to pass the vocab to the matchers
and process phrase patterns. and process phrase patterns.
phrase_matcher_attr (int / unicode): Token attribute to match on, passed phrase_matcher_attr (int / str): Token attribute to match on, passed
to the internal PhraseMatcher as `attr` to the internal PhraseMatcher as `attr`
validate (bool): Whether patterns should be validated, passed to validate (bool): Whether patterns should be validated, passed to
Matcher and PhraseMatcher as `validate` Matcher and PhraseMatcher as `validate`
@ -315,7 +315,7 @@ class EntityRuler(object):
"""Load the entity ruler from a file. Expects a file containing """Load the entity ruler from a file. Expects a file containing
newline-delimited JSON (JSONL) with one entry per line. newline-delimited JSON (JSONL) with one entry per line.
path (unicode / Path): The JSONL file to load. path (str / Path): The JSONL file to load.
**kwargs: Other config paramters, mostly for consistency. **kwargs: Other config paramters, mostly for consistency.
RETURNS (EntityRuler): The loaded entity ruler. RETURNS (EntityRuler): The loaded entity ruler.
@ -351,7 +351,7 @@ class EntityRuler(object):
"""Save the entity ruler patterns to a directory. The patterns will be """Save the entity ruler patterns to a directory. The patterns will be
saved as newline-delimited JSON (JSONL). saved as newline-delimited JSON (JSONL).
path (unicode / Path): The JSONL file to save. path (str / Path): The JSONL file to save.
**kwargs: Other config parameters, mostly for consistency. **kwargs: Other config parameters, mostly for consistency.
DOCS: https://spacy.io/api/entityruler#to_disk DOCS: https://spacy.io/api/entityruler#to_disk

View File

@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
"""Merge subtokens into a single token. """Merge subtokens into a single token.
doc (Doc): The Doc object. doc (Doc): The Doc object.
label (unicode): The subtoken dependency label. label (str): The subtoken dependency label.
RETURNS (Doc): The Doc object with merged subtokens. RETURNS (Doc): The Doc object with merged subtokens.
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens

View File

@ -531,7 +531,16 @@ class Tagger(Pipe):
vocab.morphology.lemmatizer, vocab.morphology.lemmatizer,
exc=vocab.morphology.exc) exc=vocab.morphology.exc)
self.set_output(len(self.labels)) self.set_output(len(self.labels))
self.model.initialize() doc_sample = [Doc(self.vocab, words=["hello", "world"])]
if pipeline is not None:
for name, component in pipeline:
if component is self:
break
if hasattr(component, "pipe"):
doc_sample = list(component.pipe(doc_sample))
else:
doc_sample = [component(doc) for doc in doc_sample]
self.model.initialize(X=doc_sample)
# Get batch of example docs, example outputs to call begin_training(). # Get batch of example docs, example outputs to call begin_training().
# This lets the model infer shapes. # This lets the model infer shapes.
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)

View File

@ -109,7 +109,7 @@ cdef class StringStore:
"""Retrieve a string from a given hash, or vice versa. """Retrieve a string from a given hash, or vice versa.
string_or_id (bytes, unicode or uint64): The value to encode. string_or_id (bytes, unicode or uint64): The value to encode.
Returns (unicode or uint64): The value to be retrieved. Returns (str / uint64): The value to be retrieved.
""" """
if isinstance(string_or_id, basestring) and len(string_or_id) == 0: if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
return 0 return 0
@ -152,7 +152,7 @@ cdef class StringStore:
def add(self, string): def add(self, string):
"""Add a string to the StringStore. """Add a string to the StringStore.
string (unicode): The string to add. string (str): The string to add.
RETURNS (uint64): The string's hash value. RETURNS (uint64): The string's hash value.
""" """
if isinstance(string, unicode): if isinstance(string, unicode):
@ -179,7 +179,7 @@ cdef class StringStore:
def __contains__(self, string not None): def __contains__(self, string not None):
"""Check whether a string is in the store. """Check whether a string is in the store.
string (unicode): The string to check. string (str): The string to check.
RETURNS (bool): Whether the store contains the string. RETURNS (bool): Whether the store contains the string.
""" """
cdef hash_t key cdef hash_t key
@ -205,7 +205,7 @@ cdef class StringStore:
def __iter__(self): def __iter__(self):
"""Iterate over the strings in the store, in order. """Iterate over the strings in the store, in order.
YIELDS (unicode): A string in the store. YIELDS (str): A string in the store.
""" """
cdef int i cdef int i
cdef hash_t key cdef hash_t key
@ -223,7 +223,7 @@ cdef class StringStore:
def to_disk(self, path): def to_disk(self, path):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
""" """
path = util.ensure_path(path) path = util.ensure_path(path)
@ -234,7 +234,7 @@ cdef class StringStore:
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode or Path): A path to a directory. Paths may be either path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects. strings or `Path`-like objects.
RETURNS (StringStore): The modified `StringStore` object. RETURNS (StringStore): The modified `StringStore` object.
""" """

View File

@ -624,12 +624,25 @@ cdef class Parser:
sgd = self.create_optimizer() sgd = self.create_optimizer()
doc_sample = [] doc_sample = []
gold_sample = [] gold_sample = []
for example in islice(get_examples(), 1000): for example in islice(get_examples(), 10):
parses = example.get_gold_parses(merge=False, vocab=self.vocab) parses = example.get_gold_parses(merge=False, vocab=self.vocab)
for doc, gold in parses: for doc, gold in parses:
if len(doc):
doc_sample.append(doc) doc_sample.append(doc)
gold_sample.append(gold) gold_sample.append(gold)
self.model.initialize(doc_sample, gold_sample)
if pipeline is not None:
for name, component in pipeline:
if component is self:
break
if hasattr(component, "pipe"):
doc_sample = list(component.pipe(doc_sample))
else:
doc_sample = [component(doc) for doc in doc_sample]
if doc_sample:
self.model.initialize(doc_sample)
else:
self.model.initialize()
if pipeline is not None: if pipeline is not None:
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg) self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
link_vectors_to_models(self.vocab) link_vectors_to_models(self.vocab)

View File

@ -9,7 +9,6 @@ def test_build_dependencies():
"pytest-timeout", "pytest-timeout",
"mock", "mock",
"flake8", "flake8",
"jsonschema",
] ]
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"] libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]

View File

@ -1,7 +1,8 @@
import spacy.language import spacy.language
from spacy.language import Language, component from spacy.language import Language, component
from spacy.analysis import print_summary, validate_attrs from spacy.pipe_analysis import print_summary, validate_attrs
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
from spacy.pipe_analysis import count_pipeline_interdependencies
from mock import Mock, ANY from mock import Mock, ANY
import pytest import pytest
@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe():
with pytest.warns(None) as record: with pytest.warns(None) as record:
nlp.remove_pipe("c2") nlp.remove_pipe("c2")
assert not record.list assert not record.list
def test_pipe_interdependencies():
    """A component whose `assigns` overlaps a later component's `requires`
    should be counted once; the final component has no dependents.
    """

    class Fancifier:
        name = "fancifier"
        assigns = ("doc._.fancy",)
        requires = tuple()

    class FancyNeeder:
        name = "needer"
        assigns = tuple()
        requires = ("doc._.fancy",)

    pipeline = [("fancifier", Fancifier()), ("needer", FancyNeeder())]
    counts = count_pipeline_interdependencies(pipeline)
    assert counts == [1, 0]

View File

@ -2,9 +2,11 @@ import pytest
import os import os
import ctypes import ctypes
from pathlib import Path from pathlib import Path
from spacy.about import __version__ as spacy_version
from spacy import util from spacy import util
from spacy import prefer_gpu, require_gpu from spacy import prefer_gpu, require_gpu
from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding from spacy.ml._precomputable_affine import PrecomputableAffine
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
@pytest.fixture @pytest.fixture
@ -24,10 +26,12 @@ def test_util_ensure_path_succeeds(text):
assert isinstance(path, Path) assert isinstance(path, Path)
@pytest.mark.parametrize("package", ["numpy"]) @pytest.mark.parametrize(
def test_util_is_package(package): "package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
)
def test_util_is_package(package, result):
"""Test that an installed package via pip is recognised by util.is_package.""" """Test that an installed package via pip is recognised by util.is_package."""
assert util.is_package(package) assert util.is_package(package) is result
@pytest.mark.parametrize("package", ["thinc"]) @pytest.mark.parametrize("package", ["thinc"])
@ -87,3 +91,21 @@ def test_ascii_filenames():
root = Path(__file__).parent.parent root = Path(__file__).parent.parent
for path in root.glob("**/*"): for path in root.glob("**/*"):
assert all(ord(c) < 128 for c in path.name), path.name assert all(ord(c) < 128 for c in path.name), path.name
@pytest.mark.parametrize(
"version,constraint,compatible",
[
(spacy_version, spacy_version, True),
(spacy_version, f">={spacy_version}", True),
("3.0.0", "2.0.0", False),
("3.2.1", ">=2.0.0", True),
("2.2.10a1", ">=1.0.0,<2.1.1", False),
("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
("n/a", ">=1.2.3,<4.5.6", None),
("1.2.3", "n/a", None),
("n/a", "n/a", None),
],
)
def test_is_compatible_version(version, constraint, compatible):
assert util.is_compatible_version(version, constraint) is compatible

59
spacy/tests/test_util.py Normal file
View File

@ -0,0 +1,59 @@
import pytest
from spacy.gold import Example
from .util import get_random_doc
from spacy.util import minibatch_by_words
@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 400, 199], [3]),
        ([400, 400, 199, 3], [4]),
        ([400, 400, 199, 3, 200], [3, 2]),
        ([400, 400, 199, 3, 1], [5]),
        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
        ([400, 400, 199, 3, 1, 200], [3, 3]),
        ([400, 400, 199, 3, 1, 999], [3, 3]),
        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
        ([1, 2, 999], [3]),
        ([1, 2, 999, 1], [4]),
        ([1, 200, 999, 1], [2, 2]),
        ([1, 999, 200, 1], [2, 2]),
    ],
)
def test_util_minibatch(doc_sizes, expected_batches):
    """Check the batch lengths produced by minibatch_by_words when called with
    discard_oversize=True, and that the summed doc lengths of every batch stay
    below the batch size plus tolerance.
    """
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    examples = [Example(doc=doc) for doc in docs]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(
            examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
        )
    )
    assert [len(batch) for batch in batches] == expected_batches
    # Upper bound used by this test: the nominal size plus the tolerance share.
    max_size = batch_size + batch_size * tol
    for batch in batches:
        assert sum([len(example.doc) for example in batch]) < max_size
@pytest.mark.parametrize(
    "doc_sizes, expected_batches",
    [
        ([400, 4000, 199], [1, 2]),
        ([400, 400, 199, 3000, 200], [1, 4]),
        ([400, 400, 199, 3, 1, 1500], [1, 5]),
        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
        ([1, 2, 9999], [1, 2]),
        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
    ],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
    """Test that oversized documents are returned in their own batch when
    minibatch_by_words is called with discard_oversize=False.
    """
    docs = [get_random_doc(doc_size) for doc_size in doc_sizes]
    examples = [Example(doc=doc) for doc in docs]
    tol = 0.2
    batch_size = 1000
    batches = list(
        minibatch_by_words(
            examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
        )
    )
    assert [len(batch) for batch in batches] == expected_batches

View File

@ -92,6 +92,13 @@ def get_batch(batch_size):
return docs return docs
def get_random_doc(n_words):
    """Build a Doc of `n_words` words on a fresh Vocab.

    Despite the name, the content is deterministic: the words are the decimal
    strings "0", "1", ..., str(n_words - 1).

    n_words (int): Number of words to put in the doc.
    RETURNS (Doc): The constructed doc.
    """
    vocab = Vocab()
    # Make the words numbers, so that they're easy to track.
    numbers = [str(i) for i in range(n_words)]
    return Doc(vocab, words=numbers)
def apply_transition_sequence(parser, doc, sequence): def apply_transition_sequence(parser, doc, sequence):
"""Perform a series of pre-specified transitions, to put the parser in a """Perform a series of pre-specified transitions, to put the parser in a
desired state.""" desired state."""

View File

@ -134,7 +134,7 @@ cdef class Tokenizer:
def __call__(self, unicode string): def __call__(self, unicode string):
"""Tokenize a string. """Tokenize a string.
string (unicode): The string to tokenize. string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations. RETURNS (Doc): A container for linguistic annotations.
DOCS: https://spacy.io/api/tokenizer#call DOCS: https://spacy.io/api/tokenizer#call
@ -147,7 +147,7 @@ cdef class Tokenizer:
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases): cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
"""Tokenize according to affix and token_match settings. """Tokenize according to affix and token_match settings.
string (unicode): The string to tokenize. string (str): The string to tokenize.
RETURNS (Doc): A container for linguistic annotations. RETURNS (Doc): A container for linguistic annotations.
""" """
if len(string) >= (2 ** 30): if len(string) >= (2 ** 30):
@ -527,7 +527,7 @@ cdef class Tokenizer:
def find_infix(self, unicode string): def find_infix(self, unicode string):
"""Find internal split points of the string, such as hyphens. """Find internal split points of the string, such as hyphens.
string (unicode): The string to segment. string (str): The string to segment.
RETURNS (list): A list of `re.MatchObject` objects that have `.start()` RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
and `.end()` methods, denoting the placement of internal segment and `.end()` methods, denoting the placement of internal segment
separators, e.g. hyphens. separators, e.g. hyphens.
@ -542,7 +542,7 @@ cdef class Tokenizer:
"""Find the length of a prefix that should be segmented from the """Find the length of a prefix that should be segmented from the
string, or None if no prefix rules match. string, or None if no prefix rules match.
string (unicode): The string to segment. string (str): The string to segment.
RETURNS (int): The length of the prefix if present, otherwise `None`. RETURNS (int): The length of the prefix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_prefix DOCS: https://spacy.io/api/tokenizer#find_prefix
@ -556,7 +556,7 @@ cdef class Tokenizer:
"""Find the length of a suffix that should be segmented from the """Find the length of a suffix that should be segmented from the
string, or None if no suffix rules match. string, or None if no suffix rules match.
string (unicode): The string to segment. string (str): The string to segment.
Returns (int): The length of the suffix if present, otherwise `None`. Returns (int): The length of the suffix if present, otherwise `None`.
DOCS: https://spacy.io/api/tokenizer#find_suffix DOCS: https://spacy.io/api/tokenizer#find_suffix
@ -576,7 +576,7 @@ cdef class Tokenizer:
def _validate_special_case(self, chunk, substrings): def _validate_special_case(self, chunk, substrings):
"""Check whether the `ORTH` fields match the string. """Check whether the `ORTH` fields match the string.
string (unicode): The string to specially tokenize. string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. a token and its attributes.
""" """
@ -588,7 +588,7 @@ cdef class Tokenizer:
def add_special_case(self, unicode string, substrings): def add_special_case(self, unicode string, substrings):
"""Add a special-case tokenization rule. """Add a special-case tokenization rule.
string (unicode): The string to specially tokenize. string (str): The string to specially tokenize.
substrings (iterable): A sequence of dicts, where each dict describes substrings (iterable): A sequence of dicts, where each dict describes
a token and its attributes. The `ORTH` fields of the attributes a token and its attributes. The `ORTH` fields of the attributes
must exactly match the string when they are concatenated. must exactly match the string when they are concatenated.
@ -629,7 +629,7 @@ cdef class Tokenizer:
produced are identical to `nlp.tokenizer()` except for whitespace produced are identical to `nlp.tokenizer()` except for whitespace
tokens. tokens.
string (unicode): The string to tokenize. string (str): The string to tokenize.
RETURNS (list): A list of (pattern_string, token_string) tuples RETURNS (list): A list of (pattern_string, token_string) tuples
DOCS: https://spacy.io/api/tokenizer#explain DOCS: https://spacy.io/api/tokenizer#explain
@ -693,7 +693,7 @@ cdef class Tokenizer:
def to_disk(self, path, **kwargs): def to_disk(self, path, **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (str / Path): A path to a directory, which will be created if
it doesn't exist. it doesn't exist.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
@ -707,7 +707,7 @@ cdef class Tokenizer:
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode or Path): A path to a directory. path (str / Path): A path to a directory.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
RETURNS (Tokenizer): The modified `Tokenizer` object. RETURNS (Tokenizer): The modified `Tokenizer` object.

View File

@ -117,7 +117,7 @@ cdef class Doc:
def set_extension(cls, name, **kwargs): def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Doc._`. """Define a custom attribute which becomes available as `Doc._`.
name (unicode): Name of the attribute to set. name (str): Name of the attribute to set.
default: Optional default value of the attribute. default: Optional default value of the attribute.
getter (callable): Optional getter function. getter (callable): Optional getter function.
setter (callable): Optional setter function. setter (callable): Optional setter function.
@ -135,7 +135,7 @@ cdef class Doc:
def get_extension(cls, name): def get_extension(cls, name):
"""Look up a previously registered extension by name. """Look up a previously registered extension by name.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple. RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/doc#get_extension DOCS: https://spacy.io/api/doc#get_extension
@ -146,7 +146,7 @@ cdef class Doc:
def has_extension(cls, name): def has_extension(cls, name):
"""Check whether an extension has been registered. """Check whether an extension has been registered.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered. RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/doc#has_extension DOCS: https://spacy.io/api/doc#has_extension
@ -157,7 +157,7 @@ cdef class Doc:
def remove_extension(cls, name): def remove_extension(cls, name):
"""Remove a previously registered extension. """Remove a previously registered extension.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension. removed extension.
@ -483,7 +483,7 @@ cdef class Doc:
def text(self): def text(self):
"""A unicode representation of the document text. """A unicode representation of the document text.
RETURNS (unicode): The original verbatim text of the document. RETURNS (str): The original verbatim text of the document.
""" """
return "".join(t.text_with_ws for t in self) return "".join(t.text_with_ws for t in self)
@ -492,7 +492,7 @@ cdef class Doc:
"""An alias of `Doc.text`, provided for duck-type compatibility with """An alias of `Doc.text`, provided for duck-type compatibility with
`Span` and `Token`. `Span` and `Token`.
RETURNS (unicode): The original verbatim text of the document. RETURNS (str): The original verbatim text of the document.
""" """
return self.text return self.text
@ -637,7 +637,7 @@ cdef class Doc:
@property @property
def lang_(self): def lang_(self):
"""RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'.""" """RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
return self.vocab.lang return self.vocab.lang
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1: cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
@ -852,7 +852,7 @@ cdef class Doc:
def to_disk(self, path, **kwargs): def to_disk(self, path, **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode or Path): A path to a directory, which will be created if path (str / Path): A path to a directory, which will be created if
it doesn't exist. Paths may be either strings or Path-like objects. it doesn't exist. Paths may be either strings or Path-like objects.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
@ -866,7 +866,7 @@ cdef class Doc:
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode or Path): A path to a directory. Paths may be either path (str / Path): A path to a directory. Paths may be either
strings or `Path`-like objects. strings or `Path`-like objects.
exclude (list): String names of serialization fields to exclude. exclude (list): String names of serialization fields to exclude.
RETURNS (Doc): The modified `Doc` object. RETURNS (Doc): The modified `Doc` object.

View File

@ -33,7 +33,7 @@ cdef class Span:
def set_extension(cls, name, **kwargs): def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Span._`. """Define a custom attribute which becomes available as `Span._`.
name (unicode): Name of the attribute to set. name (str): Name of the attribute to set.
default: Optional default value of the attribute. default: Optional default value of the attribute.
getter (callable): Optional getter function. getter (callable): Optional getter function.
setter (callable): Optional setter function. setter (callable): Optional setter function.
@ -51,7 +51,7 @@ cdef class Span:
def get_extension(cls, name): def get_extension(cls, name):
"""Look up a previously registered extension by name. """Look up a previously registered extension by name.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple. RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/span#get_extension DOCS: https://spacy.io/api/span#get_extension
@ -62,7 +62,7 @@ cdef class Span:
def has_extension(cls, name): def has_extension(cls, name):
"""Check whether an extension has been registered. """Check whether an extension has been registered.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered. RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/span#has_extension DOCS: https://spacy.io/api/span#has_extension
@ -73,7 +73,7 @@ cdef class Span:
def remove_extension(cls, name): def remove_extension(cls, name):
"""Remove a previously registered extension. """Remove a previously registered extension.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension. removed extension.
@ -491,7 +491,7 @@ cdef class Span:
@property @property
def text(self): def text(self):
"""RETURNS (unicode): The original verbatim text of the span.""" """RETURNS (str): The original verbatim text of the span."""
text = self.text_with_ws text = self.text_with_ws
if self[-1].whitespace_: if self[-1].whitespace_:
text = text[:-1] text = text[:-1]
@ -502,7 +502,7 @@ cdef class Span:
"""The text content of the span with a trailing whitespace character if """The text content of the span with a trailing whitespace character if
the last token has one. the last token has one.
RETURNS (unicode): The text content of the span (with trailing RETURNS (str): The text content of the span (with trailing
whitespace). whitespace).
""" """
return "".join([t.text_with_ws for t in self]) return "".join([t.text_with_ws for t in self])
@ -678,7 +678,7 @@ cdef class Span:
raise NotImplementedError(TempErrors.T007.format(attr="ent_id")) raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
property ent_id_: property ent_id_:
"""RETURNS (unicode): The (string) entity ID.""" """RETURNS (str): The (string) entity ID."""
def __get__(self): def __get__(self):
return self.root.ent_id_ return self.root.ent_id_
@ -690,12 +690,12 @@ cdef class Span:
"""Verbatim text content (identical to `Span.text`). Exists mostly for """Verbatim text content (identical to `Span.text`). Exists mostly for
consistency with other attributes. consistency with other attributes.
RETURNS (unicode): The span's text.""" RETURNS (str): The span's text."""
return self.text return self.text
@property @property
def lemma_(self): def lemma_(self):
"""RETURNS (unicode): The span's lemma.""" """RETURNS (str): The span's lemma."""
return " ".join([t.lemma_ for t in self]).strip() return " ".join([t.lemma_ for t in self]).strip()
@property @property
@ -714,7 +714,7 @@ cdef class Span:
return "".join([t.text_with_ws for t in self]) return "".join([t.text_with_ws for t in self])
property label_: property label_:
"""RETURNS (unicode): The span's label.""" """RETURNS (str): The span's label."""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.label] return self.doc.vocab.strings[self.label]
@ -724,7 +724,7 @@ cdef class Span:
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_)) raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
property kb_id_: property kb_id_:
"""RETURNS (unicode): The named entity's KB ID.""" """RETURNS (str): The named entity's KB ID."""
def __get__(self): def __get__(self):
return self.doc.vocab.strings[self.kb_id] return self.doc.vocab.strings[self.kb_id]

View File

@ -36,7 +36,7 @@ cdef class Token:
def set_extension(cls, name, **kwargs): def set_extension(cls, name, **kwargs):
"""Define a custom attribute which becomes available as `Token._`. """Define a custom attribute which becomes available as `Token._`.
name (unicode): Name of the attribute to set. name (str): Name of the attribute to set.
default: Optional default value of the attribute. default: Optional default value of the attribute.
getter (callable): Optional getter function. getter (callable): Optional getter function.
setter (callable): Optional setter function. setter (callable): Optional setter function.
@ -54,7 +54,7 @@ cdef class Token:
def get_extension(cls, name): def get_extension(cls, name):
"""Look up a previously registered extension by name. """Look up a previously registered extension by name.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple. RETURNS (tuple): A `(default, method, getter, setter)` tuple.
DOCS: https://spacy.io/api/token#get_extension DOCS: https://spacy.io/api/token#get_extension
@ -65,7 +65,7 @@ cdef class Token:
def has_extension(cls, name): def has_extension(cls, name):
"""Check whether an extension has been registered. """Check whether an extension has been registered.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (bool): Whether the extension has been registered. RETURNS (bool): Whether the extension has been registered.
DOCS: https://spacy.io/api/token#has_extension DOCS: https://spacy.io/api/token#has_extension
@ -76,7 +76,7 @@ cdef class Token:
def remove_extension(cls, name): def remove_extension(cls, name):
"""Remove a previously registered extension. """Remove a previously registered extension.
name (unicode): Name of the extension. name (str): Name of the extension.
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
removed extension. removed extension.
@ -244,12 +244,12 @@ cdef class Token:
@property @property
def text(self): def text(self):
"""RETURNS (unicode): The original verbatim text of the token.""" """RETURNS (str): The original verbatim text of the token."""
return self.orth_ return self.orth_
@property @property
def text_with_ws(self): def text_with_ws(self):
"""RETURNS (unicode): The text content of the span (with trailing """RETURNS (str): The text content of the span (with trailing
whitespace). whitespace).
""" """
cdef unicode orth = self.vocab.strings[self.c.lex.orth] cdef unicode orth = self.vocab.strings[self.c.lex.orth]
@ -762,7 +762,7 @@ cdef class Token:
self.c.ent_type = ent_type self.c.ent_type = ent_type
property ent_type_: property ent_type_:
"""RETURNS (unicode): Named entity type.""" """RETURNS (str): Named entity type."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_type] return self.vocab.strings[self.c.ent_type]
@ -785,7 +785,7 @@ cdef class Token:
and "" means no entity tag is set. "B" with an empty ent_type and "" means no entity tag is set. "B" with an empty ent_type
means that the token is blocked from further processing by NER. means that the token is blocked from further processing by NER.
RETURNS (unicode): IOB code of named entity tag. RETURNS (str): IOB code of named entity tag.
""" """
iob_strings = ("", "I", "O", "B") iob_strings = ("", "I", "O", "B")
return iob_strings[self.c.ent_iob] return iob_strings[self.c.ent_iob]
@ -801,7 +801,7 @@ cdef class Token:
self.c.ent_id = key self.c.ent_id = key
property ent_id_: property ent_id_:
"""RETURNS (unicode): ID of the entity the token is an instance of, """RETURNS (str): ID of the entity the token is an instance of,
if any. if any.
""" """
def __get__(self): def __get__(self):
@ -819,7 +819,7 @@ cdef class Token:
self.c.ent_kb_id = ent_kb_id self.c.ent_kb_id = ent_kb_id
property ent_kb_id_: property ent_kb_id_:
"""RETURNS (unicode): Named entity KB ID.""" """RETURNS (str): Named entity KB ID."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.ent_kb_id] return self.vocab.strings[self.c.ent_kb_id]
@ -828,12 +828,12 @@ cdef class Token:
@property @property
def whitespace_(self): def whitespace_(self):
"""RETURNS (unicode): The trailing whitespace character, if present.""" """RETURNS (str): The trailing whitespace character, if present."""
return " " if self.c.spacy else "" return " " if self.c.spacy else ""
@property @property
def orth_(self): def orth_(self):
"""RETURNS (unicode): Verbatim text content (identical to """RETURNS (str): Verbatim text content (identical to
`Token.text`). Exists mostly for consistency with the other `Token.text`). Exists mostly for consistency with the other
attributes. attributes.
""" """
@ -841,13 +841,13 @@ cdef class Token:
@property @property
def lower_(self): def lower_(self):
"""RETURNS (unicode): The lowercase token text. Equivalent to """RETURNS (str): The lowercase token text. Equivalent to
`Token.text.lower()`. `Token.text.lower()`.
""" """
return self.vocab.strings[self.c.lex.lower] return self.vocab.strings[self.c.lex.lower]
property norm_: property norm_:
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the """RETURNS (str): The token's norm, i.e. a normalised form of the
token text. Usually set in the language's tokenizer exceptions or token text. Usually set in the language's tokenizer exceptions or
norm exceptions. norm exceptions.
""" """
@ -859,34 +859,34 @@ cdef class Token:
@property @property
def shape_(self): def shape_(self):
"""RETURNS (unicode): Transform of the tokens's string, to show """RETURNS (str): Transform of the token's string, to show
orthographic features. For example, "Xxxx" or "dd". orthographic features. For example, "Xxxx" or "dd".
""" """
return self.vocab.strings[self.c.lex.shape] return self.vocab.strings[self.c.lex.shape]
@property @property
def prefix_(self): def prefix_(self):
"""RETURNS (unicode): A length-N substring from the start of the token. """RETURNS (str): A length-N substring from the start of the token.
Defaults to `N=1`. Defaults to `N=1`.
""" """
return self.vocab.strings[self.c.lex.prefix] return self.vocab.strings[self.c.lex.prefix]
@property @property
def suffix_(self): def suffix_(self):
"""RETURNS (unicode): A length-N substring from the end of the token. """RETURNS (str): A length-N substring from the end of the token.
Defaults to `N=3`. Defaults to `N=3`.
""" """
return self.vocab.strings[self.c.lex.suffix] return self.vocab.strings[self.c.lex.suffix]
@property @property
def lang_(self): def lang_(self):
"""RETURNS (unicode): Language of the parent document's vocabulary, """RETURNS (str): Language of the parent document's vocabulary,
e.g. 'en'. e.g. 'en'.
""" """
return self.vocab.strings[self.c.lex.lang] return self.vocab.strings[self.c.lex.lang]
property lemma_: property lemma_:
"""RETURNS (unicode): The token lemma, i.e. the base form of the word, """RETURNS (str): The token lemma, i.e. the base form of the word,
with no inflectional suffixes. with no inflectional suffixes.
""" """
def __get__(self): def __get__(self):
@ -899,7 +899,7 @@ cdef class Token:
self.c.lemma = self.vocab.strings.add(lemma_) self.c.lemma = self.vocab.strings.add(lemma_)
property pos_: property pos_:
"""RETURNS (unicode): Coarse-grained part-of-speech tag.""" """RETURNS (str): Coarse-grained part-of-speech tag."""
def __get__(self): def __get__(self):
return parts_of_speech.NAMES[self.c.pos] return parts_of_speech.NAMES[self.c.pos]
@ -907,7 +907,7 @@ cdef class Token:
self.c.pos = parts_of_speech.IDS[pos_name] self.c.pos = parts_of_speech.IDS[pos_name]
property tag_: property tag_:
"""RETURNS (unicode): Fine-grained part-of-speech tag.""" """RETURNS (str): Fine-grained part-of-speech tag."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.tag] return self.vocab.strings[self.c.tag]
@ -915,7 +915,7 @@ cdef class Token:
self.tag = self.vocab.strings.add(tag) self.tag = self.vocab.strings.add(tag)
property dep_: property dep_:
"""RETURNS (unicode): The syntactic dependency label.""" """RETURNS (str): The syntactic dependency label."""
def __get__(self): def __get__(self):
return self.vocab.strings[self.c.dep] return self.vocab.strings[self.c.dep]

View File

@ -15,6 +15,8 @@ import srsly
import catalogue import catalogue
import sys import sys
import warnings import warnings
from packaging.specifiers import SpecifierSet, InvalidSpecifier
from packaging.version import Version, InvalidVersion
try: try:
@ -22,9 +24,16 @@ try:
except ImportError: except ImportError:
cupy = None cupy = None
try: # Python 3.8
import importlib.metadata as importlib_metadata
except ImportError:
import importlib_metadata
from .symbols import ORTH from .symbols import ORTH
from .compat import cupy, CudaStream from .compat import cupy, CudaStream
from .errors import Errors, Warnings from .errors import Errors, Warnings
from . import about
_PRINT_ENV = False _PRINT_ENV = False
OOV_RANK = numpy.iinfo(numpy.uint64).max OOV_RANK = numpy.iinfo(numpy.uint64).max
@ -37,6 +46,10 @@ class registry(thinc.registry):
factories = catalogue.create("spacy", "factories", entry_points=True) factories = catalogue.create("spacy", "factories", entry_points=True)
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True) displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
assets = catalogue.create("spacy", "assets", entry_points=True) assets = catalogue.create("spacy", "assets", entry_points=True)
# This is mostly used to get a list of all installed models in the current
# environment. spaCy models packaged with `spacy package` will "advertise"
# themselves via entry points.
models = catalogue.create("spacy", "models", entry_points=True)
def set_env_log(value): def set_env_log(value):
@ -49,7 +62,7 @@ def lang_class_is_loaded(lang):
loaded lazily, to avoid expensive setup code associated with the language loaded lazily, to avoid expensive setup code associated with the language
data. data.
lang (unicode): Two-letter language code, e.g. 'en'. lang (str): Two-letter language code, e.g. 'en'.
RETURNS (bool): Whether a Language class has been loaded. RETURNS (bool): Whether a Language class has been loaded.
""" """
return lang in registry.languages return lang in registry.languages
@ -58,7 +71,7 @@ def lang_class_is_loaded(lang):
def get_lang_class(lang): def get_lang_class(lang):
"""Import and load a Language class. """Import and load a Language class.
lang (unicode): Two-letter language code, e.g. 'en'. lang (str): Two-letter language code, e.g. 'en'.
RETURNS (Language): Language class. RETURNS (Language): Language class.
""" """
# Check if language is registered / entry point is available # Check if language is registered / entry point is available
@ -76,7 +89,7 @@ def get_lang_class(lang):
def set_lang_class(name, cls): def set_lang_class(name, cls):
"""Set a custom Language class name that can be loaded via get_lang_class. """Set a custom Language class name that can be loaded via get_lang_class.
name (unicode): Name of Language class. name (str): Name of Language class.
cls (Language): Language class. cls (Language): Language class.
""" """
registry.languages.register(name, func=cls) registry.languages.register(name, func=cls)
@ -98,7 +111,7 @@ def load_language_data(path):
"""Load JSON language data using the given path as a base. If the provided """Load JSON language data using the given path as a base. If the provided
path isn't present, will attempt to load a gzipped version before giving up. path isn't present, will attempt to load a gzipped version before giving up.
path (unicode / Path): The data to load. path (str / Path): The data to load.
RETURNS: The loaded data. RETURNS: The loaded data.
""" """
path = ensure_path(path) path = ensure_path(path)
@ -119,7 +132,7 @@ def get_module_path(module):
def load_model(name, **overrides): def load_model(name, **overrides):
"""Load a model from a package or data path. """Load a model from a package or data path.
name (unicode): Package name or model path. name (str): Package name or model path.
**overrides: Specific overrides, like pipeline components to disable. **overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with the loaded model. RETURNS (Language): `Language` class with the loaded model.
""" """
@ -193,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
"""Helper function to use in the `load()` method of a model package's """Helper function to use in the `load()` method of a model package's
__init__.py. __init__.py.
init_file (unicode): Path to model's __init__.py, i.e. `__file__`. init_file (str): Path to model's __init__.py, i.e. `__file__`.
**overrides: Specific overrides, like pipeline components to disable. **overrides: Specific overrides, like pipeline components to disable.
RETURNS (Language): `Language` class with loaded model. RETURNS (Language): `Language` class with loaded model.
""" """
@ -206,11 +219,74 @@ def load_model_from_init_py(init_file, **overrides):
return load_model_from_path(data_path, meta, **overrides) return load_model_from_path(data_path, meta, **overrides)
def get_installed_models():
    """Collect the names of all model packages found in the current
    environment, as exposed by the `models` registry.

    RETURNS (list): The string names of the models.
    """
    installed = registry.models.get_all()
    return list(installed.keys())
def get_package_version(name):
    """Look up the version of an installed package. Typically used for
    model package versions.

    name (str): The name of the installed Python package.
    RETURNS (str / None): The version or None if package not installed.
    """
    try:
        found = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        # No distribution with that name: report "not installed" as None
        found = None
    return found
def is_compatible_version(version, constraint, prereleases=True):
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string. An empty string is passed to
        SpecifierSet unchanged instead of raising an IndexError.
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint. The
    # truthiness check guards the index lookup against an empty string.
    if constraint and constraint[0].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    # Set explicitly so the caller's choice is used for the membership test
    spec.prereleases = prereleases
    return version in spec
def get_model_version_range(spacy_version):
    """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
    version. Models are always compatible across patch versions but not
    across minor or major versions.

    spacy_version (str): The spaCy version, e.g. "1.2.3".
    RETURNS (str): The version range, e.g. ">=1.2.3,<1.3.0".
    """
    release = Version(spacy_version).release
    major = release[0]
    # A version like "3" parses to a one-element release tuple, so fall back
    # to minor version 0 instead of failing on the missing index.
    minor = release[1] if len(release) > 1 else 0
    return f">={spacy_version},<{major}.{minor + 1}.0"
def get_base_version(version):
    """Reduce a version string to its base version, i.e. the version without
    any prerelease identifiers.

    version (str): The version, e.g. "3.0.0.dev1".
    RETURNS (str): The base version, e.g. "3.0.0".
    """
    parsed = Version(version)
    return parsed.base_version
def load_config(path, create_objects=False): def load_config(path, create_objects=False):
"""Load a Thinc-formatted config file, optionally filling in objects where """Load a Thinc-formatted config file, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details. the config references registry entries. See "Thinc config files" for details.
path (unicode or Path): Path to the config file path (str / Path): Path to the config file
create_objects (bool): Whether to automatically create objects when the config create_objects (bool): Whether to automatically create objects when the config
references registry entries. Defaults to False. references registry entries. Defaults to False.
@ -227,7 +303,7 @@ def load_config_from_str(string, create_objects=False):
"""Load a Thinc-formatted config, optionally filling in objects where """Load a Thinc-formatted config, optionally filling in objects where
the config references registry entries. See "Thinc config files" for details. the config references registry entries. See "Thinc config files" for details.
string (unicode or Path): Text contents of the config file. string (str / Path): Text contents of the config file.
create_objects (bool): Whether to automatically create objects when the config create_objects (bool): Whether to automatically create objects when the config
references registry entries. Defaults to False. references registry entries. Defaults to False.
@ -243,7 +319,7 @@ def load_config_from_str(string, create_objects=False):
def get_model_meta(path): def get_model_meta(path):
"""Get model meta.json from a directory path and validate its contents. """Get model meta.json from a directory path and validate its contents.
path (unicode or Path): Path to model directory. path (str / Path): Path to model directory.
RETURNS (dict): The model's meta data. RETURNS (dict): The model's meta data.
""" """
model_path = ensure_path(path) model_path = ensure_path(path)
@ -256,13 +332,23 @@ def get_model_meta(path):
for setting in ["lang", "name", "version"]: for setting in ["lang", "name", "version"]:
if setting not in meta or not meta[setting]: if setting not in meta or not meta[setting]:
raise ValueError(Errors.E054.format(setting=setting)) raise ValueError(Errors.E054.format(setting=setting))
if "spacy_version" in meta:
if not is_compatible_version(about.__version__, meta["spacy_version"]):
warnings.warn(
Warnings.W095.format(
model=f"{meta['lang']}_{meta['name']}",
model_version=meta["version"],
version=meta["spacy_version"],
current=about.__version__,
)
)
return meta return meta
def get_model_config(path): def get_model_config(path):
"""Get the model's config from a directory path. """Get the model's config from a directory path.
path (unicode or Path): Path to model directory. path (str / Path): Path to model directory.
RETURNS (Config): The model's config data. RETURNS (Config): The model's config data.
""" """
model_path = ensure_path(path) model_path = ensure_path(path)
@ -279,23 +365,20 @@ def get_model_config(path):
def is_package(name): def is_package(name):
"""Check if string maps to a package installed via pip. """Check if string maps to a package installed via pip.
name (unicode): Name of package. name (str): Name of package.
RETURNS (bool): True if installed package, False if not. RETURNS (bool): True if installed package, False if not.
""" """
import pkg_resources try:
importlib_metadata.distribution(name)
name = name.lower() # compare package name against lowercase name
packages = pkg_resources.working_set.by_key.keys()
for package in packages:
if package.lower().replace("-", "_") == name:
return True return True
except: # noqa: E722
return False return False
def get_package_path(name): def get_package_path(name):
"""Get the path to an installed package. """Get the path to an installed package.
name (unicode): Package name. name (str): Package name.
RETURNS (Path): Path to installed package. RETURNS (Path): Path to installed package.
""" """
name = name.lower() # use lowercase version to be safe name = name.lower() # use lowercase version to be safe
@ -470,8 +553,8 @@ def expand_exc(excs, search, replace):
For example, to add additional versions with typographic apostrophes. For example, to add additional versions with typographic apostrophes.
excs (dict): Tokenizer exceptions. excs (dict): Tokenizer exceptions.
search (unicode): String to find and replace. search (str): String to find and replace.
replace (unicode): Replacement. replace (str): Replacement.
RETURNS (dict): Combined tokenizer exceptions. RETURNS (dict): Combined tokenizer exceptions.
""" """
@ -575,41 +658,73 @@ def decaying(start, stop, decay):
curr -= decay curr -= decay
def minibatch_by_words(examples, size, tuples=True, count_words=len, tolerance=0.2): def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
"""Create minibatches of roughly a given number of words. If any examples """Create minibatches of roughly a given number of words. If any examples
are longer than the specified batch length, they will appear in a batch by are longer than the specified batch length, they will appear in a batch by
themselves.""" themselves, or be discarded if discard_oversize=True."""
if isinstance(size, int): if isinstance(size, int):
size_ = itertools.repeat(size) size_ = itertools.repeat(size)
elif isinstance(size, List): elif isinstance(size, List):
size_ = iter(size) size_ = iter(size)
else: else:
size_ = size size_ = size
examples = iter(examples)
oversize = [] target_size = next(size_)
while True: tol_size = target_size * tolerance
batch_size = next(size_)
tol_size = batch_size * 0.2
batch = [] batch = []
if oversize: overflow = []
example = oversize.pop(0) batch_size = 0
overflow_size = 0
for example in examples:
n_words = count_words(example.doc) n_words = count_words(example.doc)
# if the current example exceeds the maximum batch size, it is returned separately
# but only if discard_oversize=False.
if n_words > target_size + tol_size:
if not discard_oversize:
yield [example]
# add the example to the current batch if there's no overflow yet and it still fits
elif overflow_size == 0 and (batch_size + n_words) <= target_size:
batch.append(example) batch.append(example)
batch_size -= n_words batch_size += n_words
while batch_size >= 1:
try: # add the example to the overflow buffer if it fits in the tolerance margin
example = next(examples) elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
except StopIteration: overflow.append(example)
if batch: overflow_size += n_words
yield batch
return # yield the previous batch and start a new one. The new one gets the overflow examples.
n_words = count_words(example.doc)
if n_words < (batch_size + tol_size):
batch_size -= n_words
batch.append(example)
else: else:
oversize.append(example) yield batch
target_size = next(size_)
tol_size = target_size * tolerance
batch = overflow
batch_size = overflow_size
overflow = []
overflow_size = 0
# this example still fits
if (batch_size + n_words) <= target_size:
batch.append(example)
batch_size += n_words
# this example fits in overflow
elif (batch_size + n_words) <= (target_size + tol_size):
overflow.append(example)
overflow_size += n_words
# this example does not fit with the previous overflow: start another new batch
else:
yield batch
target_size = next(size_)
tol_size = target_size * tolerance
batch = [example]
batch_size = n_words
# yield the final batch
if batch: if batch:
batch.extend(overflow)
yield batch yield batch
@ -705,8 +820,8 @@ def from_disk(path, readers, exclude):
def import_file(name, loc): def import_file(name, loc):
"""Import module from a file. Used to load models from a directory. """Import module from a file. Used to load models from a directory.
name (unicode): Name of module to load. name (str): Name of module to load.
loc (unicode / Path): Path to the file. loc (str / Path): Path to the file.
RETURNS: The loaded module. RETURNS: The loaded module.
""" """
loc = str(loc) loc = str(loc)
@ -721,8 +836,8 @@ def minify_html(html):
Disclaimer: NOT a general-purpose solution, only removes indentation and Disclaimer: NOT a general-purpose solution, only removes indentation and
newlines. newlines.
html (unicode): Markup to minify. html (str): Markup to minify.
RETURNS (unicode): "Minified" HTML. RETURNS (str): "Minified" HTML.
""" """
return html.strip().replace(" ", "").replace("\n", "") return html.strip().replace(" ", "").replace("\n", "")
@ -731,8 +846,8 @@ def escape_html(text):
"""Replace <, >, &, " with their HTML encoded representation. Intended to """Replace <, >, &, " with their HTML encoded representation. Intended to
prevent HTML errors in rendered displaCy markup. prevent HTML errors in rendered displaCy markup.
text (unicode): The original text. text (str): The original text.
RETURNS (unicode): Equivalent text to be safely used within HTML. RETURNS (str): Equivalent text to be safely used within HTML.
""" """
text = text.replace("&", "&amp;") text = text.replace("&", "&amp;")
text = text.replace("<", "&lt;") text = text.replace("<", "&lt;")

View File

@ -57,7 +57,7 @@ cdef class Vectors:
shape (tuple): Size of the table, as (# entries, # columns) shape (tuple): Size of the table, as (# entries, # columns)
data (numpy.ndarray): The vector data. data (numpy.ndarray): The vector data.
keys (iterable): A sequence of keys, aligned with the data. keys (iterable): A sequence of keys, aligned with the data.
name (unicode): A name to identify the vectors table. name (str): A name to identify the vectors table.
RETURNS (Vectors): The newly created object. RETURNS (Vectors): The newly created object.
DOCS: https://spacy.io/api/vectors#init DOCS: https://spacy.io/api/vectors#init
@ -244,7 +244,7 @@ cdef class Vectors:
def find(self, *, key=None, keys=None, row=None, rows=None): def find(self, *, key=None, keys=None, row=None, rows=None):
"""Look up one or more keys by row, or vice versa. """Look up one or more keys by row, or vice versa.
key (unicode / int): Find the row that the given key points to. key (str / int): Find the row that the given key points to.
Returns int, -1 if missing. Returns int, -1 if missing.
keys (iterable): Find rows that the keys point to. keys (iterable): Find rows that the keys point to.
Returns ndarray. Returns ndarray.
@ -366,7 +366,7 @@ cdef class Vectors:
def to_disk(self, path, **kwargs): def to_disk(self, path, **kwargs):
"""Save the current state to a directory. """Save the current state to a directory.
path (unicode / Path): A path to a directory, which will be created if path (str / Path): A path to a directory, which will be created if
it doesn't exist. it doesn't exist.
DOCS: https://spacy.io/api/vectors#to_disk DOCS: https://spacy.io/api/vectors#to_disk
@ -386,7 +386,7 @@ cdef class Vectors:
"""Loads state from a directory. Modifies the object in place and """Loads state from a directory. Modifies the object in place and
returns it. returns it.
path (unicode / Path): Directory path, string or Path-like object. path (str / Path): Directory path, string or Path-like object.
RETURNS (Vectors): The modified object. RETURNS (Vectors): The modified object.
DOCS: https://spacy.io/api/vectors#from_disk DOCS: https://spacy.io/api/vectors#from_disk

View File

@ -505,8 +505,8 @@ tokenization can be provided.
> ``` > ```
| Key | Type | Description | | Key | Type | Description |
| -------- | ------- | ---------------------------------------------------------- | | -------- | ---- | ---------------------------------------------------------- |
| `text` | unicode | The raw input text. Is not required if `tokens` available. | | `text` | str | The raw input text. Is not required if `tokens` available. |
| `tokens` | list | Optional tokenization, one string per token. | | `tokens` | list | Optional tokenization, one string per token. |
```json ```json

View File

@ -170,7 +170,7 @@ vocabulary.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- | | ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. | | `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
| `string` | unicode | The string of the word to look up. | | `string` | str | The string of the word to look up. |
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. | | **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. |
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"} ### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}

View File

@ -230,8 +230,8 @@ Add a new label to the pipe.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------- | ------- | ----------------- | | ------- | ---- | ----------------- |
| `label` | unicode | The label to add. | | `label` | str | The label to add. |
## DependencyParser.to_disk {#to_disk tag="method"} ## DependencyParser.to_disk {#to_disk tag="method"}
@ -245,8 +245,8 @@ Serialize the pipe to disk.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## DependencyParser.from_disk {#from_disk tag="method"} ## DependencyParser.from_disk {#from_disk tag="method"}
@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- | | ----------- | ------------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. | | **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |

View File

@ -123,7 +123,7 @@ details, see the documentation on
| Name | Type | Description | | Name | Type | Description |
| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- | | --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. | | `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. | | `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. | | `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@ -146,8 +146,8 @@ Look up a previously registered extension by name. Returns a 4-tuple
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------- |
| `name` | unicode | Name of the extension. | | `name` | str | Name of the extension. |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Doc.has_extension {#has_extension tag="classmethod" new="2"} ## Doc.has_extension {#has_extension tag="classmethod" new="2"}
@ -163,8 +163,8 @@ Check whether an extension has been registered on the `Doc` class.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------ | | ----------- | ---- | ------------------------------------------ |
| `name` | unicode | Name of the extension to check. | | `name` | str | Name of the extension to check. |
| **RETURNS** | bool | Whether the extension has been registered. | | **RETURNS** | bool | Whether the extension has been registered. |
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} ## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@ -181,8 +181,8 @@ Remove a previously registered extension.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | --------------------------------------------------------------------- | | ----------- | ----- | --------------------------------------------------------------------- |
| `name` | unicode | Name of the extension. | | `name` | str | Name of the extension. |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Doc.char_span {#char_span tag="method" new="2"} ## Doc.char_span {#char_span tag="method" new="2"}
@ -369,8 +369,8 @@ Save the current state to a directory.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Doc.from_disk {#from_disk tag="method" new="2"} ## Doc.from_disk {#from_disk tag="method" new="2"}
@ -386,8 +386,8 @@ Loads state from a directory. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Doc` | The modified `Doc` object. | | **RETURNS** | `Doc` | The modified `Doc` object. |
@ -648,15 +648,15 @@ The L2 norm of the document's vector representation.
| Name | Type | Description | | Name | Type | Description |
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `text` | unicode | A unicode representation of the document text. | | `text` | str | A unicode representation of the document text. |
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. | | `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. | | `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
| `vocab` | `Vocab` | The store of lexical types. | | `vocab` | `Vocab` | The store of lexical types. |
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. | | `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. | | `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
| `user_data` | - | A generic storage area, for user custom data. | | `user_data` | - | A generic storage area, for user custom data. |
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. | | `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. | | `lang_` <Tag variant="new">2.1</Tag> | str | Language of the document's vocabulary. |
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. | | `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. | | `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. | | `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |

View File

@ -259,8 +259,8 @@ Serialize the pipe to disk.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityLinker.from_disk {#from_disk tag="method"} ## EntityLinker.from_disk {#from_disk tag="method"}
@ -275,8 +275,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | -------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. | | **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |

View File

@ -231,8 +231,8 @@ Add a new label to the pipe.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------- | ------- | ----------------- | | ------- | ---- | ----------------- |
| `label` | unicode | The label to add. | | `label` | str | The label to add. |
## EntityRecognizer.to_disk {#to_disk tag="method"} ## EntityRecognizer.to_disk {#to_disk tag="method"}
@ -246,8 +246,8 @@ Serialize the pipe to disk.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## EntityRecognizer.from_disk {#from_disk tag="method"} ## EntityRecognizer.from_disk {#from_disk tag="method"}
@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | -------------------------------------------------------------------------- | | ----------- | ------------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. | | **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |

View File

@ -73,8 +73,8 @@ Whether a label is present in the patterns.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------------- | | ----------- | ---- | -------------------------------------------- |
| `label` | unicode | The label to check. | | `label` | str | The label to check. |
| **RETURNS** | bool | Whether the entity ruler contains the label. | | **RETURNS** | bool | Whether the entity ruler contains the label. |
## EntityRuler.\_\_call\_\_ {#call tag="method"} ## EntityRuler.\_\_call\_\_ {#call tag="method"}
@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
happens automatically after the component has been added to the pipeline using happens automatically after the component has been added to the pipeline using
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized [`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
with `overwrite_ents=True`, existing entities will be replaced if they overlap with `overwrite_ents=True`, existing entities will be replaced if they overlap
with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer with the matches. When matches overlap in a Doc, the entity ruler prioritizes
patterns over shorter, and if equal the match occurring first in the Doc is chosen. longer patterns over shorter, and if equal the match occurring first in the Doc
is chosen.
> #### Example > #### Example
> >
@ -140,8 +141,8 @@ only the patterns are saved as JSONL. If a directory name is provided, a
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | | ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## EntityRuler.from_disk {#from_disk tag="method"} ## EntityRuler.from_disk {#from_disk tag="method"}
@ -159,8 +160,8 @@ configuration.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | ---------------------------------------------------------------------------------------- | | ----------- | ------------- | ---------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. | | **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
## EntityRuler.to_bytes {#to_bytes tag="method"} ## EntityRuler.to_bytes {#to_bytes tag="method"}

View File

@ -18,7 +18,7 @@ Create a `GoldCorpus`. If the input data is an iterable, each item should be a
for further details. for further details.
| Name | Type | Description | | Name | Type | Description |
| ----------- | --------------------------- | ------------------------------------------------------------ | | ----------- | ----------------------- | ------------------------------------------------------------ |
| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. | | `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. | | `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
| **RETURNS** | `GoldCorpus` | The newly constructed object. | | **RETURNS** | `GoldCorpus` | The newly constructed object. |

View File

@ -60,7 +60,8 @@ Whether the provided syntactic annotations form a projective dependency tree.
Convert a list of Doc objects into the Convert a list of Doc objects into the
[JSON-serializable format](/api/annotation#json-input) used by the [JSON-serializable format](/api/annotation#json-input) used by the
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc. [`spacy train`](/api/cli#train) command. Each input doc will be treated as a
'paragraph' in the output doc.
> #### Example > #### Example
> >
@ -158,7 +159,7 @@ single-token entity.
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. | | `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. |
| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. | | `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. | | **RETURNS** | list | Strings describing the [BILUO](/api/annotation#biluo) tags. |
### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"} ### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}

View File

@ -1,16 +1,19 @@
--- ---
title: KnowledgeBase title: KnowledgeBase
teaser: A storage class for entities and aliases of a specific knowledge base (ontology) teaser:
A storage class for entities and aliases of a specific knowledge base
(ontology)
tag: class tag: class
source: spacy/kb.pyx source: spacy/kb.pyx
new: 2.2 new: 2.2
--- ---
The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init) The `KnowledgeBase` object provides a method to generate
objects, which are plausible external identifiers given a certain textual mention. [`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
Each such `Candidate` holds information from the relevant KB entities, identifiers given a certain textual mention. Each such `Candidate` holds
such as its frequency in text and possible aliases. information from the relevant KB entities, such as its frequency in text and
Each entity in the knowledge base also has a pretrained entity vector of a fixed size. possible aliases. Each entity in the knowledge base also has a pretrained entity
vector of a fixed size.
## KnowledgeBase.\_\_init\_\_ {#init tag="method"} ## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
@ -25,24 +28,24 @@ Create the knowledge base.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------------------- | ---------------- | ----------------------------------------- | | ---------------------- | --------------- | ---------------------------------------- |
| `vocab` | `Vocab` | A `Vocab` object. | | `vocab` | `Vocab` | A `Vocab` object. |
| `entity_vector_length` | int | Length of the fixed-size entity vectors. | | `entity_vector_length` | int | Length of the fixed-size entity vectors. |
| **RETURNS** | `KnowledgeBase` | The newly constructed object. | | **RETURNS** | `KnowledgeBase` | The newly constructed object. |
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"} ## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
The length of the fixed-size entity vectors in the knowledge base. The length of the fixed-size entity vectors in the knowledge base.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---- | ----------------------------------------- | | ----------- | ---- | ---------------------------------------- |
| **RETURNS** | int | Length of the fixed-size entity vectors. | | **RETURNS** | int | Length of the fixed-size entity vectors. |
## KnowledgeBase.add_entity {#add_entity tag="method"} ## KnowledgeBase.add_entity {#add_entity tag="method"}
Add an entity to the knowledge base, specifying its corpus frequency Add an entity to the knowledge base, specifying its corpus frequency and entity
and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length). vector, which should be of length
[`entity_vector_length`](/api/kb#entity_vector_length).
> #### Example > #### Example
> >
@ -52,15 +55,15 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------------- | ------------- | ------------------------------------------------- | | --------------- | ------ | ----------------------------------------------- |
| `entity` | unicode | The unique entity identifier | | `entity` | str | The unique entity identifier |
| `freq` | float | The frequency of the entity in a typical corpus | | `freq` | float | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pretrained vector of the entity | | `entity_vector` | vector | The pretrained vector of the entity |
## KnowledgeBase.set_entities {#set_entities tag="method"} ## KnowledgeBase.set_entities {#set_entities tag="method"}
Define the full list of entities in the knowledge base, specifying the corpus frequency Define the full list of entities in the knowledge base, specifying the corpus
and entity vector for each entity. frequency and entity vector for each entity.
> #### Example > #### Example
> >
@ -69,17 +72,18 @@ and entity vector for each entity.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------- | ------------- | ------------------------------------------------- | | ------------- | -------- | --------------------------------- |
| `entity_list` | iterable | List of unique entity identifiers | | `entity_list` | iterable | List of unique entity identifiers |
| `freq_list` | iterable | List of entity frequencies | | `freq_list` | iterable | List of entity frequencies |
| `vector_list` | iterable | List of entity vectors | | `vector_list` | iterable | List of entity vectors |
## KnowledgeBase.add_alias {#add_alias tag="method"} ## KnowledgeBase.add_alias {#add_alias tag="method"}
Add an alias or mention to the knowledge base, specifying its potential KB identifiers Add an alias or mention to the knowledge base, specifying its potential KB
and their prior probabilities. The entity identifiers should refer to entities previously identifiers and their prior probabilities. The entity identifiers should refer
added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities). to entities previously added with [`add_entity`](/api/kb#add_entity) or
The sum of the prior probabilities should not exceed 1. [`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
should not exceed 1.
> #### Example > #### Example
> >
@ -88,10 +92,10 @@ The sum of the prior probabilities should not exceed 1.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------------- | ------------- | -------------------------------------------------- | | --------------- | -------- | -------------------------------------------------- |
| `alias` | unicode | The textual mention or alias | | `alias` | str | The textual mention or alias |
| `entities` | iterable | The potential entities that the alias may refer to | | `entities` | iterable | The potential entities that the alias may refer to |
| `probabilities`| iterable | The prior probabilities of each entity | | `probabilities` | iterable | The prior probabilities of each entity |
## KnowledgeBase.\_\_len\_\_ {#len tag="method"} ## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
@ -118,7 +122,7 @@ Get a list of all entity IDs in the knowledge base.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---- | --------------------------------------------- | | ----------- | ---- | ------------------------------------------- |
| **RETURNS** | list | The list of entities in the knowledge base. | | **RETURNS** | list | The list of entities in the knowledge base. |
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"} ## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
@ -132,7 +136,7 @@ Get the total number of aliases in the knowledge base.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---- | --------------------------------------------- | | ----------- | ---- | -------------------------------------------- |
| **RETURNS** | int | The number of aliases in the knowledge base. | | **RETURNS** | int | The number of aliases in the knowledge base. |
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"} ## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
@ -146,7 +150,7 @@ Get a list of all aliases in the knowledge base.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---- | --------------------------------------------- | | ----------- | ---- | ------------------------------------------ |
| **RETURNS** | list | The list of aliases in the knowledge base. | | **RETURNS** | list | The list of aliases in the knowledge base. |
## KnowledgeBase.get_candidates {#get_candidates tag="method"} ## KnowledgeBase.get_candidates {#get_candidates tag="method"}
@ -161,8 +165,8 @@ of type [`Candidate`](/api/kb/#candidate_init).
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------- | ------------- | -------------------------------------------------- | | ----------- | -------- | ---------------------------------------- |
| `alias` | unicode | The textual mention or alias | | `alias` | str | The textual mention or alias |
| **RETURNS** | iterable | The list of relevant `Candidate` objects | | **RETURNS** | iterable | The list of relevant `Candidate` objects |
## KnowledgeBase.get_vector {#get_vector tag="method"} ## KnowledgeBase.get_vector {#get_vector tag="method"}
@ -176,14 +180,14 @@ Given a certain entity ID, retrieve its pretrained entity vector.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------- | ------------- | -------------------------------------------------- | | ----------- | ------ | ----------------- |
| `entity` | unicode | The entity ID | | `entity` | str | The entity ID |
| **RETURNS** | vector | The entity vector | | **RETURNS** | vector | The entity vector |
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"} ## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
Given a certain entity ID and a certain textual mention, retrieve Given a certain entity ID and a certain textual mention, retrieve the prior
the prior probability of the fact that the mention links to the entity ID. probability of the fact that the mention links to the entity ID.
> #### Example > #### Example
> >
@ -192,9 +196,9 @@ the prior probability of the fact that the mention links to the entity ID.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------- | ------------- | --------------------------------------------------------------- | | ----------- | ----- | -------------------------------------------------------------- |
| `entity` | unicode | The entity ID | | `entity` | str | The entity ID |
| `alias` | unicode | The textual mention or alias | | `alias` | str | The textual mention or alias |
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` | | **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
## KnowledgeBase.dump {#dump tag="method"} ## KnowledgeBase.dump {#dump tag="method"}
@ -208,13 +212,13 @@ Save the current state of the knowledge base to a directory.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ | | ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## KnowledgeBase.load_bulk {#load_bulk tag="method"} ## KnowledgeBase.load_bulk {#load_bulk tag="method"}
Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab) Restore the state of the knowledge base from a given directory. Note that the
should also be the same as the one used to create the KB. [`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
> #### Example > #### Example
> >
@ -226,18 +230,16 @@ should also be the same as the one used to create the KB.
> kb.load_bulk("/path/to/kb") > kb.load_bulk("/path/to/kb")
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | | ----------- | --------------- | -------------------------------------------------------------------------- |
| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. | | **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
## Candidate.\_\_init\_\_ {#candidate_init tag="method"} ## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
Construct a `Candidate` object. Usually this constructor is not called directly, Construct a `Candidate` object. Usually this constructor is not called directly,
but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method but instead these objects are returned by the
of a `KnowledgeBase`. [`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`.
> #### Example > #### Example
> >
@ -258,11 +260,11 @@ of a `KnowledgeBase`.
## Candidate attributes {#candidate_attributes} ## Candidate attributes {#candidate_attributes}
| Name | Type | Description | | Name | Type | Description |
| ---------------------- | ------------ | ------------------------------------------------------------------ | | --------------- | ------ | -------------------------------------------------------------- |
| `entity` | int | The entity's unique KB identifier | | `entity` | int | The entity's unique KB identifier |
| `entity_` | unicode | The entity's unique KB identifier | | `entity_` | str | The entity's unique KB identifier |
| `alias` | int | The alias or textual mention | | `alias` | int | The alias or textual mention |
| `alias_` | unicode | The alias or textual mention | | `alias_` | str | The alias or textual mention |
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` | | `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
| `entity_freq` | long | The frequency of the entity in a typical corpus | | `entity_freq` | long | The frequency of the entity in a typical corpus |
| `entity_vector` | vector | The pretrained vector of the entity | | `entity_vector` | vector | The pretrained vector of the entity |

View File

@ -50,8 +50,8 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | --------------------------------------------------------------------------------- | | ----------- | ----- | --------------------------------------------------------------------------------- |
| `text` | unicode | The text to be processed. | | `text` | str | The text to be processed. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| **RETURNS** | `Doc` | A container for accessing the annotations. | | **RETURNS** | `Doc` | A container for accessing the annotations. |
@ -201,7 +201,7 @@ Create a pipeline component from a factory.
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | ---------------------------------------------------------------------------------- | | ----------- | -------- | ---------------------------------------------------------------------------------- |
| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). | | `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
| `config` | dict | Configuration parameters to initialize component. | | `config` | dict | Configuration parameters to initialize component. |
| **RETURNS** | callable | The pipeline component. | | **RETURNS** | callable | The pipeline component. |
@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`,
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `component` | callable | The pipeline component. | | `component` | callable | The pipeline component. |
| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. | | `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
| `before` | unicode | Component name to insert component directly before. | | `before` | str | Component name to insert component directly before. |
| `after` | unicode | Component name to insert component directly after. | | `after` | str | Component name to insert component directly after. |
| `first` | bool | Insert component first / not first in the pipeline. | | `first` | bool | Insert component first / not first in the pipeline. |
| `last` | bool | Insert component last / not last in the pipeline. | | `last` | bool | Insert component last / not last in the pipeline. |
@ -244,8 +244,8 @@ Check whether a component is present in the pipeline. Equivalent to
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------------------------- | | ----------- | ---- | -------------------------------------------------------- |
| `name` | unicode | Name of the pipeline component to check. | | `name` | str | Name of the pipeline component to check. |
| **RETURNS** | bool | Whether a component of that name exists in the pipeline. | | **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
## Language.get_pipe {#get_pipe tag="method" new="2"} ## Language.get_pipe {#get_pipe tag="method" new="2"}
@ -261,7 +261,7 @@ Get a pipeline component for a given component name.
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | -------------------------------------- | | ----------- | -------- | -------------------------------------- |
| `name` | unicode | Name of the pipeline component to get. | | `name` | str | Name of the pipeline component to get. |
| **RETURNS** | callable | The pipeline component. | | **RETURNS** | callable | The pipeline component. |
## Language.replace_pipe {#replace_pipe tag="method" new="2"} ## Language.replace_pipe {#replace_pipe tag="method" new="2"}
@ -276,7 +276,7 @@ Replace a component in the pipeline.
| Name | Type | Description | | Name | Type | Description |
| ----------- | -------- | --------------------------------- | | ----------- | -------- | --------------------------------- |
| `name` | unicode | Name of the component to replace. | | `name` | str | Name of the component to replace. |
| `component` | callable | The pipeline component to insert. | | `component` | callable | The pipeline component to insert. |
## Language.rename_pipe {#rename_pipe tag="method" new="2"} ## Language.rename_pipe {#rename_pipe tag="method" new="2"}
@ -293,9 +293,9 @@ added to the pipeline, you can also use the `name` argument on
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------- | ------- | -------------------------------- | | ---------- | ---- | -------------------------------- |
| `old_name` | unicode | Name of the component to rename. | | `old_name` | str | Name of the component to rename. |
| `new_name` | unicode | New name of the component. | | `new_name` | str | New name of the component. |
## Language.remove_pipe {#remove_pipe tag="method" new="2"} ## Language.remove_pipe {#remove_pipe tag="method" new="2"}
@ -310,8 +310,8 @@ component function.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------- | | ----------- | ----- | ----------------------------------------------------- |
| `name` | unicode | Name of the component to remove. | | `name` | str | Name of the component to remove. |
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. | | **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"} ## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled.
| Name | Type | Description | | Name | Type | Description |
| ----------- | --------------- | ------------------------------------------------------------------------------------ | | ----------- | --------------- | ------------------------------------------------------------------------------------ |
| `disable` | list | Names of pipeline components to disable. | | `disable` | list | Names of pipeline components to disable. |
| `disable` | unicode | Name of pipeline component to disable. | | `disable` | str | Name of pipeline component to disable. |
| `enable` | list | Names of pipeline components that will not be disabled. | | `enable` | list | Names of pipeline components that will not be disabled. |
| `enable` | unicode | Name of pipeline component that will not be disabled. | | `enable` | str | Name of pipeline component that will not be disabled. |
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. | | **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
<Infobox title="Changed in v3.0" variant="warning"> <Infobox title="Changed in v3.0" variant="warning">
As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`: As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
@ -371,8 +370,8 @@ the model**.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
## Language.from_disk {#from_disk tag="method" new="2"} ## Language.from_disk {#from_disk tag="method" new="2"}
@ -396,8 +395,8 @@ loaded object.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- | | ----------- | ------------ | ----------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Language` | The modified `Language` object. | | **RETURNS** | `Language` | The modified `Language` object. |
@ -481,9 +480,9 @@ per component.
## Class attributes {#class-attributes} ## Class attributes {#class-attributes}
| Name | Type | Description | | Name | Type | Description |
| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- | | -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- |
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. | | `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). | | `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. | | `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -63,8 +63,8 @@ Lemmatize a string.
| Name | Type | Description | | Name | Type | Description |
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- | | ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
| `string` | unicode | The string to lemmatize, e.g. the token text. | | `string` | str | The string to lemmatize, e.g. the token text. |
| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | | `univ_pos` | str / int | The token's universal part-of-speech tag. |
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. | | `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
| **RETURNS** | list | The available lemmas for the string. | | **RETURNS** | list | The available lemmas for the string. |
@ -83,10 +83,10 @@ original string is returned. Languages can provide a
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- | | ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
| `string` | unicode | The string to look up. | | `string` | str | The string to look up. |
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. | | `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. | | **RETURNS** | str | The lemma if the string was found, otherwise the original string. |
## Lemmatizer.is_base_form {#is_base_form tag="method"} ## Lemmatizer.is_base_form {#is_base_form tag="method"}
@ -103,8 +103,8 @@ lemmatization entirely.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------ | ------------- | --------------------------------------------------------------------------------------- | | ------------ | --------- | --------------------------------------------------------------------------------------- |
| `univ_pos` | unicode / int | The token's universal part-of-speech tag. | | `univ_pos` | str / int | The token's universal part-of-speech tag. |
| `morphology` | dict | The token's morphological features. | | `morphology` | dict | The token's morphological features. |
| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. | | **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |

View File

@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
| Name | Type | Description | | Name | Type | Description |
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `vocab` | `Vocab` | The lexeme's vocabulary. | | `vocab` | `Vocab` | The lexeme's vocabulary. |
| `text` | unicode | Verbatim text content. | | `text` | str | Verbatim text content. |
| `orth` | int | ID of the verbatim text content. | | `orth` | int | ID of the verbatim text content. |
| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. | | `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. | | `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
| `flags` | int | Container of the lexeme's binary flags. | | `flags` | int | Container of the lexeme's binary flags. |
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. | | `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
| `norm_` | unicode | The lexeme's norm, i.e. a normalized form of the lexeme text. | | `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. |
| `lower` | int | Lowercase form of the word. | | `lower` | int | Lowercase form of the word. |
| `lower_` | unicode | Lowercase form of the word. | | `lower_` | str | Lowercase form of the word. |
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. | | `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. | | `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. | | `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. | | `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
| `suffix_` | unicode | Length-N substring from the end of the word. Defaults to `N=3`. | | `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. |
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. | | `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. | | `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. |
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. | | `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
| `is_oov` | bool | Is the lexeme out-of-vocabulary? | | `is_oov` | bool | Is the lexeme out-of-vocabulary? |
| `is_stop` | bool | Is the lexeme part of a "stop list"? | | `is_stop` | bool | Is the lexeme part of a "stop list"? |
| `lang` | int | Language of the parent vocabulary. | | `lang` | int | Language of the parent vocabulary. |
| `lang_` | unicode | Language of the parent vocabulary. | | `lang_` | str | Language of the parent vocabulary. |
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). | | `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
| `cluster` | int | Brown cluster ID. | | `cluster` | int | Brown cluster ID. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |

View File

@ -57,8 +57,8 @@ Check if the lookups contain a table of a given name. Delegates to
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------- | | ----------- | ---- | ----------------------------------------------- |
| `name` | unicode | Name of the table. | | `name` | str | Name of the table. |
| **RETURNS** | bool | Whether a table of that name is in the lookups. | | **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.tables {#tables tag="property"} ## Lookups.tables {#tables tag="property"}
@ -91,7 +91,7 @@ exists.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------------- | | ----------- | ----------------------------- | ---------------------------------- |
| `name` | unicode | Unique name of the table. | | `name` | str | Unique name of the table. |
| `data` | dict | Optional data to add to the table. | | `data` | dict | Optional data to add to the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. | | **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------------------- | ------------------ | | ----------- | ----------------------------- | ------------------ |
| `name` | unicode | Name of the table. | | `name` | str | Name of the table. |
| **RETURNS** | [`Table`](/api/lookups#table) | The table. | | **RETURNS** | [`Table`](/api/lookups#table) | The table. |
## Lookups.remove_table {#remove_table tag="method"} ## Lookups.remove_table {#remove_table tag="method"}
@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------------------- | ---------------------------- | | ----------- | ----------------------------- | ---------------------------- |
| `name` | unicode | Name of the table to remove. | | `name` | str | Name of the table to remove. |
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. | | **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
## Lookups.has_table {#has_table tag="method"} ## Lookups.has_table {#has_table tag="method"}
@ -145,8 +145,8 @@ Check if the lookups contain a table of a given name. Equivalent to
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------- | | ----------- | ---- | ----------------------------------------------- |
| `name` | unicode | Name of the table. | | `name` | str | Name of the table. |
| **RETURNS** | bool | Whether a table of that name is in the lookups. | | **RETURNS** | bool | Whether a table of that name is in the lookups. |
## Lookups.to_bytes {#to_bytes tag="method"} ## Lookups.to_bytes {#to_bytes tag="method"}
@ -192,8 +192,8 @@ which will be created if it doesn't exist.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Lookups.from_disk {#from_disk tag="method"} ## Lookups.from_disk {#from_disk tag="method"}
@ -209,8 +209,8 @@ the file doesn't exist.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `Lookups` | The loaded lookups. | | **RETURNS** | `Lookups` | The loaded lookups. |
## Table {#table tag="class, ordereddict"} ## Table {#table tag="class, ordereddict"}
@ -238,7 +238,7 @@ Initialize a new table.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ---------------------------------- | | ----------- | ------- | ---------------------------------- |
| `name` | unicode | Optional table name for reference. | | `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. | | **RETURNS** | `Table` | The newly constructed object. |
### Table.from_dict {#table.from_dict tag="classmethod"} ### Table.from_dict {#table.from_dict tag="classmethod"}
@ -256,7 +256,7 @@ Initialize a new table from a dict.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ---------------------------------- | | ----------- | ------- | ---------------------------------- |
| `data` | dict | The dictionary. | | `data` | dict | The dictionary. |
| `name` | unicode | Optional table name for reference. | | `name` | str | Optional table name for reference. |
| **RETURNS** | `Table` | The newly constructed object. | | **RETURNS** | `Table` | The newly constructed object. |
### Table.set {#table.set tag="method"} ### Table.set {#table.set tag="method"}
@ -274,8 +274,8 @@ Set a new key / value pair. String keys will be hashed. Same as
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------- | ------------- | ----------- | | ------- | --------- | ----------- |
| `key` | unicode / int | The key. | | `key` | str / int | The key. |
| `value` | - | The value. | | `value` | - | The value. |
### Table.to_bytes {#table.to_bytes tag="method"} ### Table.to_bytes {#table.to_bytes tag="method"}
@ -313,6 +313,6 @@ Load a table from a bytestring.
| Name | Type | Description | | Name | Type | Description |
| -------------- | --------------------------- | ----------------------------------------------------- | | -------------- | --------------------------- | ----------------------------------------------------- |
| `name` | unicode | Table name. | | `name` | str | Table name. |
| `default_size` | int | Default size of bloom filters if no data is provided. | | `default_size` | int | Default size of bloom filters if no data is provided. |
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. | | `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |

View File

@ -126,8 +126,8 @@ Check whether the matcher contains rules for a match ID.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------- | | ----------- | ---- | ----------------------------------------------------- |
| `key` | unicode | The match ID. | | `key` | str | The match ID. |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | | **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## Matcher.add {#add tag="method" new="2"} ## Matcher.add {#add tag="method" new="2"}
@ -153,7 +153,7 @@ overwritten.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- | | ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | unicode | An ID for the thing you're matching. | | `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. | | `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
@ -189,8 +189,8 @@ exist.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----- | ------- | ------------------------- | | ----- | ---- | ------------------------- |
| `key` | unicode | The ID of the match rule. | | `key` | str | The ID of the match rule. |
## Matcher.get {#get tag="method" new="2"} ## Matcher.get {#get tag="method" new="2"}
@ -205,6 +205,6 @@ Retrieve the pattern stored for a key. Returns the rule as an
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | --------------------------------------------- | | ----------- | ----- | --------------------------------------------- |
| `key` | unicode | The ID of the match rule. | | `key` | str | The ID of the match rule. |
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. | | **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |

View File

@ -134,8 +134,8 @@ Check whether the matcher contains rules for a match ID.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ----------------------------------------------------- | | ----------- | ---- | ----------------------------------------------------- |
| `key` | unicode | The match ID. | | `key` | str | The match ID. |
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. | | **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
## PhraseMatcher.add {#add tag="method"} ## PhraseMatcher.add {#add tag="method"}
@ -162,7 +162,7 @@ overwritten.
| Name | Type | Description | | Name | Type | Description |
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- | | ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
| `match_id` | unicode | An ID for the thing you're matching. | | `match_id` | str | An ID for the thing you're matching. |
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. | | `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
| `*docs` | `Doc` | `Doc` objects of the phrases to match. | | `*docs` | `Doc` | `Doc` objects of the phrases to match. |
@ -199,5 +199,5 @@ does not exist.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----- | ------- | ------------------------- | | ----- | ---- | ------------------------- |
| `key` | unicode | The ID of the match rule. | | `key` | str | The ID of the match rule. |

View File

@ -113,7 +113,7 @@ end of the pipeline and after all other components.
</Infobox> </Infobox>
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------------------------ | | ----------- | ----- | ------------------------------------------------------------ |
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. | | `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. | | `label` | str | The subtoken dependency label. Defaults to `"subtok"`. |
| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. | | **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |

View File

@ -82,8 +82,8 @@ a file `sentencizer.json`. This also happens automatically when you save an
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- | | ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Sentencizer.from_disk {#from_disk tag="method"} ## Sentencizer.from_disk {#from_disk tag="method"}
@ -99,8 +99,8 @@ added to its pipeline.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. | | **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
## Sentencizer.to_bytes {#to_bytes tag="method"} ## Sentencizer.to_bytes {#to_bytes tag="method"}

View File

@ -110,7 +110,7 @@ For details, see the documentation on
| Name | Type | Description | | Name | Type | Description |
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- | | --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. | | `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
| `default` | - | Optional default value of the attribute if no getter or method is defined. | | `default` | - | Optional default value of the attribute if no getter or method is defined. |
| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. | | `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. |
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. | | `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
@ -133,8 +133,8 @@ Look up a previously registered extension by name. Returns a 4-tuple
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------------------------- | | ----------- | ----- | ------------------------------------------------------------- |
| `name` | unicode | Name of the extension. | | `name` | str | Name of the extension. |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
## Span.has_extension {#has_extension tag="classmethod" new="2"} ## Span.has_extension {#has_extension tag="classmethod" new="2"}
@ -150,8 +150,8 @@ Check whether an extension has been registered on the `Span` class.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------ | | ----------- | ---- | ------------------------------------------ |
| `name` | unicode | Name of the extension to check. | | `name` | str | Name of the extension to check. |
| **RETURNS** | bool | Whether the extension has been registered. | | **RETURNS** | bool | Whether the extension has been registered. |
## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"} ## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
@ -168,8 +168,8 @@ Remove a previously registered extension.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | --------------------------------------------------------------------- | | ----------- | ----- | --------------------------------------------------------------------- |
| `name` | unicode | Name of the extension. | | `name` | str | Name of the extension. |
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. | | **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
## Span.char_span {#char_span tag="method" new="2.2.4"} ## Span.char_span {#char_span tag="method" new="2.2.4"}
@ -497,16 +497,16 @@ The L2 norm of the span's vector representation.
| `end` | int | The token offset for the end of the span. | | `end` | int | The token offset for the end of the span. |
| `start_char` | int | The character offset for the start of the span. | | `start_char` | int | The character offset for the start of the span. |
| `end_char` | int | The character offset for the end of the span. | | `end_char` | int | The character offset for the end of the span. |
| `text` | unicode | A unicode representation of the span text. | | `text` | str | A unicode representation of the span text. |
| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. | | `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. |
| `orth` | int | ID of the verbatim text content. | | `orth` | int | ID of the verbatim text content. |
| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. | | `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
| `label` | int | The hash value of the span's label. | | `label` | int | The hash value of the span's label. |
| `label_` | unicode | The span's label. | | `label_` | str | The span's label. |
| `lemma_` | unicode | The span's lemma. | | `lemma_` | str | The span's lemma. |
| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. | | `kb_id` | int | The hash value of the knowledge base ID referred to by the span. |
| `kb_id_` | unicode | The knowledge base ID referred to by the span. | | `kb_id_` | str | The knowledge base ID referred to by the span. |
| `ent_id` | int | The hash value of the named entity the token is an instance of. | | `ent_id` | int | The hash value of the named entity the token is an instance of. |
| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. | | `ent_id_` | str | The string ID of the named entity the token is an instance of. |
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. | | `sentiment` | float | A scalar value indicating the positivity or negativity of the span. |
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). | | `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |

View File

@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa.
| Name | Type | Description | | Name | Type | Description |
| -------------- | ------------------------ | -------------------------- | | -------------- | ------------------------ | -------------------------- |
| `string_or_id` | bytes, str or uint64 | The value to encode. | | `string_or_id` | bytes, str or uint64 | The value to encode. |
| **RETURNS** | unicode or int | The value to be retrieved. | | **RETURNS** | str or int | The value to be retrieved. |
## StringStore.\_\_contains\_\_ {#contains tag="method"} ## StringStore.\_\_contains\_\_ {#contains tag="method"}
@ -70,8 +70,8 @@ Check whether a string is in the store.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------- | | ----------- | ---- | -------------------------------------- |
| `string` | unicode | The string to check. | | `string` | str | The string to check. |
| **RETURNS** | bool | Whether the store contains the string. | | **RETURNS** | bool | Whether the store contains the string. |
## StringStore.\_\_iter\_\_ {#iter tag="method"} ## StringStore.\_\_iter\_\_ {#iter tag="method"}
@ -88,8 +88,8 @@ store will always include an empty string `''` at position `0`.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------- | ------- | ---------------------- | | ---------- | ---- | ---------------------- |
| **YIELDS** | unicode | A string in the store. | | **YIELDS** | str | A string in the store. |
## StringStore.add {#add tag="method" new="2"} ## StringStore.add {#add tag="method" new="2"}
@ -107,8 +107,8 @@ Add a string to the `StringStore`.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------ | | ----------- | ------ | ------------------------ |
| `string` | unicode | The string to add. | | `string` | str | The string to add. |
| **RETURNS** | uint64 | The string's hash value. | | **RETURNS** | uint64 | The string's hash value. |
## StringStore.to_disk {#to_disk tag="method" new="2"} ## StringStore.to_disk {#to_disk tag="method" new="2"}
@ -122,8 +122,8 @@ Save the current state to a directory.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## StringStore.from_disk {#from_disk tag="method" new="2"} ## StringStore.from_disk {#from_disk tag="method" new="2"}
@ -137,8 +137,8 @@ Loads state from a directory. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `StringStore` | The modified `StringStore` object. | | **RETURNS** | `StringStore` | The modified `StringStore` object. |
## StringStore.to_bytes {#to_bytes tag="method"} ## StringStore.to_bytes {#to_bytes tag="method"}
@ -186,6 +186,6 @@ Get a 64-bit hash for a given string.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------- | | ----------- | ------ | ------------------- |
| `string` | unicode | The string to hash. | | `string` | str | The string to hash. |
| **RETURNS** | uint64 | The hash. | | **RETURNS** | uint64 | The hash. |

View File

@ -230,8 +230,8 @@ Add a new label to the pipe.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------- | ------- | --------------------------------------------------------------- | | -------- | ---- | --------------------------------------------------------------- |
| `label` | unicode | The label to add. | | `label` | str | The label to add. |
| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. | | `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
## Tagger.to_disk {#to_disk tag="method"} ## Tagger.to_disk {#to_disk tag="method"}
@ -246,8 +246,8 @@ Serialize the pipe to disk.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tagger.from_disk {#from_disk tag="method"} ## Tagger.from_disk {#from_disk tag="method"}
@ -262,8 +262,8 @@ Load the pipe from disk. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tagger` | The modified `Tagger` object. | | **RETURNS** | `Tagger` | The modified `Tagger` object. |

View File

@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and
| `vocab` | `Vocab` | The shared vocabulary. | | `vocab` | `Vocab` | The shared vocabulary. |
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. | | `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
| `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. | | `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. |
| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. | | `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
| **RETURNS** | `TextCategorizer` | The newly constructed object. | | **RETURNS** | `TextCategorizer` | The newly constructed object. |
### Architectures {#architectures new="2.1"} ### Architectures {#architectures new="2.1"}
@ -248,8 +248,8 @@ Add a new label to the pipe.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------- | ------- | ----------------- | | ------- | ---- | ----------------- |
| `label` | unicode | The label to add. | | `label` | str | The label to add. |
## TextCategorizer.to_disk {#to_disk tag="method"} ## TextCategorizer.to_disk {#to_disk tag="method"}
@ -263,8 +263,8 @@ Serialize the pipe to disk.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## TextCategorizer.from_disk {#from_disk tag="method"} ## TextCategorizer.from_disk {#from_disk tag="method"}
@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ----------------- | -------------------------------------------------------------------------- | | ----------- | ----------------- | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. | | **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |

View File

@ -35,7 +35,7 @@ the
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ----------- | ------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | A storage container for lexical types. | | `vocab` | `Vocab` | A storage container for lexical types. |
| `rules` | dict | Exceptions and special-cases for the tokenizer. | | `rules` | dict | Exceptions and special-cases for the tokenizer. |
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. | | `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
@ -56,8 +56,8 @@ Tokenize a string.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | --------------------------------------- | | ----------- | ----- | --------------------------------------- |
| `string` | unicode | The string to tokenize. | | `string` | str | The string to tokenize. |
| **RETURNS** | `Doc` | A container for linguistic annotations. | | **RETURNS** | `Doc` | A container for linguistic annotations. |
## Tokenizer.pipe {#pipe tag="method"} ## Tokenizer.pipe {#pipe tag="method"}
@ -83,8 +83,8 @@ Tokenize a stream of texts.
Find internal split points of the string. Find internal split points of the string.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | | ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
| `string` | unicode | The string to split. | | `string` | str | The string to split. |
| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. | | **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
## Tokenizer.find_prefix {#find_prefix tag="method"} ## Tokenizer.find_prefix {#find_prefix tag="method"}
@ -93,8 +93,8 @@ Find the length of a prefix that should be segmented from the string, or `None`
if no prefix rules match. if no prefix rules match.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | ------------------------------------------------------ | | ----------- | ---- | ------------------------------------------------------ |
| `string` | unicode | The string to segment. | | `string` | str | The string to segment. |
| **RETURNS** | int / `None` | The length of the prefix if present, otherwise `None`. | | **RETURNS** | int / `None` | The length of the prefix if present, otherwise `None`. |
## Tokenizer.find_suffix {#find_suffix tag="method"} ## Tokenizer.find_suffix {#find_suffix tag="method"}
@ -104,7 +104,7 @@ if no suffix rules match.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------ | ------------------------------------------------------ | | ----------- | ------------ | ------------------------------------------------------ |
| `string` | unicode | The string to segment. | | `string` | str | The string to segment. |
| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. | | **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |
## Tokenizer.add_special_case {#add_special_case tag="method"} ## Tokenizer.add_special_case {#add_special_case tag="method"}
@ -125,7 +125,7 @@ and examples.
| Name | Type | Description | | Name | Type | Description |
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | | ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| `string` | unicode | The string to specially tokenize. | | `string` | str | The string to specially tokenize. |
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. | | `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
## Tokenizer.explain {#explain tag="method"} ## Tokenizer.explain {#explain tag="method"}
@ -143,8 +143,8 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------------| -------- | --------------------------------------------------- | | ----------- | ---- | --------------------------------------------------- |
| `string` | unicode | The string to tokenize with the debugging tokenizer | | `string` | str | The string to tokenize with the debugging tokenizer |
| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples | | **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
## Tokenizer.to_disk {#to_disk tag="method"} ## Tokenizer.to_disk {#to_disk tag="method"}
@ -159,8 +159,8 @@ Serialize the tokenizer to disk.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Tokenizer.from_disk {#from_disk tag="method"} ## Tokenizer.from_disk {#from_disk tag="method"}
@ -175,8 +175,8 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. | | **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
@ -218,12 +218,12 @@ it.
## Attributes {#attributes} ## Attributes {#attributes}
| Name | Type | Description | | Name | Type | Description |
| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- | | ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. | | `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. | | `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. | | `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. | | `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. | | `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. | | `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
## Serialization fields {#serialization-fields} ## Serialization fields {#serialization-fields}

View File

@ -33,8 +33,8 @@ class. The data will be loaded in via
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | --------------------------------------------------------------------------------- | | ----------- | ------------ | --------------------------------------------------------------------------------- |
| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. | | `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| **RETURNS** | `Language` | A `Language` object with the loaded model. | | **RETURNS** | `Language` | A `Language` object with the loaded model. |
@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ | | ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. | | `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). | | `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. | | **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
@ -99,8 +99,8 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ---------- | ------- | ------------------------------------------------------------- | | ---------- | ---- | ------------------------------------------------------------- |
| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). | | `model` | str | A model, i.e. shortcut link, package name or path (optional). |
| `markdown` | bool | Print information as Markdown. | | `markdown` | bool | Print information as Markdown. |
### spacy.explain {#spacy.explain tag="function"} ### spacy.explain {#spacy.explain tag="function"}
@ -123,9 +123,9 @@ list of available terms, see
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------------------------- | | ----------- | ---- | -------------------------------------------------------- |
| `term` | unicode | Term to explain. | | `term` | str | Term to explain. |
| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. | | **RETURNS** | str | The explanation, or `None` if not found in the glossary. |
### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"} ### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
@ -189,13 +189,13 @@ browser. Will run a simple web server.
| Name | Type | Description | Default | | Name | Type | Description | Default |
| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- | | --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `True` | | `page` | bool | Render markup as full HTML page. | `True` |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | bool | Minify HTML markup. | `False` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| `port` | int | Port to serve visualization. | `5000` | | `port` | int | Port to serve visualization. | `5000` |
| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` | | `host` | str | Host to serve visualization. | `'0.0.0.0'` |
### displacy.render {#displacy.render tag="method" new="2"} ### displacy.render {#displacy.render tag="method" new="2"}
@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization.
| Name | Type | Description | Default | | Name | Type | Description | Default |
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- | | ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. | | `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` | | `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
| `page` | bool | Render markup as full HTML page. | `False` | | `page` | bool | Render markup as full HTML page. | `False` |
| `minify` | bool | Minify HTML markup. | `False` | | `minify` | bool | Minify HTML markup. | `False` |
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` | | `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` | | `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` | | `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
| **RETURNS** | unicode | Rendered HTML markup. | | **RETURNS** | str | Rendered HTML markup. |
### Visualizer options {#displacy_options} ### Visualizer options {#displacy_options}
@ -237,15 +237,15 @@ If a setting is not present in the options, the default value will be used.
> ``` > ```
| Name | Type | Description | Default | | Name | Type | Description | Default |
| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- | | ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` | | `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts. | `False` | | `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts. | `False` |
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` | | `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` | | `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | | `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` | | `color` | str | Text color (HEX, RGB or color names). | `'#000000'` |
| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` | | `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` |
| `font` | unicode | Font name or font family for all text. | `'Arial'` | | `font` | str | Font name or font family for all text. | `'Arial'` |
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` | | `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
| `arrow_stroke` | int | Width of arrow path in px. | `2` | | `arrow_stroke` | int | Width of arrow path in px. | `2` |
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) | | `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
@ -264,10 +264,10 @@ If a setting is not present in the options, the default value will be used.
> ``` > ```
| Name | Type | Description | Default | | Name | Type | Description | Default |
| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ | | --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
| `ents` | list | Entity types to highlight (`None` for all types). | `None` | | `ents` | list | Entity types to highlight (`None` for all types). | `None` |
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` | | `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) | | `template` <Tag variant="new">2.2</Tag> | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
By default, displaCy comes with colors for all By default, displaCy comes with colors for all
[entity types supported by spaCy](/api/annotation#named-entities). If you're [entity types supported by spaCy](/api/annotation#named-entities). If you're
@ -309,8 +309,8 @@ Set custom path to the data directory where spaCy looks for models.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------- | | ------ | ------------ | --------------------------- |
| `path` | unicode / `Path` | Path to new data directory. | | `path` | str / `Path` | Path to new data directory. |
### util.get_lang_class {#util.get_lang_class tag="function"} ### util.get_lang_class {#util.get_lang_class tag="function"}
@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------- | -------------------------------------- | | ----------- | ---------- | -------------------------------------- |
| `lang` | unicode | Two-letter language code, e.g. `'en'`. | | `lang` | str | Two-letter language code, e.g. `'en'`. |
| **RETURNS** | `Language` | Language class. | | **RETURNS** | `Language` | Language class. |
### util.set_lang_class {#util.set_lang_class tag="function"} ### util.set_lang_class {#util.set_lang_class tag="function"}
@ -352,7 +352,7 @@ the two-letter language code.
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------- | -------------------------------------- | | ------ | ---------- | -------------------------------------- |
| `name` | unicode | Two-letter language code, e.g. `'en'`. | | `name` | str | Two-letter language code, e.g. `'en'`. |
| `cls` | `Language` | The language class, e.g. `English`. | | `cls` | `Language` | The language class, e.g. `English`. |
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"} ### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
@ -369,8 +369,8 @@ loaded lazily, to avoid expensive setup code associated with the language data.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------- | | ----------- | ---- | -------------------------------------- |
| `name` | unicode | Two-letter language code, e.g. `'en'`. | | `name` | str | Two-letter language code, e.g. `'en'`. |
| **RETURNS** | bool | Whether the class has been loaded. | | **RETURNS** | bool | Whether the class has been loaded. |
### util.load_model {#util.load_model tag="function" new="2"} ### util.load_model {#util.load_model tag="function" new="2"}
@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk).
| Name | Type | Description | | Name | Type | Description |
| ------------- | ---------- | -------------------------------------------------------- | | ------------- | ---------- | -------------------------------------------------------- |
| `name` | unicode | Package name, shortcut link or model path. | | `name` | str | Package name, shortcut link or model path. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. | | `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. | | **RETURNS** | `Language` | `Language` class with the loaded model. |
@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet.
| Name | Type | Description | | Name | Type | Description |
| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- | | ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
| `model_path` | unicode | Path to model data directory. | | `model_path` | str | Path to model data directory. |
| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. | | `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. | | `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. | | **RETURNS** | `Language` | `Language` class with the loaded model. |
@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's
| Name | Type | Description | | Name | Type | Description |
| ------------- | ---------- | -------------------------------------------------------- | | ------------- | ---------- | -------------------------------------------------------- |
| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. | | `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. |
| `**overrides` | - | Specific overrides, like pipeline components to disable. | | `**overrides` | - | Specific overrides, like pipeline components to disable. |
| **RETURNS** | `Language` | `Language` class with the loaded model. | | **RETURNS** | `Language` | `Language` class with the loaded model. |
@ -447,8 +447,8 @@ Get a model's meta.json from a directory path and validate its contents.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | ------------------------ | | ----------- | ------------ | ------------------------ |
| `path` | unicode / `Path` | Path to model directory. | | `path` | str / `Path` | Path to model directory. |
| **RETURNS** | dict | The model's meta data. | | **RETURNS** | dict | The model's meta data. |
### util.is_package {#util.is_package tag="function"} ### util.is_package {#util.is_package tag="function"}
@ -464,8 +464,8 @@ Check if string maps to a package installed via pip. Mainly used to validate
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------------- | | ----------- | ------ | -------------------------------------------- |
| `name` | unicode | Name of package. | | `name` | str | Name of package. |
| **RETURNS** | `bool` | `True` if installed package, `False` if not. | | **RETURNS** | `bool` | `True` if installed package, `False` if not. |
### util.get_package_path {#util.get_package_path tag="function" new="2"} ### util.get_package_path {#util.get_package_path tag="function" new="2"}
@ -481,8 +481,8 @@ Get path to an installed package. Mainly used to resolve the location of
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| -------------- | ------- | -------------------------------- | | -------------- | ------ | -------------------------------- |
| `package_name` | unicode | Name of installed package. | | `package_name` | str | Name of installed package. |
| **RETURNS** | `Path` | Path to model package directory. | | **RETURNS** | `Path` | Path to model package directory. |
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"} ### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}

View File

@ -35,7 +35,7 @@ you can add vectors to later.
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. | | `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
| `keys` | iterable | A sequence of keys aligned with the data. | | `keys` | iterable | A sequence of keys aligned with the data. |
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. | | `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
| `name` | unicode | A name to identify the vectors table. | | `name` | str | A name to identify the vectors table. |
| **RETURNS** | `Vectors` | The newly created object. | | **RETURNS** | `Vectors` | The newly created object. |
## Vectors.\_\_getitem\_\_ {#getitem tag="method"} ## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------------------------- | ----------------------------------------------------- | | ----------- | ---------------------------------- | ----------------------------------------------------- |
| `key` | unicode / int | The key to add. | | `key` | str / int | The key to add. |
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. | | `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
| `row` | int | An optional row number of a vector to map the key to. | | `row` | int | An optional row number of a vector to map the key to. |
| **RETURNS** | int | The row the vector was added to. | | **RETURNS** | int | The row the vector was added to. |
@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa.
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------------------------------------- | ------------------------------------------------------------------------ | | ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. | | `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. | | `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
| `row` | int | Find the first key that points to the row. Returns int. | | `row` | int | Find the first key that points to the row. Returns int. |
| `rows` | iterable | Find the keys that point to the rows. Returns `ndarray`. | | `rows` | iterable | Find the keys that point to the rows. Returns `ndarray`. |
@ -338,8 +338,8 @@ Save the current state to a directory.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
## Vectors.from_disk {#from_disk tag="method"} ## Vectors.from_disk {#from_disk tag="method"}
@ -353,8 +353,8 @@ Loads state from a directory. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| **RETURNS** | `Vectors` | The modified `Vectors` object. | | **RETURNS** | `Vectors` | The modified `Vectors` object. |
## Vectors.to_bytes {#to_bytes tag="method"} ## Vectors.to_bytes {#to_bytes tag="method"}

View File

@ -27,7 +27,7 @@ Create the vocabulary.
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. | | `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. | | `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. | | `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
| `vectors_name` <Tag variant="new">2.2</Tag> | unicode | A name to identify the vectors table. | | `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
| **RETURNS** | `Vocab` | The newly constructed object. | | **RETURNS** | `Vocab` | The newly constructed object. |
## Vocab.\_\_len\_\_ {#len tag="method"} ## Vocab.\_\_len\_\_ {#len tag="method"}
@ -92,8 +92,8 @@ given string, you need to look it up in
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ------- | -------------------------------------------------- | | ----------- | ---- | -------------------------------------------------- |
| `string` | unicode | The ID string. | | `string` | str | The ID string. |
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. | | **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
## Vocab.add_flag {#add_flag tag="method"} ## Vocab.add_flag {#add_flag tag="method"}
@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.
| Name | Type | Description | | Name | Type | Description |
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- | | ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
| `flag_getter` | callable | A function `f(unicode) -> bool`, to get the flag value. | | `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value. |
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. | | `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
| **RETURNS** | int | The integer ID by which the flag value can be checked. | | **RETURNS** | int | The integer ID by which the flag value can be checked. |
@ -228,8 +228,8 @@ Save the current state to a directory.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- | | --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
## Vocab.from_disk {#from_disk tag="method" new="2"} ## Vocab.from_disk {#from_disk tag="method" new="2"}
@ -244,8 +244,8 @@ Loads state from a directory. Modifies the object in place and returns it.
> ``` > ```
| Name | Type | Description | | Name | Type | Description |
| ----------- | ---------------- | -------------------------------------------------------------------------- | | ----------- | ------------ | -------------------------------------------------------------------------- |
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. | | `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. | | `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
| **RETURNS** | `Vocab` | The modified `Vocab` object. | | **RETURNS** | `Vocab` | The modified `Vocab` object. |

View File

@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
### Disabling the parser {#disabling} ### Disabling the parser {#disabling}
In the [default models](/models), the parser is loaded and enabled as part of In the [default models](/models), the parser is loaded and enabled as part of
the [standard processing pipeline](/usage/processing-pipelines). If you don't need the [standard processing pipeline](/usage/processing-pipelines). If you don't
any of the syntactic information, you should disable the parser. Disabling the need any of the syntactic information, you should disable the parser. Disabling
parser will make spaCy load and run much faster. If you want to load the parser, the parser will make spaCy load and run much faster. If you want to load the
but need to disable it for specific documents, you can also control its use on parser, but need to disable it for specific documents, you can also control its
the `nlp` object. use on the `nlp` object.
```python ```python
nlp = spacy.load("en_core_web_sm", disable=["parser"]) nlp = spacy.load("en_core_web_sm", disable=["parser"])
@ -989,8 +989,8 @@ nlp.tokenizer = my_tokenizer
``` ```
| Argument | Type | Description | | Argument | Type | Description |
| ----------- | ------- | ------------------------- | | ----------- | ----- | ------------------------- |
| `text` | unicode | The raw text to tokenize. | | `text` | str | The raw text to tokenize. |
| **RETURNS** | `Doc` | The tokenized document. | | **RETURNS** | `Doc` | The tokenized document. |
<Infobox title="Important note: using a custom tokenizer" variant="warning"> <Infobox title="Important note: using a custom tokenizer" variant="warning">

View File

@ -272,16 +272,16 @@ doc = nlp("I won't have named entities")
disabled.restore() disabled.restore()
``` ```
If you want to disable all pipes except for one or a few, you can use the `enable` If you want to disable all pipes except for one or a few, you can use the
keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string `enable` keyword. Just like the `disable` keyword, it takes a list of pipe
defining just one pipe. names, or a string defining just one pipe.
```python ```python
# Enable only the parser # Enable only the parser
with nlp.select_pipes(enable="parser"): with nlp.select_pipes(enable="parser"):
doc = nlp("I will only be parsed") doc = nlp("I will only be parsed")
``` ```
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
to remove pipeline components from an existing pipeline, the to remove pipeline components from an existing pipeline, the
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the [`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
@ -350,11 +350,11 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
> ``` > ```
| Argument | Type | Description | | Argument | Type | Description |
| -------- | ------- | ------------------------------------------------------------------------ | | -------- | ---- | ------------------------------------------------------------------------ |
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). | | `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
| `first` | bool | If set to `True`, component is added **first** in the pipeline. | | `first` | bool | If set to `True`, component is added **first** in the pipeline. |
| `before` | unicode | String name of component to add the new component **before**. | | `before` | str | String name of component to add the new component **before**. |
| `after` | unicode | String name of component to add the new component **after**. | | `after` | str | String name of component to add the new component **after**. |
### Example: A simple pipeline component {#custom-components-simple} ### Example: A simple pipeline component {#custom-components-simple}

View File

@ -158,17 +158,17 @@ The available token pattern keys correspond to a number of
rule-based matching are: rule-based matching are:
| Attribute | Type |  Description | | Attribute | Type |  Description |
| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ | | -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
| `ORTH` | unicode | The exact verbatim text of a token. | | `ORTH` | str | The exact verbatim text of a token. |
| `TEXT` <Tag variant="new">2.1</Tag> | unicode | The exact verbatim text of a token. | | `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
| `LOWER` | unicode | The lowercase form of the token text. | | `LOWER` | str | The lowercase form of the token text. |
|  `LENGTH` | int | The length of the token text. | |  `LENGTH` | int | The length of the token text. |
|  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. | |  `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. | |  `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. | |  `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. | |  `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. | |  `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
| `ENT_TYPE` | unicode | The token's entity label. | | `ENT_TYPE` | str | The token's entity label. |
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). | | `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?"> <Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"} ### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
When using a large number of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**, When using a large number of **phrase patterns** (roughly > 10000) it's useful
the EntityRuler calls the nlp object to construct a doc object. This happens in case you try to understand how the `add_patterns` function of the EntityRuler works. For each
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to **phrase pattern**, the EntityRuler calls the nlp object to construct a doc
extract matches based on the pattern's POS signature. object. This happens in case you try to add the EntityRuler at the end of an
existing pipeline with, for example, a POS tagger and want to extract matches
based on the pattern's POS signature.
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler. In this case you would pass a config value of `phrase_matcher_attr="POS"` for
the EntityRuler.
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time with a large number of phrase patterns. Running the full language pipeline across every pattern in a large list scales
linearly and can therefore take a long time with a large number of phrase patterns.
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively. As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with
5,000-100,000 phrase patterns respectively.
Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time. Even with this speedup (but especially if you're using an older version) the
`add_patterns` function can still take a long time.
An easy workaround to make this function run faster is disabling the other language pipes An easy workaround to make this function run faster is disabling the other
while adding the phrase patterns. language pipes while adding the phrase patterns.
```python ```python
entityruler = EntityRuler(nlp) entityruler = EntityRuler(nlp)

View File

@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab))
If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
well, which includes the values of well, which includes the values of
[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if [extension attributes](/usage/processing-pipelines#custom-components-attributes)
they're serializable with msgpack). (if they're serializable with msgpack).
<Infobox title="Important note on serializing extension attributes" variant="warning"> <Infobox title="Important note on serializing extension attributes" variant="warning">
@ -667,8 +667,8 @@ define the language data to be loaded and the
[processing pipeline](/usage/processing-pipelines) to execute. [processing pipeline](/usage/processing-pipelines) to execute.
| Setting | Type | Description | | Setting | Type | Description |
| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- | | ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `lang` | unicode | ID of the language class to initialize. | | `lang` | str | ID of the language class to initialize. |
| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. | | `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
The `load()` method that comes with our model package templates will take care The `load()` method that comes with our model package templates will take care

View File

@ -68,11 +68,11 @@ arcs.
</Infobox> </Infobox>
| Argument | Type | Description | Default | | Argument | Type | Description | Default |
| --------- | ------- | ----------------------------------------------------------- | ----------- | | --------- | ---- | ----------------------------------------------------------- | ----------- |
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` | | `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` | | `color` | str | Text color (HEX, RGB or color names). | `"#000000"` |
| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` | | `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` |
| `font` | unicode | Font name or font family for all text. | `"Arial"` | | `font` | str | Font name or font family for all text. | `"Arial"` |
For a list of all available options, see the For a list of all available options, see the
[`displacy` API documentation](/api/top-level#displacy_options). [`displacy` API documentation](/api/top-level#displacy_options).