mirror of
https://github.com/explosion/spaCy.git
synced 2025-01-12 02:06:31 +03:00
Merge branch 'develop' into master-tmp
This commit is contained in:
commit
810fce3bb1
4
Makefile
4
Makefile
|
@ -5,7 +5,7 @@ VENV := ./env$(PYVER)
|
|||
version := $(shell "bin/get-version.sh")
|
||||
|
||||
dist/spacy-$(version).pex : wheelhouse/spacy-$(version).stamp
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) jsonschema spacy_lookups_data
|
||||
$(VENV)/bin/pex -f ./wheelhouse --no-index --disable-cache -m spacy -o $@ spacy==$(version) spacy_lookups_data
|
||||
chmod a+rx $@
|
||||
|
||||
dist/pytest.pex : wheelhouse/pytest-*.whl
|
||||
|
@ -14,7 +14,7 @@ dist/pytest.pex : wheelhouse/pytest-*.whl
|
|||
|
||||
wheelhouse/spacy-$(version).stamp : $(VENV)/bin/pex setup.py spacy/*.py* spacy/*/*.py*
|
||||
$(VENV)/bin/pip wheel . -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel jsonschema spacy_lookups_data -w ./wheelhouse
|
||||
$(VENV)/bin/pip wheel spacy_lookups_data -w ./wheelhouse
|
||||
touch $@
|
||||
|
||||
wheelhouse/pytest-%.whl : $(VENV)/bin/pex
|
||||
|
|
115
examples/experiments/onto-joint/defaults.cfg
Normal file
115
examples/experiments/onto-joint/defaults.cfg
Normal file
|
@ -0,0 +1,115 @@
|
|||
# Training hyper-parameters and additional features.
|
||||
[training]
|
||||
# Whether to train on sequences with 'gold standard' sentence boundaries
|
||||
# and tokens. If you set this to true, take care to ensure your run-time
|
||||
# data is passed in sentence-by-sentence via some prior preprocessing.
|
||||
gold_preproc = false
|
||||
# Limitations on training document length or number of examples.
|
||||
max_length = 0
|
||||
limit = 0
|
||||
# Data augmentation
|
||||
orth_variant_level = 0.0
|
||||
dropout = 0.1
|
||||
# Controls early-stopping. 0 or -1 mean unlimited.
|
||||
patience = 1600
|
||||
max_epochs = 0
|
||||
max_steps = 20000
|
||||
eval_frequency = 400
|
||||
# Other settings
|
||||
seed = 0
|
||||
accumulate_gradient = 1
|
||||
use_pytorch_for_gpu_memory = false
|
||||
# Control how scores are printed and checkpoints are evaluated.
|
||||
scores = ["speed", "tags_acc", "uas", "las", "ents_f"]
|
||||
score_weights = {"las": 0.4, "ents_f": 0.4, "tags_acc": 0.2}
|
||||
# These settings are invalid for the transformer models.
|
||||
init_tok2vec = null
|
||||
vectors = null
|
||||
|
||||
[training.batch_size]
|
||||
@schedules = "compounding.v1"
|
||||
start = 1000
|
||||
stop = 1000
|
||||
compound = 1.001
|
||||
|
||||
[optimizer]
|
||||
@optimizers = "Adam.v1"
|
||||
beta1 = 0.9
|
||||
beta2 = 0.999
|
||||
L2_is_weight_decay = true
|
||||
L2 = 0.01
|
||||
grad_clip = 1.0
|
||||
use_averages = true
|
||||
eps = 1e-8
|
||||
learn_rate = 0.001
|
||||
|
||||
#[optimizer.learn_rate]
|
||||
#@schedules = "warmup_linear.v1"
|
||||
#warmup_steps = 250
|
||||
#total_steps = 20000
|
||||
#initial_rate = 0.001
|
||||
|
||||
[nlp]
|
||||
lang = "en"
|
||||
vectors = ${training:vectors}
|
||||
|
||||
[nlp.pipeline.tok2vec]
|
||||
factory = "tok2vec"
|
||||
|
||||
[nlp.pipeline.senter]
|
||||
factory = "senter"
|
||||
|
||||
[nlp.pipeline.ner]
|
||||
factory = "ner"
|
||||
|
||||
[nlp.pipeline.tagger]
|
||||
factory = "tagger"
|
||||
|
||||
[nlp.pipeline.parser]
|
||||
factory = "parser"
|
||||
|
||||
[nlp.pipeline.senter.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.senter.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tagger.model]
|
||||
@architectures = "spacy.Tagger.v1"
|
||||
|
||||
[nlp.pipeline.tagger.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.parser.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 8
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.parser.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.ner.model]
|
||||
@architectures = "spacy.TransitionBasedParser.v1"
|
||||
nr_feature_tokens = 3
|
||||
hidden_width = 128
|
||||
maxout_pieces = 3
|
||||
use_upper = false
|
||||
|
||||
[nlp.pipeline.ner.model.tok2vec]
|
||||
@architectures = "spacy.Tok2VecTensors.v1"
|
||||
width = ${nlp.pipeline.tok2vec.model:width}
|
||||
|
||||
[nlp.pipeline.tok2vec.model]
|
||||
@architectures = "spacy.HashEmbedCNN.v1"
|
||||
pretrained_vectors = ${nlp:vectors}
|
||||
width = 256
|
||||
depth = 6
|
||||
window_size = 1
|
||||
embed_size = 10000
|
||||
maxout_pieces = 3
|
||||
subword_features = true
|
|
@ -13,9 +13,11 @@ numpy>=1.15.0
|
|||
requests>=2.13.0,<3.0.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
# Optional dependencies
|
||||
jsonschema>=2.6.0,<3.1.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
# Development dependencies
|
||||
cython>=0.25
|
||||
pytest>=4.6.5
|
||||
|
|
|
@ -47,15 +47,17 @@ install_requires =
|
|||
wasabi>=0.4.0,<1.1.0
|
||||
srsly>=2.0.0,<3.0.0
|
||||
catalogue>=0.0.7,<1.1.0
|
||||
ml_datasets
|
||||
ml_datasets>=0.1.1
|
||||
# Third-party dependencies
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
setuptools
|
||||
numpy>=1.15.0
|
||||
plac>=0.9.6,<1.2.0
|
||||
requests>=2.13.0,<3.0.0
|
||||
pydantic>=1.3.0,<2.0.0
|
||||
tqdm>=4.38.0,<5.0.0
|
||||
# Official Python utilities
|
||||
setuptools
|
||||
packaging
|
||||
importlib_metadata>=0.20; python_version < "3.8"
|
||||
|
||||
[options.extras_require]
|
||||
lookups =
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# fmt: off
|
||||
__title__ = "spacy"
|
||||
__version__ = "3.0.0.dev8"
|
||||
__version__ = "3.0.0.dev9"
|
||||
__release__ = True
|
||||
__download_url__ = "https://github.com/explosion/spacy-models/releases/download"
|
||||
__compatibility__ = "https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json"
|
||||
|
|
|
@ -111,8 +111,8 @@ def get_entities(lines, tag_pattern, ner_map=None):
|
|||
final entity type with `ner_map` if mapping present. Entity tag is 'O' if
|
||||
the pattern is not matched.
|
||||
|
||||
lines (unicode): CONLL-U lines for one sentences
|
||||
tag_pattern (unicode): Regex pattern for entity tag
|
||||
lines (str): CONLL-U lines for one sentence
|
||||
tag_pattern (str): Regex pattern for entity tag
|
||||
ner_map (dict): Map old NER tag names to new ones, '' maps to O.
|
||||
RETURNS (list): List of BILUO entity tags
|
||||
"""
|
||||
|
@ -187,8 +187,8 @@ def example_from_conllu_sentence(
|
|||
"""Create an Example from the lines for one CoNLL-U sentence, merging
|
||||
subtokens and appending morphology to tags if required.
|
||||
|
||||
lines (unicode): The non-comment lines for a CoNLL-U sentence
|
||||
ner_tag_pattern (unicode): The regex pattern for matching NER in MISC col
|
||||
lines (str): The non-comment lines for a CoNLL-U sentence
|
||||
ner_tag_pattern (str): The regex pattern for matching NER in MISC col
|
||||
RETURNS (Example): An example containing the annotation
|
||||
"""
|
||||
# create a Doc with each subtoken as its own token
|
||||
|
|
|
@ -5,6 +5,7 @@ import sys
|
|||
from wasabi import msg
|
||||
|
||||
from .. import about
|
||||
from ..util import is_package, get_base_version
|
||||
|
||||
|
||||
def download(
|
||||
|
@ -17,7 +18,7 @@ def download(
|
|||
flag is set, the command expects the full model name with version.
|
||||
For direct downloads, the compatibility check will be skipped.
|
||||
"""
|
||||
if not require_package("spacy") and "--no-deps" not in pip_args:
|
||||
if not is_package("spacy") and "--no-deps" not in pip_args:
|
||||
msg.warn(
|
||||
"Skipping model package dependencies and setting `--no-deps`. "
|
||||
"You don't seem to have the spaCy package itself installed "
|
||||
|
@ -45,21 +46,6 @@ def download(
|
|||
"Download and installation successful",
|
||||
f"You can now load the model via spacy.load('{model_name}')",
|
||||
)
|
||||
# If a model is downloaded and then loaded within the same process, our
|
||||
# is_package check currently fails, because pkg_resources.working_set
|
||||
# is not refreshed automatically (see #3923). We're trying to work
|
||||
# around this here be requiring the package explicitly.
|
||||
require_package(model_name)
|
||||
|
||||
|
||||
def require_package(name):
|
||||
try:
|
||||
import pkg_resources
|
||||
|
||||
pkg_resources.working_set.require(name)
|
||||
return True
|
||||
except: # noqa: E722
|
||||
return False
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
|
@ -77,8 +63,7 @@ def get_json(url, desc):
|
|||
|
||||
|
||||
def get_compatibility():
|
||||
version = about.__version__
|
||||
version = version.rsplit(".dev", 1)[0]
|
||||
version = get_base_version(about.__version__)
|
||||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||
comp = comp_table["spacy"]
|
||||
if version not in comp:
|
||||
|
@ -87,7 +72,7 @@ def get_compatibility():
|
|||
|
||||
|
||||
def get_version(model, comp):
|
||||
model = model.rsplit(".dev", 1)[0]
|
||||
model = get_base_version(model)
|
||||
if model not in comp:
|
||||
msg.fail(
|
||||
f"No compatible model found for '{model}' (spaCy v{about.__version__})",
|
||||
|
|
|
@ -48,7 +48,9 @@ def info(
|
|||
"Location": str(Path(__file__).parent.parent),
|
||||
"Platform": platform.platform(),
|
||||
"Python version": platform.python_version(),
|
||||
"Models": ", ".join(model["name"] for model in all_models.values()),
|
||||
"Models": ", ".join(
|
||||
f"{m['name']} ({m['version']})" for m in all_models.values()
|
||||
),
|
||||
}
|
||||
if not silent:
|
||||
title = "Info about spaCy"
|
||||
|
@ -63,7 +65,7 @@ def print_markdown(data, title=None):
|
|||
"""Print data in GitHub-flavoured Markdown format for issues etc.
|
||||
|
||||
data (dict or list of tuples): Label/value pairs.
|
||||
title (unicode or None): Title, will be rendered as headline 2.
|
||||
title (str / None): Title, will be rendered as headline 2.
|
||||
"""
|
||||
markdown = []
|
||||
for key, value in data.items():
|
||||
|
|
|
@ -83,14 +83,14 @@ def generate_meta(model_path, existing_meta, msg):
|
|||
("lang", "Model language", meta.get("lang", "en")),
|
||||
("name", "Model name", meta.get("name", "model")),
|
||||
("version", "Model version", meta.get("version", "0.0.0")),
|
||||
("spacy_version", "Required spaCy version", f">={about.__version__},<3.0.0"),
|
||||
("description", "Model description", meta.get("description", False)),
|
||||
("author", "Author", meta.get("author", False)),
|
||||
("email", "Author email", meta.get("email", False)),
|
||||
("url", "Author website", meta.get("url", False)),
|
||||
("license", "License", meta.get("license", "CC BY-SA 3.0")),
|
||||
("license", "License", meta.get("license", "MIT")),
|
||||
]
|
||||
nlp = util.load_model_from_path(Path(model_path))
|
||||
meta["spacy_version"] = util.get_model_version_range(about.__version__)
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["vectors"] = {
|
||||
"width": nlp.vocab.vectors_length,
|
||||
|
@ -168,6 +168,7 @@ def setup_package():
|
|||
package_data={model_name: list_files(model_dir)},
|
||||
install_requires=list_requirements(meta),
|
||||
zip_safe=False,
|
||||
entry_points={'spacy_models': ['{m} = {m}'.format(m=model_name)]}
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -483,7 +483,6 @@ def train(
|
|||
# Update model meta.json
|
||||
meta["lang"] = nlp.lang
|
||||
meta["pipeline"] = nlp.pipe_names
|
||||
meta["spacy_version"] = f">={about.__version__}"
|
||||
if beam_width == 1:
|
||||
meta["speed"] = {
|
||||
"nwords": nwords,
|
||||
|
|
|
@ -7,7 +7,7 @@ from pathlib import Path
|
|||
from wasabi import msg
|
||||
import thinc
|
||||
import thinc.schedules
|
||||
from thinc.api import Model
|
||||
from thinc.api import Model, use_pytorch_for_gpu_memory
|
||||
import random
|
||||
|
||||
from ..gold import GoldCorpus
|
||||
|
@ -171,6 +171,8 @@ def train_from_config(
|
|||
msg.info(f"Loading config from: {config_path}")
|
||||
config = util.load_config(config_path, create_objects=False)
|
||||
util.fix_random_seed(config["training"]["seed"])
|
||||
if config["training"]["use_pytorch_for_gpu_memory"]:
|
||||
use_pytorch_for_gpu_memory()
|
||||
nlp_config = config["nlp"]
|
||||
config = util.load_config(config_path, create_objects=True)
|
||||
msg.info("Creating nlp from config")
|
||||
|
@ -213,6 +215,12 @@ def train_from_config(
|
|||
if is_best_checkpoint and output_path is not None:
|
||||
nlp.to_disk(output_path)
|
||||
progress = tqdm.tqdm(total=training["eval_frequency"], leave=False)
|
||||
# Clean up the objects to facilitate garbage collection.
|
||||
for eg in batch:
|
||||
eg.doc = None
|
||||
eg.goldparse = None
|
||||
eg.doc_annotation = None
|
||||
eg.token_annotation = None
|
||||
finally:
|
||||
if output_path is not None:
|
||||
final_model_path = output_path / "model-final"
|
||||
|
|
|
@ -4,6 +4,8 @@ import requests
|
|||
from wasabi import msg
|
||||
|
||||
from .. import about
|
||||
from ..util import get_package_version, get_installed_models, get_base_version
|
||||
from ..util import get_package_path, get_model_meta, is_compatible_version
|
||||
|
||||
|
||||
def validate():
|
||||
|
@ -12,7 +14,7 @@ def validate():
|
|||
with the installed models. Should be run after `pip install -U spacy`.
|
||||
"""
|
||||
model_pkgs, compat = get_model_pkgs()
|
||||
spacy_version = about.__version__.rsplit(".dev", 1)[0]
|
||||
spacy_version = get_base_version(about.__version__)
|
||||
current_compat = compat.get(spacy_version, {})
|
||||
if not current_compat:
|
||||
msg.warn(f"No compatible models found for v{spacy_version} of spaCy")
|
||||
|
@ -25,7 +27,7 @@ def validate():
|
|||
msg.info(f"spaCy installation: {spacy_dir}")
|
||||
|
||||
if model_pkgs:
|
||||
header = ("NAME", "VERSION", "")
|
||||
header = ("NAME", "SPACY", "VERSION", "")
|
||||
rows = []
|
||||
for name, data in model_pkgs.items():
|
||||
if data["compat"]:
|
||||
|
@ -34,7 +36,7 @@ def validate():
|
|||
else:
|
||||
version = msg.text(data["version"], color="red", no_print=True)
|
||||
comp = f"--> {compat.get(data['name'], ['n/a'])[0]}"
|
||||
rows.append((data["name"], version, comp))
|
||||
rows.append((data["name"], data["spacy"], version, comp))
|
||||
msg.table(rows, header=header)
|
||||
else:
|
||||
msg.text("No models found in your current environment.", exits=0)
|
||||
|
@ -44,8 +46,9 @@ def validate():
|
|||
cmd = "python -m spacy download {}"
|
||||
print("\n".join([cmd.format(pkg) for pkg in update_models]) + "\n")
|
||||
if na_models:
|
||||
msg.warn(
|
||||
f"The following models are not available for spaCy v{about.__version__}:",
|
||||
msg.info(
|
||||
f"The following models are custom spaCy models or not "
|
||||
f"available for spaCy v{about.__version__}:",
|
||||
", ".join(na_models),
|
||||
)
|
||||
if incompat_models:
|
||||
|
@ -53,8 +56,6 @@ def validate():
|
|||
|
||||
|
||||
def get_model_pkgs():
|
||||
import pkg_resources
|
||||
|
||||
with msg.loading("Loading compatibility table..."):
|
||||
r = requests.get(about.__compatibility__)
|
||||
if r.status_code != 200:
|
||||
|
@ -66,20 +67,29 @@ def get_model_pkgs():
|
|||
msg.good("Loaded compatibility table")
|
||||
compat = r.json()["spacy"]
|
||||
all_models = set()
|
||||
installed_models = get_installed_models()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
for model, model_vs in models.items():
|
||||
compat[spacy_v][model] = [reformat_version(v) for v in model_vs]
|
||||
pkgs = {}
|
||||
for pkg_name, pkg_data in pkg_resources.working_set.by_key.items():
|
||||
for pkg_name in installed_models:
|
||||
package = pkg_name.replace("-", "_")
|
||||
if package in all_models:
|
||||
version = pkg_data.version
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"compat": package in compat and version in compat[package],
|
||||
}
|
||||
version = get_package_version(pkg_name)
|
||||
if package in compat:
|
||||
is_compat = version in compat[package]
|
||||
spacy_version = about.__version__
|
||||
else:
|
||||
model_path = get_package_path(package)
|
||||
model_meta = get_model_meta(model_path)
|
||||
spacy_version = model_meta.get("spacy_version", "n/a")
|
||||
is_compat = is_compatible_version(about.__version__, spacy_version)
|
||||
pkgs[pkg_name] = {
|
||||
"name": package,
|
||||
"version": version,
|
||||
"spacy": spacy_version,
|
||||
"compat": is_compat,
|
||||
}
|
||||
return pkgs, compat
|
||||
|
||||
|
||||
|
|
|
@ -22,13 +22,13 @@ def render(
|
|||
"""Render displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
style (str): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
jupyter (bool): Override Jupyter auto-detection.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.render
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
@ -73,13 +73,13 @@ def serve(
|
|||
"""Serve displaCy visualisation.
|
||||
|
||||
docs (list or Doc): Document(s) to visualise.
|
||||
style (unicode): Visualisation style, 'dep' or 'ent'.
|
||||
style (str): Visualisation style, 'dep' or 'ent'.
|
||||
page (bool): Render markup as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
options (dict): Visualiser-specific options, e.g. colors.
|
||||
manual (bool): Don't parse `Doc` and instead expect a dict/list of dicts.
|
||||
port (int): Port to serve visualisation.
|
||||
host (unicode): Host to serve visualisation.
|
||||
host (str): Host to serve visualisation.
|
||||
|
||||
DOCS: https://spacy.io/api/top-level#displacy.serve
|
||||
USAGE: https://spacy.io/usage/visualizers
|
||||
|
|
|
@ -47,7 +47,7 @@ class DependencyRenderer(object):
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered SVG or HTML markup.
|
||||
RETURNS (str): Rendered SVG or HTML markup.
|
||||
"""
|
||||
# Create a random ID prefix to make sure parses don't receive the
|
||||
# same ID, even if they're identical
|
||||
|
@ -78,7 +78,7 @@ class DependencyRenderer(object):
|
|||
render_id (int): Unique ID, typically index of document.
|
||||
words (list): Individual words and their tags.
|
||||
arcs (list): Individual arcs and their start, end, direction and label.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
self.levels = self.get_levels(arcs)
|
||||
self.highest_level = len(self.levels)
|
||||
|
@ -112,10 +112,10 @@ class DependencyRenderer(object):
|
|||
):
|
||||
"""Render individual word.
|
||||
|
||||
text (unicode): Word text.
|
||||
tag (unicode): Part-of-speech tag.
|
||||
text (str): Word text.
|
||||
tag (str): Part-of-speech tag.
|
||||
i (int): Unique ID, typically word index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
y = self.offset_y + self.word_spacing
|
||||
x = self.offset_x + i * self.distance
|
||||
|
@ -131,12 +131,12 @@ class DependencyRenderer(object):
|
|||
def render_arrow(self, label, start, end, direction, i):
|
||||
"""Render individual arrow.
|
||||
|
||||
label (unicode): Dependency label.
|
||||
label (str): Dependency label.
|
||||
start (int): Index of start word.
|
||||
end (int): Index of end word.
|
||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
||||
direction (str): Arrow direction, 'left' or 'right'.
|
||||
i (int): Unique ID, typically arrow index.
|
||||
RETURNS (unicode): Rendered SVG markup.
|
||||
RETURNS (str): Rendered SVG markup.
|
||||
"""
|
||||
if start < 0 or end < 0:
|
||||
error_args = dict(start=start, end=end, label=label, dir=direction)
|
||||
|
@ -179,7 +179,7 @@ class DependencyRenderer(object):
|
|||
y (int): Y-coordinate of arrow start and end point.
|
||||
y_curve (int): Y-coordinate of Cubic Bézier y_curve point.
|
||||
x_end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arc path ('d' attribute).
|
||||
RETURNS (str): Definition of the arc path ('d' attribute).
|
||||
"""
|
||||
template = "M{x},{y} C{x},{c} {e},{c} {e},{y}"
|
||||
if self.compact:
|
||||
|
@ -189,11 +189,11 @@ class DependencyRenderer(object):
|
|||
def get_arrowhead(self, direction, x, y, end):
|
||||
"""Render individual arrow head.
|
||||
|
||||
direction (unicode): Arrow direction, 'left' or 'right'.
|
||||
direction (str): Arrow direction, 'left' or 'right'.
|
||||
x (int): X-coordinate of arrow start point.
|
||||
y (int): Y-coordinate of arrow start and end point.
|
||||
end (int): X-coordinate of arrow end point.
|
||||
RETURNS (unicode): Definition of the arrow head path ('d' attribute).
|
||||
RETURNS (str): Definition of the arrow head path ('d' attribute).
|
||||
"""
|
||||
if direction == "left":
|
||||
pos1, pos2, pos3 = (x, x - self.arrow_width + 2, x + self.arrow_width - 2)
|
||||
|
@ -279,7 +279,7 @@ class EntityRenderer(object):
|
|||
parsed (list): Dependency parses to render.
|
||||
page (bool): Render parses wrapped as full HTML page.
|
||||
minify (bool): Minify HTML markup.
|
||||
RETURNS (unicode): Rendered HTML markup.
|
||||
RETURNS (str): Rendered HTML markup.
|
||||
"""
|
||||
rendered = []
|
||||
for i, p in enumerate(parsed):
|
||||
|
@ -300,9 +300,9 @@ class EntityRenderer(object):
|
|||
def render_ents(self, text, spans, title):
|
||||
"""Render entities in text.
|
||||
|
||||
text (unicode): Original text.
|
||||
text (str): Original text.
|
||||
spans (list): Individual entity spans and their start, end and label.
|
||||
title (unicode or None): Document title set in Doc.user_data['title'].
|
||||
title (str / None): Document title set in Doc.user_data['title'].
|
||||
"""
|
||||
markup = ""
|
||||
offset = 0
|
||||
|
|
|
@ -113,9 +113,12 @@ class Warnings(object):
|
|||
"ignored during training.")
|
||||
|
||||
# TODO: fix numbering after merging develop into master
|
||||
W095 = ("Skipping unsupported morphological feature(s): {feature}. "
|
||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||
W095 = ("Model '{model}' ({model_version}) requires spaCy {version} and is "
|
||||
"incompatible with the current version ({current}). This may lead "
|
||||
"to unexpected results or runtime errors. To resolve this, "
|
||||
"download a newer compatible model or retrain your custom model "
|
||||
"with the current spaCy version. For more details and available "
|
||||
"updates, run: python -m spacy validate")
|
||||
W096 = ("The method 'disable_pipes' has become deprecated - use 'select_pipes' "
|
||||
"instead.")
|
||||
W097 = ("No Model config was provided to create the '{name}' component, "
|
||||
|
@ -124,6 +127,9 @@ class Warnings(object):
|
|||
"so a default configuration was used.")
|
||||
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
|
||||
"but got '{type}' instead, so ignoring it.")
|
||||
W100 = ("Skipping unsupported morphological feature(s): {feature}. "
|
||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||
|
||||
|
||||
@add_codes
|
||||
|
@ -621,7 +627,7 @@ class MatchPatternError(ValueError):
|
|||
def __init__(self, key, errors):
|
||||
"""Custom error for validating match patterns.
|
||||
|
||||
key (unicode): The name of the matcher rule.
|
||||
key (str): The name of the matcher rule.
|
||||
errors (dict): Validation errors (sequence of strings) mapped to pattern
|
||||
ID, i.e. the index of the added pattern.
|
||||
"""
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
def explain(term):
|
||||
"""Get a description for a given POS tag, dependency label or entity type.
|
||||
|
||||
term (unicode): The term to explain.
|
||||
RETURNS (unicode): The explanation, or `None` if not found in the glossary.
|
||||
term (str): The term to explain.
|
||||
RETURNS (str): The explanation, or `None` if not found in the glossary.
|
||||
|
||||
EXAMPLE:
|
||||
>>> spacy.explain(u'NORP')
|
||||
|
|
|
@ -154,8 +154,8 @@ class GoldCorpus(object):
|
|||
def __init__(self, train, dev, gold_preproc=False, limit=None):
|
||||
"""Create a GoldCorpus.
|
||||
|
||||
train (unicode or Path): File or directory of training data.
|
||||
dev (unicode or Path): File or directory of development data.
|
||||
train (str / Path): File or directory of training data.
|
||||
dev (str / Path): File or directory of development data.
|
||||
RETURNS (GoldCorpus): The newly created object.
|
||||
"""
|
||||
self.limit = limit
|
||||
|
|
|
@ -38,7 +38,7 @@ cdef class Candidate:
|
|||
|
||||
@property
|
||||
def entity_(self):
|
||||
"""RETURNS (unicode): ID/name of this entity in the KB"""
|
||||
"""RETURNS (str): ID/name of this entity in the KB"""
|
||||
return self.kb.vocab.strings[self.entity_hash]
|
||||
|
||||
@property
|
||||
|
@ -48,7 +48,7 @@ cdef class Candidate:
|
|||
|
||||
@property
|
||||
def alias_(self):
|
||||
"""RETURNS (unicode): ID of the original alias"""
|
||||
"""RETURNS (str): ID of the original alias"""
|
||||
return self.kb.vocab.strings[self.alias_hash]
|
||||
|
||||
@property
|
||||
|
|
|
@ -17,7 +17,8 @@ from .tokens.underscore import Underscore
|
|||
from .vocab import Vocab
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .lookups import Lookups
|
||||
from .analysis import analyze_pipes, analyze_all_pipes, validate_attrs
|
||||
from .pipe_analysis import analyze_pipes, analyze_all_pipes, validate_attrs
|
||||
from .pipe_analysis import count_pipeline_interdependencies
|
||||
from .gold import Example
|
||||
from .scorer import Scorer
|
||||
from .util import link_vectors_to_models, create_default_optimizer, registry
|
||||
|
@ -127,7 +128,7 @@ class Language(object):
|
|||
|
||||
Defaults (class): Settings, data and factory methods for creating the `nlp`
|
||||
object and processing pipeline.
|
||||
lang (unicode): Two-letter language ID, i.e. ISO code.
|
||||
lang (str): Two-letter language ID, i.e. ISO code.
|
||||
|
||||
DOCS: https://spacy.io/api/language
|
||||
"""
|
||||
|
@ -196,13 +197,14 @@ class Language(object):
|
|||
|
||||
@property
|
||||
def meta(self):
|
||||
spacy_version = util.get_model_version_range(about.__version__)
|
||||
if self.vocab.lang:
|
||||
self._meta.setdefault("lang", self.vocab.lang)
|
||||
else:
|
||||
self._meta.setdefault("lang", self.lang)
|
||||
self._meta.setdefault("name", "model")
|
||||
self._meta.setdefault("version", "0.0.0")
|
||||
self._meta.setdefault("spacy_version", f">={about.__version__}")
|
||||
self._meta.setdefault("spacy_version", spacy_version)
|
||||
self._meta.setdefault("description", "")
|
||||
self._meta.setdefault("author", "")
|
||||
self._meta.setdefault("email", "")
|
||||
|
@ -292,7 +294,7 @@ class Language(object):
|
|||
def get_pipe(self, name):
|
||||
"""Get a pipeline component for a given component name.
|
||||
|
||||
name (unicode): Name of pipeline component to get.
|
||||
name (str): Name of pipeline component to get.
|
||||
RETURNS (callable): The pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#get_pipe
|
||||
|
@ -305,7 +307,7 @@ class Language(object):
|
|||
def create_pipe(self, name, config=dict()):
|
||||
"""Create a pipeline component from a factory.
|
||||
|
||||
name (unicode): Factory name to look up in `Language.factories`.
|
||||
name (str): Factory name to look up in `Language.factories`.
|
||||
config (dict): Configuration parameters to initialise component.
|
||||
RETURNS (callable): Pipeline component.
|
||||
|
||||
|
@ -348,12 +350,12 @@ class Language(object):
|
|||
of before/after/first/last can be set. Default behaviour is "last".
|
||||
|
||||
component (callable): The pipeline component.
|
||||
name (unicode): Name of pipeline component. Overwrites existing
|
||||
name (str): Name of pipeline component. Overwrites existing
|
||||
component.name attribute if available. If no name is set and
|
||||
the component exposes no name attribute, component.__name__ is
|
||||
used. An error is raised if a name already exists in the pipeline.
|
||||
before (unicode): Component name to insert component directly before.
|
||||
after (unicode): Component name to insert component directly after.
|
||||
before (str): Component name to insert component directly before.
|
||||
after (str): Component name to insert component directly after.
|
||||
first (bool): Insert component first / not first in the pipeline.
|
||||
last (bool): Insert component last / not last in the pipeline.
|
||||
|
||||
|
@ -394,7 +396,7 @@ class Language(object):
|
|||
"""Check if a component name is present in the pipeline. Equivalent to
|
||||
`name in nlp.pipe_names`.
|
||||
|
||||
name (unicode): Name of the component.
|
||||
name (str): Name of the component.
|
||||
RETURNS (bool): Whether a component of the name exists in the pipeline.
|
||||
|
||||
DOCS: https://spacy.io/api/language#has_pipe
|
||||
|
@ -404,7 +406,7 @@ class Language(object):
|
|||
def replace_pipe(self, name, component):
|
||||
"""Replace a component in the pipeline.
|
||||
|
||||
name (unicode): Name of the component to replace.
|
||||
name (str): Name of the component to replace.
|
||||
component (callable): Pipeline component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#replace_pipe
|
||||
|
@ -423,8 +425,8 @@ class Language(object):
|
|||
def rename_pipe(self, old_name, new_name):
|
||||
"""Rename a pipeline component.
|
||||
|
||||
old_name (unicode): Name of the component to rename.
|
||||
new_name (unicode): New name of the component.
|
||||
old_name (str): Name of the component to rename.
|
||||
new_name (str): New name of the component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#rename_pipe
|
||||
"""
|
||||
|
@ -438,7 +440,7 @@ class Language(object):
|
|||
def remove_pipe(self, name):
|
||||
"""Remove a component from the pipeline.
|
||||
|
||||
name (unicode): Name of the component to remove.
|
||||
name (str): Name of the component to remove.
|
||||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||
|
||||
DOCS: https://spacy.io/api/language#remove_pipe
|
||||
|
@ -455,7 +457,7 @@ class Language(object):
|
|||
and can contain arbitrary whitespace. Alignment into the original string
|
||||
is preserved.
|
||||
|
||||
text (unicode): The text to be processed.
|
||||
text (str): The text to be processed.
|
||||
disable (list): Names of the pipeline components to disable.
|
||||
component_cfg (dict): An optional dictionary with extra keyword arguments
|
||||
for specific components.
|
||||
|
@ -564,13 +566,14 @@ class Language(object):
|
|||
|
||||
if component_cfg is None:
|
||||
component_cfg = {}
|
||||
component_deps = count_pipeline_interdependencies(self.pipeline)
|
||||
# Determine whether component should set annotations. In theory I guess
|
||||
# we should do this by inspecting the meta? Or we could just always
|
||||
# say "yes"
|
||||
for name, proc in self.pipeline:
|
||||
for i, (name, proc) in enumerate(self.pipeline):
|
||||
component_cfg.setdefault(name, {})
|
||||
component_cfg[name].setdefault("drop", drop)
|
||||
component_cfg[name].setdefault("set_annotations", False)
|
||||
component_cfg[name]["set_annotations"] = bool(component_deps[i])
|
||||
for name, proc in self.pipeline:
|
||||
if not hasattr(proc, "update"):
|
||||
continue
|
||||
|
@ -938,7 +941,7 @@ class Language(object):
|
|||
"""Save the current state to a directory. If a model is loaded, this
|
||||
will include the model.
|
||||
|
||||
path (unicode or Path): Path to a directory, which will be created if
|
||||
path (str / Path): Path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
|
||||
|
@ -972,7 +975,7 @@ class Language(object):
|
|||
returns it. If the saved `Language` object contains a model, the
|
||||
model will be loaded.
|
||||
|
||||
path (unicode or Path): A path to a directory.
|
||||
path (str / Path): A path to a directory.
|
||||
exclude (list): Names of components or serialization fields to exclude.
|
||||
RETURNS (Language): The modified `Language` object.
|
||||
|
||||
|
@ -1090,7 +1093,7 @@ class component(object):
|
|||
):
|
||||
"""Decorate a pipeline component.
|
||||
|
||||
name (unicode): Default component and factory name.
|
||||
name (str): Default component and factory name.
|
||||
assigns (list): Attributes assigned by component, e.g. `["token.pos"]`.
|
||||
requires (list): Attributes required by component, e.g. `["token.dep"]`.
|
||||
retokenizes (bool): Whether the component changes the tokenization.
|
||||
|
|
|
@ -30,8 +30,8 @@ class Lemmatizer(object):
|
|||
def __call__(self, string, univ_pos, morphology=None):
|
||||
"""Lemmatize a string.
|
||||
|
||||
string (unicode): The string to lemmatize, e.g. the token text.
|
||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||
string (str): The string to lemmatize, e.g. the token text.
|
||||
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
RETURNS (list): The available lemmas for the string.
|
||||
|
@ -69,7 +69,7 @@ class Lemmatizer(object):
|
|||
Check whether we're dealing with an uninflected paradigm, so we can
|
||||
avoid lemmatization entirely.
|
||||
|
||||
univ_pos (unicode / int): The token's universal part-of-speech tag.
|
||||
univ_pos (str / int): The token's universal part-of-speech tag.
|
||||
morphology (dict): The token's morphological features following the
|
||||
Universal Dependencies scheme.
|
||||
"""
|
||||
|
@ -126,10 +126,10 @@ class Lemmatizer(object):
|
|||
"""Look up a lemma in the table, if available. If no lemma is found,
|
||||
the original string is returned.
|
||||
|
||||
string (unicode): The original string.
|
||||
string (str): The original string.
|
||||
orth (int): Optional hash of the string to look up. If not set, the
|
||||
string will be used and hashed.
|
||||
RETURNS (unicode): The lemma if the string was found, otherwise the
|
||||
RETURNS (str): The lemma if the string was found, otherwise the
|
||||
original string.
|
||||
"""
|
||||
lookup_table = self.lookups.get_table("lemma_lookup", {})
|
||||
|
|
|
@ -164,7 +164,7 @@ cdef class Lexeme:
|
|||
self.vocab.set_vector(self.c.orth, vector)
|
||||
|
||||
property rank:
|
||||
"""RETURNS (unicode): Sequential ID of the lexemes's lexical type, used
|
||||
"""RETURNS (str): Sequential ID of the lexemes's lexical type, used
|
||||
to index into tables, e.g. for word vectors."""
|
||||
def __get__(self):
|
||||
return self.c.id
|
||||
|
@ -187,18 +187,18 @@ cdef class Lexeme:
|
|||
|
||||
@property
|
||||
def orth_(self):
|
||||
"""RETURNS (unicode): The original verbatim text of the lexeme
|
||||
"""RETURNS (str): The original verbatim text of the lexeme
|
||||
(identical to `Lexeme.text`). Exists mostly for consistency with
|
||||
the other attributes."""
|
||||
return self.vocab.strings[self.c.orth]
|
||||
|
||||
@property
|
||||
def text(self):
|
||||
"""RETURNS (unicode): The original verbatim text of the lexeme."""
|
||||
"""RETURNS (str): The original verbatim text of the lexeme."""
|
||||
return self.orth_
|
||||
|
||||
property lower:
|
||||
"""RETURNS (unicode): Lowercase form of the lexeme."""
|
||||
"""RETURNS (str): Lowercase form of the lexeme."""
|
||||
def __get__(self):
|
||||
return self.c.lower
|
||||
|
||||
|
@ -281,7 +281,7 @@ cdef class Lexeme:
|
|||
prob_table[self.c.orth] = x
|
||||
|
||||
property lower_:
|
||||
"""RETURNS (unicode): Lowercase form of the word."""
|
||||
"""RETURNS (str): Lowercase form of the word."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lower]
|
||||
|
||||
|
@ -289,7 +289,7 @@ cdef class Lexeme:
|
|||
self.c.lower = self.vocab.strings.add(x)
|
||||
|
||||
property norm_:
|
||||
"""RETURNS (unicode): The lexemes's norm, i.e. a normalised form of the
|
||||
"""RETURNS (str): The lexemes's norm, i.e. a normalised form of the
|
||||
lexeme text.
|
||||
"""
|
||||
def __get__(self):
|
||||
|
@ -299,7 +299,7 @@ cdef class Lexeme:
|
|||
self.norm = self.vocab.strings.add(x)
|
||||
|
||||
property shape_:
|
||||
"""RETURNS (unicode): Transform of the word's string, to show
|
||||
"""RETURNS (str): Transform of the word's string, to show
|
||||
orthographic features.
|
||||
"""
|
||||
def __get__(self):
|
||||
|
@ -309,7 +309,7 @@ cdef class Lexeme:
|
|||
self.c.shape = self.vocab.strings.add(x)
|
||||
|
||||
property prefix_:
|
||||
"""RETURNS (unicode): Length-N substring from the start of the word.
|
||||
"""RETURNS (str): Length-N substring from the start of the word.
|
||||
Defaults to `N=1`.
|
||||
"""
|
||||
def __get__(self):
|
||||
|
@ -319,7 +319,7 @@ cdef class Lexeme:
|
|||
self.c.prefix = self.vocab.strings.add(x)
|
||||
|
||||
property suffix_:
|
||||
"""RETURNS (unicode): Length-N substring from the end of the word.
|
||||
"""RETURNS (str): Length-N substring from the end of the word.
|
||||
Defaults to `N=3`.
|
||||
"""
|
||||
def __get__(self):
|
||||
|
@ -329,7 +329,7 @@ cdef class Lexeme:
|
|||
self.c.suffix = self.vocab.strings.add(x)
|
||||
|
||||
property lang_:
|
||||
"""RETURNS (unicode): Language of the parent vocabulary."""
|
||||
"""RETURNS (str): Language of the parent vocabulary."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.lang]
|
||||
|
||||
|
|
|
@ -31,7 +31,7 @@ class Lookups(object):
|
|||
"""Check if the lookups contain a table of a given name. Delegates to
|
||||
Lookups.has_table.
|
||||
|
||||
name (unicode): Name of the table.
|
||||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name is in the lookups.
|
||||
"""
|
||||
return self.has_table(name)
|
||||
|
@ -48,7 +48,7 @@ class Lookups(object):
|
|||
def add_table(self, name, data=SimpleFrozenDict()):
|
||||
"""Add a new table to the lookups. Raises an error if the table exists.
|
||||
|
||||
name (unicode): Unique name of table.
|
||||
name (str): Unique name of table.
|
||||
data (dict): Optional data to add to the table.
|
||||
RETURNS (Table): The newly added table.
|
||||
|
||||
|
@ -64,7 +64,7 @@ class Lookups(object):
|
|||
"""Get a table. Raises an error if the table doesn't exist and no
|
||||
default value is provided.
|
||||
|
||||
name (unicode): Name of the table.
|
||||
name (str): Name of the table.
|
||||
default: Optional default value to return if table doesn't exist.
|
||||
RETURNS (Table): The table.
|
||||
|
||||
|
@ -79,7 +79,7 @@ class Lookups(object):
|
|||
def remove_table(self, name):
|
||||
"""Remove a table. Raises an error if the table doesn't exist.
|
||||
|
||||
name (unicode): Name of the table to remove.
|
||||
name (str): Name of the table to remove.
|
||||
RETURNS (Table): The removed table.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#remove_table
|
||||
|
@ -91,7 +91,7 @@ class Lookups(object):
|
|||
def has_table(self, name):
|
||||
"""Check if the lookups contain a table of a given name.
|
||||
|
||||
name (unicode): Name of the table.
|
||||
name (str): Name of the table.
|
||||
RETURNS (bool): Whether a table of that name exists.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#has_table
|
||||
|
@ -125,7 +125,7 @@ class Lookups(object):
|
|||
"""Save the lookups to a directory as lookups.bin. Expects a path to a
|
||||
directory, which will be created if it doesn't exist.
|
||||
|
||||
path (unicode / Path): The file path.
|
||||
path (str / Path): The file path.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#to_disk
|
||||
"""
|
||||
|
@ -141,7 +141,7 @@ class Lookups(object):
|
|||
"""Load lookups from a directory containing a lookups.bin. Will skip
|
||||
loading if the file doesn't exist.
|
||||
|
||||
path (unicode / Path): The directory path.
|
||||
path (str / Path): The directory path.
|
||||
RETURNS (Lookups): The loaded lookups.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#from_disk
|
||||
|
@ -167,7 +167,7 @@ class Table(OrderedDict):
|
|||
"""Initialize a new table from a dict.
|
||||
|
||||
data (dict): The dictionary.
|
||||
name (unicode): Optional table name for reference.
|
||||
name (str): Optional table name for reference.
|
||||
RETURNS (Table): The newly created object.
|
||||
|
||||
DOCS: https://spacy.io/api/lookups#table.from_dict
|
||||
|
@ -179,7 +179,7 @@ class Table(OrderedDict):
|
|||
def __init__(self, name=None, data=None):
|
||||
"""Initialize a new table.
|
||||
|
||||
name (unicode): Optional table name for reference.
|
||||
name (str): Optional table name for reference.
|
||||
data (dict): Initial data, used to hint Bloom Filter.
|
||||
RETURNS (Table): The newly created object.
|
||||
|
||||
|
@ -197,7 +197,7 @@ class Table(OrderedDict):
|
|||
def __setitem__(self, key, value):
|
||||
"""Set new key/value pair. String keys will be hashed.
|
||||
|
||||
key (unicode / int): The key to set.
|
||||
key (str / int): The key to set.
|
||||
value: The value to set.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
|
@ -208,7 +208,7 @@ class Table(OrderedDict):
|
|||
"""Set new key/value pair. String keys will be hashed.
|
||||
Same as table[key] = value.
|
||||
|
||||
key (unicode / int): The key to set.
|
||||
key (str / int): The key to set.
|
||||
value: The value to set.
|
||||
"""
|
||||
self[key] = value
|
||||
|
@ -216,7 +216,7 @@ class Table(OrderedDict):
|
|||
def __getitem__(self, key):
|
||||
"""Get the value for a given key. String keys will be hashed.
|
||||
|
||||
key (unicode / int): The key to get.
|
||||
key (str / int): The key to get.
|
||||
RETURNS: The value.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
|
@ -225,7 +225,7 @@ class Table(OrderedDict):
|
|||
def get(self, key, default=None):
|
||||
"""Get the value for a given key. String keys will be hashed.
|
||||
|
||||
key (unicode / int): The key to get.
|
||||
key (str / int): The key to get.
|
||||
default: The default value to return.
|
||||
RETURNS: The value.
|
||||
"""
|
||||
|
@ -235,7 +235,7 @@ class Table(OrderedDict):
|
|||
def __contains__(self, key):
|
||||
"""Check whether a key is in the table. String keys will be hashed.
|
||||
|
||||
key (unicode / int): The key to check.
|
||||
key (str / int): The key to check.
|
||||
RETURNS (bool): Whether the key is in the table.
|
||||
"""
|
||||
key = get_string_id(key)
|
||||
|
|
|
@ -66,7 +66,7 @@ cdef class DependencyMatcher:
|
|||
def __contains__(self, key):
|
||||
"""Check whether the matcher contains rules for a match ID.
|
||||
|
||||
key (unicode): The match ID.
|
||||
key (str): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
return self._normalize_key(key) in self._patterns
|
||||
|
@ -194,7 +194,7 @@ cdef class DependencyMatcher:
|
|||
def get(self, key, default=None):
|
||||
"""Retrieve the pattern stored for a key.
|
||||
|
||||
key (unicode or int): The key to retrieve.
|
||||
key (str / int): The key to retrieve.
|
||||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||
"""
|
||||
key = self._normalize_key(key)
|
||||
|
|
|
@ -64,7 +64,7 @@ cdef class Matcher:
|
|||
def __contains__(self, key):
|
||||
"""Check whether the matcher contains rules for a match ID.
|
||||
|
||||
key (unicode): The match ID.
|
||||
key (str): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
"""
|
||||
return self._normalize_key(key) in self._patterns
|
||||
|
@ -98,7 +98,7 @@ cdef class Matcher:
|
|||
number of arguments). The on_match callback becomes an optional keyword
|
||||
argument.
|
||||
|
||||
key (unicode): The match ID.
|
||||
key (str): The match ID.
|
||||
patterns (list): The patterns to add for the given key.
|
||||
on_match (callable): Optional callback executed on match.
|
||||
*_patterns (list): For backwards compatibility: list of patterns to add
|
||||
|
@ -139,7 +139,7 @@ cdef class Matcher:
|
|||
"""Remove a rule from the matcher. A KeyError is raised if the key does
|
||||
not exist.
|
||||
|
||||
key (unicode): The ID of the match rule.
|
||||
key (str): The ID of the match rule.
|
||||
"""
|
||||
norm_key = self._normalize_key(key)
|
||||
if not norm_key in self._patterns:
|
||||
|
@ -166,7 +166,7 @@ cdef class Matcher:
|
|||
def get(self, key, default=None):
|
||||
"""Retrieve the pattern stored for a key.
|
||||
|
||||
key (unicode or int): The key to retrieve.
|
||||
key (str / int): The key to retrieve.
|
||||
RETURNS (tuple): The rule, as an (on_match, patterns) tuple.
|
||||
"""
|
||||
key = self._normalize_key(key)
|
||||
|
|
|
@ -30,7 +30,7 @@ cdef class PhraseMatcher:
|
|||
"""Initialize the PhraseMatcher.
|
||||
|
||||
vocab (Vocab): The shared vocabulary.
|
||||
attr (int / unicode): Token attribute to match on.
|
||||
attr (int / str): Token attribute to match on.
|
||||
validate (bool): Perform additional validation when patterns are added.
|
||||
RETURNS (PhraseMatcher): The newly constructed object.
|
||||
|
||||
|
@ -70,7 +70,7 @@ cdef class PhraseMatcher:
|
|||
def __contains__(self, key):
|
||||
"""Check whether the matcher contains rules for a match ID.
|
||||
|
||||
key (unicode): The match ID.
|
||||
key (str): The match ID.
|
||||
RETURNS (bool): Whether the matcher contains rules for this match ID.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#contains
|
||||
|
@ -85,7 +85,7 @@ cdef class PhraseMatcher:
|
|||
"""Remove a rule from the matcher by match ID. A KeyError is raised if
|
||||
the key does not exist.
|
||||
|
||||
key (unicode): The match ID.
|
||||
key (str): The match ID.
|
||||
|
||||
DOCS: https://spacy.io/api/phrasematcher#remove
|
||||
"""
|
||||
|
@ -159,7 +159,7 @@ cdef class PhraseMatcher:
|
|||
number of arguments). The on_match callback becomes an optional keyword
|
||||
argument.
|
||||
|
||||
key (unicode): The match ID.
|
||||
key (str): The match ID.
|
||||
docs (list): List of `Doc` objects representing match patterns.
|
||||
on_match (callable): Callback executed on match.
|
||||
*_docs (Doc): For backwards compatibility: list of patterns to add
|
||||
|
|
|
@ -15,10 +15,10 @@ def build_tb_parser_model(
|
|||
use_upper=True,
|
||||
nO=None,
|
||||
):
|
||||
token_vector_width = tok2vec.get_dim("nO")
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
tok2vec = chain(
|
||||
tok2vec,
|
||||
with_array(Linear(hidden_width, token_vector_width)),
|
||||
with_array(Linear(hidden_width, t2v_width)),
|
||||
list2array(),
|
||||
)
|
||||
tok2vec.set_dim("nO", hidden_width)
|
||||
|
|
|
@ -6,9 +6,9 @@ from ...util import registry
|
|||
|
||||
@registry.architectures.register("spacy.Tagger.v1")
|
||||
def build_tagger_model(tok2vec, nO=None) -> Model:
|
||||
token_vector_width = tok2vec.get_dim("nO")
|
||||
# TODO: glorot_uniform_init seems to work a bit better than zero_init here?!
|
||||
output_layer = Softmax(nO, nI=token_vector_width, init_W=zero_init)
|
||||
t2v_width = tok2vec.get_dim("nO") if tok2vec.has_dim("nO") else None
|
||||
output_layer = Softmax(nO, t2v_width, init_W=zero_init)
|
||||
softmax = with_array(output_layer)
|
||||
model = chain(tok2vec, softmax)
|
||||
model.set_ref("tok2vec", tok2vec)
|
||||
|
|
|
@ -38,8 +38,8 @@ def forward(model, X, is_train):
|
|||
|
||||
|
||||
def init(model, X=None, Y=None):
|
||||
tok2vec = model.get_ref("tok2vec").initialize()
|
||||
lower = model.get_ref("lower").initialize(X=X)
|
||||
tok2vec = model.get_ref("tok2vec").initialize(X=X)
|
||||
lower = model.get_ref("lower").initialize()
|
||||
if model.attrs["has_upper"]:
|
||||
statevecs = model.ops.alloc2f(2, lower.get_dim("nO"))
|
||||
model.get_ref("upper").initialize(X=statevecs)
|
||||
|
|
|
@ -198,8 +198,8 @@ cdef class Morphology:
|
|||
"""Add a special-case rule to the morphological analyser. Tokens whose
|
||||
tag and orth match the rule will receive the specified properties.
|
||||
|
||||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
tag (str): The part-of-speech tag to key the exception.
|
||||
orth (str): The word-form to key the exception.
|
||||
"""
|
||||
attrs = dict(attrs)
|
||||
attrs = _normalize_props(attrs)
|
||||
|
|
|
@ -11,7 +11,7 @@ def analyze_pipes(pipeline, name, pipe, index, warn=True):
|
|||
fulfilled (e.g. if previous components assign the attributes).
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
name (unicode): The name of the pipeline component to analyze.
|
||||
name (str): The name of the pipeline component to analyze.
|
||||
pipe (callable): The pipeline component function to analyze.
|
||||
index (int): The index of the component in the pipeline.
|
||||
warn (bool): Show user warning if problem is found.
|
||||
|
@ -125,7 +125,7 @@ def get_assigns_for_attr(pipeline, attr):
|
|||
"""Get all pipeline components that assign an attr, e.g. "doc.tensor".
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
attr (unicode): The attribute to check.
|
||||
attr (str): The attribute to check.
|
||||
RETURNS (list): (name, pipeline) tuples of components that assign the attr.
|
||||
"""
|
||||
return _get_feature_for_attr(pipeline, attr, "assigns")
|
||||
|
@ -135,7 +135,7 @@ def get_requires_for_attr(pipeline, attr):
|
|||
"""Get all pipeline components that require an attr, e.g. "doc.tensor".
|
||||
|
||||
pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
|
||||
attr (unicode): The attribute to check.
|
||||
attr (str): The attribute to check.
|
||||
RETURNS (list): (name, pipeline) tuples of components that require the attr.
|
||||
"""
|
||||
return _get_feature_for_attr(pipeline, attr, "requires")
|
||||
|
@ -173,3 +173,22 @@ def print_summary(nlp, pretty=True, no_print=False):
|
|||
msg.good("No problems found.")
|
||||
if no_print:
|
||||
return {"overview": overview, "problems": problems}
|
||||
|
||||
|
||||
def count_pipeline_interdependencies(pipeline):
    """Count, for every component, how many later components depend on it.

    A later component depends on an earlier one when at least one attribute
    listed in the earlier component's `assigns` also appears in the later
    component's `requires`. A component without an `assigns` or `requires`
    attribute is treated as assigning / requiring nothing.

    pipeline (list): A list of (name, pipe) tuples e.g. nlp.pipeline.
    RETURNS (list): One integer per component, in pipeline order.
    """
    assigned = [set(getattr(pipe, "assigns", [])) for _, pipe in pipeline]
    required = [set(getattr(pipe, "requires", [])) for _, pipe in pipeline]
    # Only components positioned after `pos` can consume what it assigns.
    return [
        sum(1 for needed in required[pos + 1 :] if provided & needed)
        for pos, provided in enumerate(assigned)
    ]
|
|
@ -30,7 +30,7 @@ class EntityRuler(object):
|
|||
|
||||
nlp (Language): The shared nlp object to pass the vocab to the matchers
|
||||
and process phrase patterns.
|
||||
phrase_matcher_attr (int / unicode): Token attribute to match on, passed
|
||||
phrase_matcher_attr (int / str): Token attribute to match on, passed
|
||||
to the internal PhraseMatcher as `attr`
|
||||
validate (bool): Whether patterns should be validated, passed to
|
||||
Matcher and PhraseMatcher as `validate`
|
||||
|
@ -315,7 +315,7 @@ class EntityRuler(object):
|
|||
"""Load the entity ruler from a file. Expects a file containing
|
||||
newline-delimited JSON (JSONL) with one entry per line.
|
||||
|
||||
path (unicode / Path): The JSONL file to load.
|
||||
path (str / Path): The JSONL file to load.
|
||||
**kwargs: Other config paramters, mostly for consistency.
|
||||
|
||||
RETURNS (EntityRuler): The loaded entity ruler.
|
||||
|
@ -351,7 +351,7 @@ class EntityRuler(object):
|
|||
"""Save the entity ruler patterns to a directory. The patterns will be
|
||||
saved as newline-delimited JSON (JSONL).
|
||||
|
||||
path (unicode / Path): The JSONL file to save.
|
||||
path (str / Path): The JSONL file to save.
|
||||
**kwargs: Other config paramters, mostly for consistency.
|
||||
|
||||
DOCS: https://spacy.io/api/entityruler#to_disk
|
||||
|
|
|
@ -50,7 +50,7 @@ def merge_subtokens(doc, label="subtok"):
|
|||
"""Merge subtokens into a single token.
|
||||
|
||||
doc (Doc): The Doc object.
|
||||
label (unicode): The subtoken dependency label.
|
||||
label (str): The subtoken dependency label.
|
||||
RETURNS (Doc): The Doc object with merged subtokens.
|
||||
|
||||
DOCS: https://spacy.io/api/pipeline-functions#merge_subtokens
|
||||
|
|
|
@ -531,7 +531,16 @@ class Tagger(Pipe):
|
|||
vocab.morphology.lemmatizer,
|
||||
exc=vocab.morphology.exc)
|
||||
self.set_output(len(self.labels))
|
||||
self.model.initialize()
|
||||
doc_sample = [Doc(self.vocab, words=["hello", "world"])]
|
||||
if pipeline is not None:
|
||||
for name, component in pipeline:
|
||||
if component is self:
|
||||
break
|
||||
if hasattr(component, "pipe"):
|
||||
doc_sample = list(component.pipe(doc_sample))
|
||||
else:
|
||||
doc_sample = [component(doc) for doc in doc_sample]
|
||||
self.model.initialize(X=doc_sample)
|
||||
# Get batch of example docs, example outputs to call begin_training().
|
||||
# This lets the model infer shapes.
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
|
|
@ -109,7 +109,7 @@ cdef class StringStore:
|
|||
"""Retrieve a string from a given hash, or vice versa.
|
||||
|
||||
string_or_id (bytes, unicode or uint64): The value to encode.
|
||||
Returns (unicode or uint64): The value to be retrieved.
|
||||
Returns (str / uint64): The value to be retrieved.
|
||||
"""
|
||||
if isinstance(string_or_id, basestring) and len(string_or_id) == 0:
|
||||
return 0
|
||||
|
@ -152,7 +152,7 @@ cdef class StringStore:
|
|||
def add(self, string):
|
||||
"""Add a string to the StringStore.
|
||||
|
||||
string (unicode): The string to add.
|
||||
string (str): The string to add.
|
||||
RETURNS (uint64): The string's hash value.
|
||||
"""
|
||||
if isinstance(string, unicode):
|
||||
|
@ -179,7 +179,7 @@ cdef class StringStore:
|
|||
def __contains__(self, string not None):
|
||||
"""Check whether a string is in the store.
|
||||
|
||||
string (unicode): The string to check.
|
||||
string (str): The string to check.
|
||||
RETURNS (bool): Whether the store contains the string.
|
||||
"""
|
||||
cdef hash_t key
|
||||
|
@ -205,7 +205,7 @@ cdef class StringStore:
|
|||
def __iter__(self):
|
||||
"""Iterate over the strings in the store, in order.
|
||||
|
||||
YIELDS (unicode): A string in the store.
|
||||
YIELDS (str): A string in the store.
|
||||
"""
|
||||
cdef int i
|
||||
cdef hash_t key
|
||||
|
@ -223,7 +223,7 @@ cdef class StringStore:
|
|||
def to_disk(self, path):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
path (str / Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
"""
|
||||
path = util.ensure_path(path)
|
||||
|
@ -234,7 +234,7 @@ cdef class StringStore:
|
|||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
path (str / Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
RETURNS (StringStore): The modified `StringStore` object.
|
||||
"""
|
||||
|
|
|
@ -624,12 +624,25 @@ cdef class Parser:
|
|||
sgd = self.create_optimizer()
|
||||
doc_sample = []
|
||||
gold_sample = []
|
||||
for example in islice(get_examples(), 1000):
|
||||
for example in islice(get_examples(), 10):
|
||||
parses = example.get_gold_parses(merge=False, vocab=self.vocab)
|
||||
for doc, gold in parses:
|
||||
doc_sample.append(doc)
|
||||
gold_sample.append(gold)
|
||||
self.model.initialize(doc_sample, gold_sample)
|
||||
if len(doc):
|
||||
doc_sample.append(doc)
|
||||
gold_sample.append(gold)
|
||||
|
||||
if pipeline is not None:
|
||||
for name, component in pipeline:
|
||||
if component is self:
|
||||
break
|
||||
if hasattr(component, "pipe"):
|
||||
doc_sample = list(component.pipe(doc_sample))
|
||||
else:
|
||||
doc_sample = [component(doc) for doc in doc_sample]
|
||||
if doc_sample:
|
||||
self.model.initialize(doc_sample)
|
||||
else:
|
||||
self.model.initialize()
|
||||
if pipeline is not None:
|
||||
self.init_multitask_objectives(get_examples, pipeline, sgd=sgd, **self.cfg)
|
||||
link_vectors_to_models(self.vocab)
|
||||
|
|
|
@ -9,7 +9,6 @@ def test_build_dependencies():
|
|||
"pytest-timeout",
|
||||
"mock",
|
||||
"flake8",
|
||||
"jsonschema",
|
||||
]
|
||||
libs_ignore_setup = ["fugashi", "natto-py", "pythainlp"]
|
||||
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
import spacy.language
|
||||
from spacy.language import Language, component
|
||||
from spacy.analysis import print_summary, validate_attrs
|
||||
from spacy.analysis import get_assigns_for_attr, get_requires_for_attr
|
||||
from spacy.pipe_analysis import print_summary, validate_attrs
|
||||
from spacy.pipe_analysis import get_assigns_for_attr, get_requires_for_attr
|
||||
from spacy.pipe_analysis import count_pipeline_interdependencies
|
||||
from mock import Mock, ANY
|
||||
import pytest
|
||||
|
||||
|
@ -161,3 +162,19 @@ def test_analysis_validate_attrs_remove_pipe():
|
|||
with pytest.warns(None) as record:
|
||||
nlp.remove_pipe("c2")
|
||||
assert not record.list
|
||||
|
||||
|
||||
def test_pipe_interdependencies():
    """The assigning component is counted once; the consumer has no dependents."""

    class Fancifier:
        name = "fancifier"
        assigns = ("doc._.fancy",)
        requires = ()

    class FancyNeeder:
        name = "needer"
        assigns = ()
        requires = ("doc._.fancy",)

    # The assigner comes first, so the needer that follows it is its one dependent.
    components = [Fancifier(), FancyNeeder()]
    pipeline = [(component.name, component) for component in components]
    assert count_pipeline_interdependencies(pipeline) == [1, 0]
|
||||
|
|
|
@ -2,9 +2,11 @@ import pytest
|
|||
import os
|
||||
import ctypes
|
||||
from pathlib import Path
|
||||
from spacy.about import __version__ as spacy_version
|
||||
from spacy import util
|
||||
from spacy import prefer_gpu, require_gpu
|
||||
from spacy.ml._precomputable_affine import PrecomputableAffine, _backprop_precomputable_affine_padding
|
||||
from spacy.ml._precomputable_affine import PrecomputableAffine
|
||||
from spacy.ml._precomputable_affine import _backprop_precomputable_affine_padding
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
|
@ -24,10 +26,12 @@ def test_util_ensure_path_succeeds(text):
|
|||
assert isinstance(path, Path)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("package", ["numpy"])
|
||||
def test_util_is_package(package):
|
||||
@pytest.mark.parametrize(
|
||||
"package,result", [("numpy", True), ("sfkodskfosdkfpsdpofkspdof", False)]
|
||||
)
|
||||
def test_util_is_package(package, result):
|
||||
"""Test that an installed package via pip is recognised by util.is_package."""
|
||||
assert util.is_package(package)
|
||||
assert util.is_package(package) is result
|
||||
|
||||
|
||||
@pytest.mark.parametrize("package", ["thinc"])
|
||||
|
@ -87,3 +91,21 @@ def test_ascii_filenames():
|
|||
root = Path(__file__).parent.parent
|
||||
for path in root.glob("**/*"):
|
||||
assert all(ord(c) < 128 for c in path.name), path.name
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("version", "constraint", "compatible"),
    [
        (spacy_version, spacy_version, True),
        (spacy_version, f">={spacy_version}", True),
        ("3.0.0", "2.0.0", False),
        ("3.2.1", ">=2.0.0", True),
        ("2.2.10a1", ">=1.0.0,<2.1.1", False),
        ("3.0.0.dev3", ">=1.2.3,<4.5.6", True),
        # The remaining cases expect None whenever either side is "n/a".
        ("n/a", ">=1.2.3,<4.5.6", None),
        ("1.2.3", "n/a", None),
        ("n/a", "n/a", None),
    ],
)
def test_is_compatible_version(version, constraint, compatible):
    """Check util.is_compatible_version yields exactly True, False or None."""
    outcome = util.is_compatible_version(version, constraint)
    # Identity comparison keeps False and None from being conflated.
    assert outcome is compatible
|
||||
|
|
59
spacy/tests/test_util.py
Normal file
59
spacy/tests/test_util.py
Normal file
|
@ -0,0 +1,59 @@
|
|||
import pytest
|
||||
from spacy.gold import Example
|
||||
|
||||
from .util import get_random_doc
|
||||
|
||||
from spacy.util import minibatch_by_words
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("doc_sizes", "expected_batches"),
    [
        ([400, 400, 199], [3]),
        ([400, 400, 199, 3], [4]),
        ([400, 400, 199, 3, 200], [3, 2]),
        ([400, 400, 199, 3, 1], [5]),
        ([400, 400, 199, 3, 1, 1500], [5]),  # 1500 will be discarded
        ([400, 400, 199, 3, 1, 200], [3, 3]),
        ([400, 400, 199, 3, 1, 999], [3, 3]),
        ([400, 400, 199, 3, 1, 999, 999], [3, 2, 1, 1]),
        ([1, 2, 999], [3]),
        ([1, 2, 999, 1], [4]),
        ([1, 200, 999, 1], [2, 2]),
        ([1, 999, 200, 1], [2, 2]),
    ],
)
def test_util_minibatch(doc_sizes, expected_batches):
    """Check batch lengths and the word ceiling with discard_oversize=True.

    doc_sizes (list): Number of words in each generated document.
    expected_batches (list): Expected number of examples in each batch.
    """
    batch_size = 1000
    tol = 0.2
    examples = [Example(doc=get_random_doc(n_words)) for n_words in doc_sizes]
    batches = list(
        minibatch_by_words(
            examples=examples, size=batch_size, tolerance=tol, discard_oversize=True
        )
    )
    assert [len(batch) for batch in batches] == expected_batches

    # No batch may reach the target size plus the tolerance margin.
    max_size = batch_size + batch_size * tol
    for batch in batches:
        n_words_in_batch = sum(len(example.doc) for example in batch)
        assert n_words_in_batch < max_size
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    ("doc_sizes", "expected_batches"),
    [
        ([400, 4000, 199], [1, 2]),
        ([400, 400, 199, 3000, 200], [1, 4]),
        ([400, 400, 199, 3, 1, 1500], [1, 5]),
        ([400, 400, 199, 3000, 2000, 200, 200], [1, 1, 3, 2]),
        ([1, 2, 9999], [1, 2]),
        ([2000, 1, 2000, 1, 1, 1, 2000], [1, 1, 1, 4]),
    ],
)
def test_util_minibatch_oversize(doc_sizes, expected_batches):
    """Test that oversized documents are returned in their own batch.

    doc_sizes (list): Number of words in each generated document.
    expected_batches (list): Expected number of examples in each batch.
    """
    batch_size = 1000
    tol = 0.2
    examples = [Example(doc=get_random_doc(n_words)) for n_words in doc_sizes]
    # discard_oversize=False: oversized docs are kept rather than dropped.
    batches = list(
        minibatch_by_words(
            examples=examples, size=batch_size, tolerance=tol, discard_oversize=False
        )
    )
    batch_lengths = [len(batch) for batch in batches]
    assert batch_lengths == expected_batches
|
||||
|
||||
|
|
@ -92,6 +92,13 @@ def get_batch(batch_size):
|
|||
return docs
|
||||
|
||||
|
||||
def get_random_doc(n_words):
    """Create a Doc with `n_words` tokens on a fresh Vocab.

    n_words (int): Number of words the document should contain.
    RETURNS (Doc): A Doc whose words are the strings "0" .. str(n_words - 1).
    """
    # Numeric words make each token easy to track by position.
    words = list(map(str, range(n_words)))
    return Doc(Vocab(), words=words)
|
||||
|
||||
|
||||
def apply_transition_sequence(parser, doc, sequence):
|
||||
"""Perform a series of pre-specified transitions, to put the parser in a
|
||||
desired state."""
|
||||
|
|
|
@ -134,7 +134,7 @@ cdef class Tokenizer:
|
|||
def __call__(self, unicode string):
|
||||
"""Tokenize a string.
|
||||
|
||||
string (unicode): The string to tokenize.
|
||||
string (str): The string to tokenize.
|
||||
RETURNS (Doc): A container for linguistic annotations.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#call
|
||||
|
@ -147,7 +147,7 @@ cdef class Tokenizer:
|
|||
cdef Doc _tokenize_affixes(self, unicode string, bint with_special_cases):
|
||||
"""Tokenize according to affix and token_match settings.
|
||||
|
||||
string (unicode): The string to tokenize.
|
||||
string (str): The string to tokenize.
|
||||
RETURNS (Doc): A container for linguistic annotations.
|
||||
"""
|
||||
if len(string) >= (2 ** 30):
|
||||
|
@ -527,7 +527,7 @@ cdef class Tokenizer:
|
|||
def find_infix(self, unicode string):
|
||||
"""Find internal split points of the string, such as hyphens.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
string (str): The string to segment.
|
||||
RETURNS (list): A list of `re.MatchObject` objects that have `.start()`
|
||||
and `.end()` methods, denoting the placement of internal segment
|
||||
separators, e.g. hyphens.
|
||||
|
@ -542,7 +542,7 @@ cdef class Tokenizer:
|
|||
"""Find the length of a prefix that should be segmented from the
|
||||
string, or None if no prefix rules match.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
string (str): The string to segment.
|
||||
RETURNS (int): The length of the prefix if present, otherwise `None`.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#find_prefix
|
||||
|
@ -556,7 +556,7 @@ cdef class Tokenizer:
|
|||
"""Find the length of a suffix that should be segmented from the
|
||||
string, or None if no suffix rules match.
|
||||
|
||||
string (unicode): The string to segment.
|
||||
string (str): The string to segment.
|
||||
RETURNS (int): The length of the suffix if present, otherwise `None`.
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#find_suffix
|
||||
|
@ -576,7 +576,7 @@ cdef class Tokenizer:
|
|||
def _validate_special_case(self, chunk, substrings):
|
||||
"""Check whether the `ORTH` fields match the string.
|
||||
|
||||
string (unicode): The string to specially tokenize.
|
||||
chunk (str): The string to specially tokenize.
|
||||
substrings (iterable): A sequence of dicts, where each dict describes
|
||||
a token and its attributes.
|
||||
"""
|
||||
|
@ -588,7 +588,7 @@ cdef class Tokenizer:
|
|||
def add_special_case(self, unicode string, substrings):
|
||||
"""Add a special-case tokenization rule.
|
||||
|
||||
string (unicode): The string to specially tokenize.
|
||||
string (str): The string to specially tokenize.
|
||||
substrings (iterable): A sequence of dicts, where each dict describes
|
||||
a token and its attributes. The `ORTH` fields of the attributes
|
||||
must exactly match the string when they are concatenated.
|
||||
|
@ -629,7 +629,7 @@ cdef class Tokenizer:
|
|||
produced are identical to `nlp.tokenizer()` except for whitespace
|
||||
tokens.
|
||||
|
||||
string (unicode): The string to tokenize.
|
||||
string (str): The string to tokenize.
|
||||
RETURNS (list): A list of (pattern_string, token_string) tuples
|
||||
|
||||
DOCS: https://spacy.io/api/tokenizer#explain
|
||||
|
@ -693,7 +693,7 @@ cdef class Tokenizer:
|
|||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
path (str / Path): A path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
|
||||
|
@ -707,7 +707,7 @@ cdef class Tokenizer:
|
|||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory.
|
||||
path (str / Path): A path to a directory.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Tokenizer): The modified `Tokenizer` object.
|
||||
|
||||
|
|
|
@ -117,7 +117,7 @@ cdef class Doc:
|
|||
def set_extension(cls, name, **kwargs):
|
||||
"""Define a custom attribute which becomes available as `Doc._`.
|
||||
|
||||
name (unicode): Name of the attribute to set.
|
||||
name (str): Name of the attribute to set.
|
||||
default: Optional default value of the attribute.
|
||||
getter (callable): Optional getter function.
|
||||
setter (callable): Optional setter function.
|
||||
|
@ -135,7 +135,7 @@ cdef class Doc:
|
|||
def get_extension(cls, name):
|
||||
"""Look up a previously registered extension by name.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#get_extension
|
||||
|
@ -146,7 +146,7 @@ cdef class Doc:
|
|||
def has_extension(cls, name):
|
||||
"""Check whether an extension has been registered.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (bool): Whether the extension has been registered.
|
||||
|
||||
DOCS: https://spacy.io/api/doc#has_extension
|
||||
|
@ -157,7 +157,7 @@ cdef class Doc:
|
|||
def remove_extension(cls, name):
|
||||
"""Remove a previously registered extension.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||
removed extension.
|
||||
|
||||
|
@ -483,7 +483,7 @@ cdef class Doc:
|
|||
def text(self):
|
||||
"""A unicode representation of the document text.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
RETURNS (str): The original verbatim text of the document.
|
||||
"""
|
||||
return "".join(t.text_with_ws for t in self)
|
||||
|
||||
|
@ -492,7 +492,7 @@ cdef class Doc:
|
|||
"""An alias of `Doc.text`, provided for duck-type compatibility with
|
||||
`Span` and `Token`.
|
||||
|
||||
RETURNS (unicode): The original verbatim text of the document.
|
||||
RETURNS (str): The original verbatim text of the document.
|
||||
"""
|
||||
return self.text
|
||||
|
||||
|
@ -637,7 +637,7 @@ cdef class Doc:
|
|||
|
||||
@property
|
||||
def lang_(self):
|
||||
"""RETURNS (unicode): Language of the doc's vocabulary, e.g. 'en'."""
|
||||
"""RETURNS (str): Language of the doc's vocabulary, e.g. 'en'."""
|
||||
return self.vocab.lang
|
||||
|
||||
cdef int push_back(self, LexemeOrToken lex_or_tok, bint has_space) except -1:
|
||||
|
@ -852,7 +852,7 @@ cdef class Doc:
|
|||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode or Path): A path to a directory, which will be created if
|
||||
path (str / Path): A path to a directory, which will be created if
|
||||
it doesn't exist. Paths may be either strings or Path-like objects.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
|
||||
|
@ -866,7 +866,7 @@ cdef class Doc:
|
|||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode or Path): A path to a directory. Paths may be either
|
||||
path (str / Path): A path to a directory. Paths may be either
|
||||
strings or `Path`-like objects.
|
||||
exclude (list): String names of serialization fields to exclude.
|
||||
RETURNS (Doc): The modified `Doc` object.
|
||||
|
|
|
@ -33,7 +33,7 @@ cdef class Span:
|
|||
def set_extension(cls, name, **kwargs):
|
||||
"""Define a custom attribute which becomes available as `Span._`.
|
||||
|
||||
name (unicode): Name of the attribute to set.
|
||||
name (str): Name of the attribute to set.
|
||||
default: Optional default value of the attribute.
|
||||
getter (callable): Optional getter function.
|
||||
setter (callable): Optional setter function.
|
||||
|
@ -51,7 +51,7 @@ cdef class Span:
|
|||
def get_extension(cls, name):
|
||||
"""Look up a previously registered extension by name.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||
|
||||
DOCS: https://spacy.io/api/span#get_extension
|
||||
|
@ -62,7 +62,7 @@ cdef class Span:
|
|||
def has_extension(cls, name):
|
||||
"""Check whether an extension has been registered.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (bool): Whether the extension has been registered.
|
||||
|
||||
DOCS: https://spacy.io/api/span#has_extension
|
||||
|
@ -73,7 +73,7 @@ cdef class Span:
|
|||
def remove_extension(cls, name):
|
||||
"""Remove a previously registered extension.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||
removed extension.
|
||||
|
||||
|
@ -491,7 +491,7 @@ cdef class Span:
|
|||
|
||||
@property
|
||||
def text(self):
|
||||
"""RETURNS (unicode): The original verbatim text of the span."""
|
||||
"""RETURNS (str): The original verbatim text of the span."""
|
||||
text = self.text_with_ws
|
||||
if self[-1].whitespace_:
|
||||
text = text[:-1]
|
||||
|
@ -502,7 +502,7 @@ cdef class Span:
|
|||
"""The text content of the span with a trailing whitespace character if
|
||||
the last token has one.
|
||||
|
||||
RETURNS (unicode): The text content of the span (with trailing
|
||||
RETURNS (str): The text content of the span (with trailing
|
||||
whitespace).
|
||||
"""
|
||||
return "".join([t.text_with_ws for t in self])
|
||||
|
@ -678,7 +678,7 @@ cdef class Span:
|
|||
raise NotImplementedError(TempErrors.T007.format(attr="ent_id"))
|
||||
|
||||
property ent_id_:
|
||||
"""RETURNS (unicode): The (string) entity ID."""
|
||||
"""RETURNS (str): The (string) entity ID."""
|
||||
def __get__(self):
|
||||
return self.root.ent_id_
|
||||
|
||||
|
@ -690,12 +690,12 @@ cdef class Span:
|
|||
"""Verbatim text content (identical to `Span.text`). Exists mostly for
|
||||
consistency with other attributes.
|
||||
|
||||
RETURNS (unicode): The span's text."""
|
||||
RETURNS (str): The span's text."""
|
||||
return self.text
|
||||
|
||||
@property
|
||||
def lemma_(self):
|
||||
"""RETURNS (unicode): The span's lemma."""
|
||||
"""RETURNS (str): The span's lemma."""
|
||||
return " ".join([t.lemma_ for t in self]).strip()
|
||||
|
||||
@property
|
||||
|
@ -714,7 +714,7 @@ cdef class Span:
|
|||
return "".join([t.text_with_ws for t in self])
|
||||
|
||||
property label_:
|
||||
"""RETURNS (unicode): The span's label."""
|
||||
"""RETURNS (str): The span's label."""
|
||||
def __get__(self):
|
||||
return self.doc.vocab.strings[self.label]
|
||||
|
||||
|
@ -724,7 +724,7 @@ cdef class Span:
|
|||
raise NotImplementedError(Errors.E129.format(start=self.start, end=self.end, label=label_))
|
||||
|
||||
property kb_id_:
|
||||
"""RETURNS (unicode): The named entity's KB ID."""
|
||||
"""RETURNS (str): The named entity's KB ID."""
|
||||
def __get__(self):
|
||||
return self.doc.vocab.strings[self.kb_id]
|
||||
|
||||
|
|
|
@ -36,7 +36,7 @@ cdef class Token:
|
|||
def set_extension(cls, name, **kwargs):
|
||||
"""Define a custom attribute which becomes available as `Token._`.
|
||||
|
||||
name (unicode): Name of the attribute to set.
|
||||
name (str): Name of the attribute to set.
|
||||
default: Optional default value of the attribute.
|
||||
getter (callable): Optional getter function.
|
||||
setter (callable): Optional setter function.
|
||||
|
@ -54,7 +54,7 @@ cdef class Token:
|
|||
def get_extension(cls, name):
|
||||
"""Look up a previously registered extension by name.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple.
|
||||
|
||||
DOCS: https://spacy.io/api/token#get_extension
|
||||
|
@ -65,7 +65,7 @@ cdef class Token:
|
|||
def has_extension(cls, name):
|
||||
"""Check whether an extension has been registered.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (bool): Whether the extension has been registered.
|
||||
|
||||
DOCS: https://spacy.io/api/token#has_extension
|
||||
|
@ -76,7 +76,7 @@ cdef class Token:
|
|||
def remove_extension(cls, name):
|
||||
"""Remove a previously registered extension.
|
||||
|
||||
name (unicode): Name of the extension.
|
||||
name (str): Name of the extension.
|
||||
RETURNS (tuple): A `(default, method, getter, setter)` tuple of the
|
||||
removed extension.
|
||||
|
||||
|
@ -244,12 +244,12 @@ cdef class Token:
|
|||
|
||||
@property
|
||||
def text(self):
|
||||
"""RETURNS (unicode): The original verbatim text of the token."""
|
||||
"""RETURNS (str): The original verbatim text of the token."""
|
||||
return self.orth_
|
||||
|
||||
@property
|
||||
def text_with_ws(self):
|
||||
"""RETURNS (unicode): The text content of the span (with trailing
|
||||
"""RETURNS (str): The text content of the span (with trailing
|
||||
whitespace).
|
||||
"""
|
||||
cdef unicode orth = self.vocab.strings[self.c.lex.orth]
|
||||
|
@ -762,7 +762,7 @@ cdef class Token:
|
|||
self.c.ent_type = ent_type
|
||||
|
||||
property ent_type_:
|
||||
"""RETURNS (unicode): Named entity type."""
|
||||
"""RETURNS (str): Named entity type."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_type]
|
||||
|
||||
|
@ -785,7 +785,7 @@ cdef class Token:
|
|||
and "" means no entity tag is set. "B" with an empty ent_type
|
||||
means that the token is blocked from further processing by NER.
|
||||
|
||||
RETURNS (unicode): IOB code of named entity tag.
|
||||
RETURNS (str): IOB code of named entity tag.
|
||||
"""
|
||||
iob_strings = ("", "I", "O", "B")
|
||||
return iob_strings[self.c.ent_iob]
|
||||
|
@ -801,7 +801,7 @@ cdef class Token:
|
|||
self.c.ent_id = key
|
||||
|
||||
property ent_id_:
|
||||
"""RETURNS (unicode): ID of the entity the token is an instance of,
|
||||
"""RETURNS (str): ID of the entity the token is an instance of,
|
||||
if any.
|
||||
"""
|
||||
def __get__(self):
|
||||
|
@ -819,7 +819,7 @@ cdef class Token:
|
|||
self.c.ent_kb_id = ent_kb_id
|
||||
|
||||
property ent_kb_id_:
|
||||
"""RETURNS (unicode): Named entity KB ID."""
|
||||
"""RETURNS (str): Named entity KB ID."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.ent_kb_id]
|
||||
|
||||
|
@ -828,12 +828,12 @@ cdef class Token:
|
|||
|
||||
@property
|
||||
def whitespace_(self):
|
||||
"""RETURNS (unicode): The trailing whitespace character, if present."""
|
||||
"""RETURNS (str): The trailing whitespace character, if present."""
|
||||
return " " if self.c.spacy else ""
|
||||
|
||||
@property
|
||||
def orth_(self):
|
||||
"""RETURNS (unicode): Verbatim text content (identical to
|
||||
"""RETURNS (str): Verbatim text content (identical to
|
||||
`Token.text`). Exists mostly for consistency with the other
|
||||
attributes.
|
||||
"""
|
||||
|
@ -841,13 +841,13 @@ cdef class Token:
|
|||
|
||||
@property
|
||||
def lower_(self):
|
||||
"""RETURNS (unicode): The lowercase token text. Equivalent to
|
||||
"""RETURNS (str): The lowercase token text. Equivalent to
|
||||
`Token.text.lower()`.
|
||||
"""
|
||||
return self.vocab.strings[self.c.lex.lower]
|
||||
|
||||
property norm_:
|
||||
"""RETURNS (unicode): The token's norm, i.e. a normalised form of the
|
||||
"""RETURNS (str): The token's norm, i.e. a normalised form of the
|
||||
token text. Usually set in the language's tokenizer exceptions or
|
||||
norm exceptions.
|
||||
"""
|
||||
|
@ -859,34 +859,34 @@ cdef class Token:
|
|||
|
||||
@property
|
||||
def shape_(self):
|
||||
"""RETURNS (unicode): Transform of the tokens's string, to show
|
||||
"""RETURNS (str): Transform of the token's string, to show
|
||||
orthographic features. For example, "Xxxx" or "dd".
|
||||
"""
|
||||
return self.vocab.strings[self.c.lex.shape]
|
||||
|
||||
@property
|
||||
def prefix_(self):
|
||||
"""RETURNS (unicode): A length-N substring from the start of the token.
|
||||
"""RETURNS (str): A length-N substring from the start of the token.
|
||||
Defaults to `N=1`.
|
||||
"""
|
||||
return self.vocab.strings[self.c.lex.prefix]
|
||||
|
||||
@property
|
||||
def suffix_(self):
|
||||
"""RETURNS (unicode): A length-N substring from the end of the token.
|
||||
"""RETURNS (str): A length-N substring from the end of the token.
|
||||
Defaults to `N=3`.
|
||||
"""
|
||||
return self.vocab.strings[self.c.lex.suffix]
|
||||
|
||||
@property
|
||||
def lang_(self):
|
||||
"""RETURNS (unicode): Language of the parent document's vocabulary,
|
||||
"""RETURNS (str): Language of the parent document's vocabulary,
|
||||
e.g. 'en'.
|
||||
"""
|
||||
return self.vocab.strings[self.c.lex.lang]
|
||||
|
||||
property lemma_:
|
||||
"""RETURNS (unicode): The token lemma, i.e. the base form of the word,
|
||||
"""RETURNS (str): The token lemma, i.e. the base form of the word,
|
||||
with no inflectional suffixes.
|
||||
"""
|
||||
def __get__(self):
|
||||
|
@ -899,7 +899,7 @@ cdef class Token:
|
|||
self.c.lemma = self.vocab.strings.add(lemma_)
|
||||
|
||||
property pos_:
|
||||
"""RETURNS (unicode): Coarse-grained part-of-speech tag."""
|
||||
"""RETURNS (str): Coarse-grained part-of-speech tag."""
|
||||
def __get__(self):
|
||||
return parts_of_speech.NAMES[self.c.pos]
|
||||
|
||||
|
@ -907,7 +907,7 @@ cdef class Token:
|
|||
self.c.pos = parts_of_speech.IDS[pos_name]
|
||||
|
||||
property tag_:
|
||||
"""RETURNS (unicode): Fine-grained part-of-speech tag."""
|
||||
"""RETURNS (str): Fine-grained part-of-speech tag."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.tag]
|
||||
|
||||
|
@ -915,7 +915,7 @@ cdef class Token:
|
|||
self.tag = self.vocab.strings.add(tag)
|
||||
|
||||
property dep_:
|
||||
"""RETURNS (unicode): The syntactic dependency label."""
|
||||
"""RETURNS (str): The syntactic dependency label."""
|
||||
def __get__(self):
|
||||
return self.vocab.strings[self.c.dep]
|
||||
|
||||
|
|
223
spacy/util.py
223
spacy/util.py
|
@ -15,6 +15,8 @@ import srsly
|
|||
import catalogue
|
||||
import sys
|
||||
import warnings
|
||||
from packaging.specifiers import SpecifierSet, InvalidSpecifier
|
||||
from packaging.version import Version, InvalidVersion
|
||||
|
||||
|
||||
try:
|
||||
|
@ -22,9 +24,16 @@ try:
|
|||
except ImportError:
|
||||
cupy = None
|
||||
|
||||
try: # Python 3.8
|
||||
import importlib.metadata as importlib_metadata
|
||||
except ImportError:
|
||||
import importlib_metadata
|
||||
|
||||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream
|
||||
from .errors import Errors, Warnings
|
||||
from . import about
|
||||
|
||||
|
||||
_PRINT_ENV = False
|
||||
OOV_RANK = numpy.iinfo(numpy.uint64).max
|
||||
|
@ -37,6 +46,10 @@ class registry(thinc.registry):
|
|||
factories = catalogue.create("spacy", "factories", entry_points=True)
|
||||
displacy_colors = catalogue.create("spacy", "displacy_colors", entry_points=True)
|
||||
assets = catalogue.create("spacy", "assets", entry_points=True)
|
||||
# This is mostly used to get a list of all installed models in the current
|
||||
# environment. spaCy models packaged with `spacy package` will "advertise"
|
||||
# themselves via entry points.
|
||||
models = catalogue.create("spacy", "models", entry_points=True)
|
||||
|
||||
|
||||
def set_env_log(value):
|
||||
|
@ -49,7 +62,7 @@ def lang_class_is_loaded(lang):
|
|||
loaded lazily, to avoid expensive setup code associated with the language
|
||||
data.
|
||||
|
||||
lang (unicode): Two-letter language code, e.g. 'en'.
|
||||
lang (str): Two-letter language code, e.g. 'en'.
|
||||
RETURNS (bool): Whether a Language class has been loaded.
|
||||
"""
|
||||
return lang in registry.languages
|
||||
|
@ -58,7 +71,7 @@ def lang_class_is_loaded(lang):
|
|||
def get_lang_class(lang):
|
||||
"""Import and load a Language class.
|
||||
|
||||
lang (unicode): Two-letter language code, e.g. 'en'.
|
||||
lang (str): Two-letter language code, e.g. 'en'.
|
||||
RETURNS (Language): Language class.
|
||||
"""
|
||||
# Check if language is registered / entry point is available
|
||||
|
@ -76,7 +89,7 @@ def get_lang_class(lang):
|
|||
def set_lang_class(name, cls):
|
||||
"""Set a custom Language class name that can be loaded via get_lang_class.
|
||||
|
||||
name (unicode): Name of Language class.
|
||||
name (str): Name of Language class.
|
||||
cls (Language): Language class.
|
||||
"""
|
||||
registry.languages.register(name, func=cls)
|
||||
|
@ -98,7 +111,7 @@ def load_language_data(path):
|
|||
"""Load JSON language data using the given path as a base. If the provided
|
||||
path isn't present, will attempt to load a gzipped version before giving up.
|
||||
|
||||
path (unicode / Path): The data to load.
|
||||
path (str / Path): The data to load.
|
||||
RETURNS: The loaded data.
|
||||
"""
|
||||
path = ensure_path(path)
|
||||
|
@ -119,7 +132,7 @@ def get_module_path(module):
|
|||
def load_model(name, **overrides):
|
||||
"""Load a model from a package or data path.
|
||||
|
||||
name (unicode): Package name or model path.
|
||||
name (str): Package name or model path.
|
||||
**overrides: Specific overrides, like pipeline components to disable.
|
||||
RETURNS (Language): `Language` class with the loaded model.
|
||||
"""
|
||||
|
@ -193,7 +206,7 @@ def load_model_from_init_py(init_file, **overrides):
|
|||
"""Helper function to use in the `load()` method of a model package's
|
||||
__init__.py.
|
||||
|
||||
init_file (unicode): Path to model's __init__.py, i.e. `__file__`.
|
||||
init_file (str): Path to model's __init__.py, i.e. `__file__`.
|
||||
**overrides: Specific overrides, like pipeline components to disable.
|
||||
RETURNS (Language): `Language` class with loaded model.
|
||||
"""
|
||||
|
@ -206,11 +219,74 @@ def load_model_from_init_py(init_file, **overrides):
|
|||
return load_model_from_path(data_path, meta, **overrides)
|
||||
|
||||
|
||||
def get_installed_models():
    """Collect the names of all model packages registered in the `models`
    registry of the current environment.

    RETURNS (list): The string names of the models.
    """
    installed = registry.models.get_all()
    return [model_name for model_name in installed.keys()]
|
||||
|
||||
|
||||
def get_package_version(name):
    """Look up the version of an installed package, e.g. a model package.

    name (str): The name of the installed Python package.
    RETURNS (str / None): The version or None if package not installed.
    """
    try:
        installed_version = importlib_metadata.version(name)
    except importlib_metadata.PackageNotFoundError:
        # No distribution with that name is installed.
        return None
    return installed_version
|
||||
|
||||
|
||||
def is_compatible_version(version, constraint, prereleases=True):
    """Check if a version (e.g. "2.0.0") is compatible given a version
    constraint (e.g. ">=1.9.0,<2.2.1"). If the constraint is a specific version,
    it's interpreted as =={version}.

    version (str): The version to check.
    constraint (str): The constraint string. An empty string is handed to
        SpecifierSet unchanged (i.e. as an empty specifier set).
    prereleases (bool): Whether to allow prereleases. If set to False,
        prerelease versions will be considered incompatible.
    RETURNS (bool / None): Whether the version is compatible, or None if the
        version or constraint are invalid.
    """
    # Handle cases where exact version is provided as constraint. Slicing
    # (rather than indexing) keeps an empty constraint from raising IndexError.
    if constraint[:1].isdigit():
        constraint = f"=={constraint}"
    try:
        spec = SpecifierSet(constraint)
        version = Version(version)
    except (InvalidSpecifier, InvalidVersion):
        return None
    spec.prereleases = prereleases
    return version in spec
|
||||
|
||||
|
||||
def get_model_version_range(spacy_version):
    """Generate a version range like >=1.2.3,<1.3.0 based on a given spaCy
    version. Models are always compatible across patch versions but not
    across minor or major versions.

    spacy_version (str): The spaCy version, e.g. "1.2.3".
    RETURNS (str): The version range, e.g. ">=1.2.3,<1.3.0".
    """
    release = Version(spacy_version).release
    # Pad with a zero minor segment so that a version without one (e.g. "3")
    # doesn't raise an IndexError.
    major, minor = (tuple(release) + (0,))[:2]
    return f">={spacy_version},<{major}.{minor + 1}.0"
|
||||
|
||||
|
||||
def get_base_version(version):
    """Strip any prerelease identifiers from a version string.

    version (str): The version, e.g. "3.0.0.dev1".
    RETURNS (str): The base version, e.g. "3.0.0".
    """
    parsed = Version(version)
    return parsed.base_version
|
||||
|
||||
|
||||
def load_config(path, create_objects=False):
|
||||
"""Load a Thinc-formatted config file, optionally filling in objects where
|
||||
the config references registry entries. See "Thinc config files" for details.
|
||||
|
||||
path (unicode or Path): Path to the config file
|
||||
path (str / Path): Path to the config file
|
||||
create_objects (bool): Whether to automatically create objects when the config
|
||||
references registry entries. Defaults to False.
|
||||
|
||||
|
@ -227,7 +303,7 @@ def load_config_from_str(string, create_objects=False):
|
|||
"""Load a Thinc-formatted config, optionally filling in objects where
|
||||
the config references registry entries. See "Thinc config files" for details.
|
||||
|
||||
string (unicode or Path): Text contents of the config file.
|
||||
string (str / Path): Text contents of the config file.
|
||||
create_objects (bool): Whether to automatically create objects when the config
|
||||
references registry entries. Defaults to False.
|
||||
|
||||
|
@ -243,7 +319,7 @@ def load_config_from_str(string, create_objects=False):
|
|||
def get_model_meta(path):
|
||||
"""Get model meta.json from a directory path and validate its contents.
|
||||
|
||||
path (unicode or Path): Path to model directory.
|
||||
path (str / Path): Path to model directory.
|
||||
RETURNS (dict): The model's meta data.
|
||||
"""
|
||||
model_path = ensure_path(path)
|
||||
|
@ -256,13 +332,23 @@ def get_model_meta(path):
|
|||
for setting in ["lang", "name", "version"]:
|
||||
if setting not in meta or not meta[setting]:
|
||||
raise ValueError(Errors.E054.format(setting=setting))
|
||||
if "spacy_version" in meta:
|
||||
if not is_compatible_version(about.__version__, meta["spacy_version"]):
|
||||
warnings.warn(
|
||||
Warnings.W095.format(
|
||||
model=f"{meta['lang']}_{meta['name']}",
|
||||
model_version=meta["version"],
|
||||
version=meta["spacy_version"],
|
||||
current=about.__version__,
|
||||
)
|
||||
)
|
||||
return meta
|
||||
|
||||
|
||||
def get_model_config(path):
|
||||
"""Get the model's config from a directory path.
|
||||
|
||||
path (unicode or Path): Path to model directory.
|
||||
path (str / Path): Path to model directory.
|
||||
RETURNS (Config): The model's config data.
|
||||
"""
|
||||
model_path = ensure_path(path)
|
||||
|
@ -279,23 +365,20 @@ def get_model_config(path):
|
|||
def is_package(name):
    """Check if string maps to a package installed via pip.

    name (str): Name of package.
    RETURNS (bool): True if installed package, False if not.
    """
    try:
        importlib_metadata.distribution(name)
    except Exception:
        # Best-effort check: any failure to resolve the distribution counts as
        # "not a package", but KeyboardInterrupt / SystemExit still propagate.
        return False
    return True
|
||||
|
||||
|
||||
def get_package_path(name):
|
||||
"""Get the path to an installed package.
|
||||
|
||||
name (unicode): Package name.
|
||||
name (str): Package name.
|
||||
RETURNS (Path): Path to installed package.
|
||||
"""
|
||||
name = name.lower() # use lowercase version to be safe
|
||||
|
@ -470,8 +553,8 @@ def expand_exc(excs, search, replace):
|
|||
For example, to add additional versions with typographic apostrophes.
|
||||
|
||||
excs (dict): Tokenizer exceptions.
|
||||
search (unicode): String to find and replace.
|
||||
replace (unicode): Replacement.
|
||||
search (str): String to find and replace.
|
||||
replace (str): Replacement.
|
||||
RETURNS (dict): Combined tokenizer exceptions.
|
||||
"""
|
||||
|
||||
|
@ -575,42 +658,74 @@ def decaying(start, stop, decay):
|
|||
curr -= decay
|
||||
|
||||
|
||||
def minibatch_by_words(examples, size, count_words=len, tolerance=0.2, discard_oversize=False):
    """Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
    themselves, or be discarded if discard_oversize=True.

    examples (iterable): The examples to batch. Each needs a `.doc` attribute
        that `count_words` can be applied to.
    size (int / list / iterator): The target number of words per batch. A
        list or iterator supplies one target per batch.
    count_words (callable): Returns the number of words of an example's doc.
    tolerance (float): Fraction of the target size by which a batch may
        exceed the target.
    discard_oversize (bool): Whether to drop examples that exceed the target
        size plus tolerance instead of yielding them on their own.
    YIELDS (list): The batches of examples.
    """
    if isinstance(size, int):
        size_ = itertools.repeat(size)
    elif isinstance(size, List):
        size_ = iter(size)
    else:
        size_ = size

    # NOTE(review): a finite `size` must supply a target for every batch that
    # gets started; an exhausted iterator makes next() raise here.
    target_size = next(size_)
    tol_size = target_size * tolerance
    batch = []
    overflow = []
    batch_size = 0
    overflow_size = 0

    for example in examples:
        n_words = count_words(example.doc)
        # if the current example exceeds the maximum batch size, it is returned separately
        # but only if discard_oversize=False.
        if n_words > target_size + tol_size:
            if not discard_oversize:
                yield [example]

        # add the example to the current batch if there's no overflow yet and it still fits
        elif overflow_size == 0 and (batch_size + n_words) <= target_size:
            batch.append(example)
            batch_size += n_words

        # add the example to the overflow buffer if it fits in the tolerance margin
        elif (batch_size + overflow_size + n_words) <= (target_size + tol_size):
            overflow.append(example)
            overflow_size += n_words

        # yield the previous batch and start a new one. The new one gets the overflow examples.
        else:
            # The batch can be empty here if only the overflow buffer has been
            # filled so far; never emit an empty batch.
            if batch:
                yield batch
            target_size = next(size_)
            tol_size = target_size * tolerance
            batch = overflow
            batch_size = overflow_size
            overflow = []
            overflow_size = 0

            # this example still fits
            if (batch_size + n_words) <= target_size:
                batch.append(example)
                batch_size += n_words

            # this example fits in overflow
            elif (batch_size + n_words) <= (target_size + tol_size):
                overflow.append(example)
                overflow_size += n_words

            # this example does not fit with the previous overflow: start another new batch
            else:
                if batch:
                    yield batch
                target_size = next(size_)
                tol_size = target_size * tolerance
                batch = [example]
                batch_size = n_words

    # yield the final batch, including any examples still waiting in the
    # overflow buffer (even if the regular batch itself is empty)
    batch.extend(overflow)
    if batch:
        yield batch
|
||||
|
||||
|
||||
def itershuffle(iterable, bufsize=1000):
|
||||
|
@ -705,8 +820,8 @@ def from_disk(path, readers, exclude):
|
|||
def import_file(name, loc):
|
||||
"""Import module from a file. Used to load models from a directory.
|
||||
|
||||
name (unicode): Name of module to load.
|
||||
loc (unicode / Path): Path to the file.
|
||||
name (str): Name of module to load.
|
||||
loc (str / Path): Path to the file.
|
||||
RETURNS: The loaded module.
|
||||
"""
|
||||
loc = str(loc)
|
||||
|
@ -721,8 +836,8 @@ def minify_html(html):
|
|||
Disclaimer: NOT a general-purpose solution, only removes indentation and
|
||||
newlines.
|
||||
|
||||
html (unicode): Markup to minify.
|
||||
RETURNS (unicode): "Minified" HTML.
|
||||
html (str): Markup to minify.
|
||||
RETURNS (str): "Minified" HTML.
|
||||
"""
|
||||
return html.strip().replace(" ", "").replace("\n", "")
|
||||
|
||||
|
@ -731,8 +846,8 @@ def escape_html(text):
|
|||
"""Replace <, >, &, " with their HTML encoded representation. Intended to
|
||||
prevent HTML errors in rendered displaCy markup.
|
||||
|
||||
text (unicode): The original text.
|
||||
RETURNS (unicode): Equivalent text to be safely used within HTML.
|
||||
text (str): The original text.
|
||||
RETURNS (str): Equivalent text to be safely used within HTML.
|
||||
"""
|
||||
text = text.replace("&", "&amp;")
|
||||
text = text.replace("<", "&lt;")
|
||||
|
|
|
@ -57,7 +57,7 @@ cdef class Vectors:
|
|||
shape (tuple): Size of the table, as (# entries, # columns)
|
||||
data (numpy.ndarray): The vector data.
|
||||
keys (iterable): A sequence of keys, aligned with the data.
|
||||
name (unicode): A name to identify the vectors table.
|
||||
name (str): A name to identify the vectors table.
|
||||
RETURNS (Vectors): The newly created object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#init
|
||||
|
@ -244,7 +244,7 @@ cdef class Vectors:
|
|||
def find(self, *, key=None, keys=None, row=None, rows=None):
|
||||
"""Look up one or more keys by row, or vice versa.
|
||||
|
||||
key (unicode / int): Find the row that the given key points to.
|
||||
key (str / int): Find the row that the given key points to.
|
||||
Returns int, -1 if missing.
|
||||
keys (iterable): Find rows that the keys point to.
|
||||
Returns ndarray.
|
||||
|
@ -366,7 +366,7 @@ cdef class Vectors:
|
|||
def to_disk(self, path, **kwargs):
|
||||
"""Save the current state to a directory.
|
||||
|
||||
path (unicode / Path): A path to a directory, which will be created if
|
||||
path (str / Path): A path to a directory, which will be created if
|
||||
it doesn't exist.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#to_disk
|
||||
|
@ -386,7 +386,7 @@ cdef class Vectors:
|
|||
"""Loads state from a directory. Modifies the object in place and
|
||||
returns it.
|
||||
|
||||
path (unicode / Path): Directory path, string or Path-like object.
|
||||
path (str / Path): Directory path, string or Path-like object.
|
||||
RETURNS (Vectors): The modified object.
|
||||
|
||||
DOCS: https://spacy.io/api/vectors#from_disk
|
||||
|
|
|
@ -504,10 +504,10 @@ tokenization can be provided.
|
|||
> srsly.write_jsonl("/path/to/text.jsonl", data)
|
||||
> ```
|
||||
|
||||
| Key | Type | Description |
|
||||
| -------- | ------- | ---------------------------------------------------------- |
|
||||
| `text` | unicode | The raw input text. Is not required if `tokens` available. |
|
||||
| `tokens` | list | Optional tokenization, one string per token. |
|
||||
| Key | Type | Description |
|
||||
| -------- | ---- | ---------------------------------------------------------- |
|
||||
| `text`   | str  | The raw input text. Not required if `tokens` is available. |
|
||||
| `tokens` | list | Optional tokenization, one string per token. |
|
||||
|
||||
```json
|
||||
### Example
|
||||
|
|
|
@ -170,7 +170,7 @@ vocabulary.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------------------------------------------------------------------------- |
|
||||
| `mem` | `cymem.Pool` | A memory pool. Allocated memory will be freed once the `Vocab` object is garbage collected. |
|
||||
| `string` | unicode | The string of the word to look up. |
|
||||
| `string` | str | The string of the word to look up. |
|
||||
| **RETURNS** | `const LexemeC*` | The lexeme in the vocabulary. |
|
||||
|
||||
### Vocab.get_by_orth {#vocab_get_by_orth tag="method"}
|
||||
|
|
|
@ -229,9 +229,9 @@ Add a new label to the pipe.
|
|||
> parser.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ----------------- |
|
||||
| `label` | unicode | The label to add. |
|
||||
| Name | Type | Description |
|
||||
| ------- | ---- | ----------------- |
|
||||
| `label` | str | The label to add. |
|
||||
|
||||
## DependencyParser.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -244,10 +244,10 @@ Serialize the pipe to disk.
|
|||
> parser.to_disk("/path/to/parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## DependencyParser.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -262,7 +262,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `DependencyParser` | The modified `DependencyParser` object. |
|
||||
|
||||
|
|
|
@ -123,7 +123,7 @@ details, see the documentation on
|
|||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
|
||||
| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `doc._.my_attr`. |
|
||||
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
|
||||
| `method` | callable | Set a custom method on the object, for example `doc._.compare(other_doc)`. |
|
||||
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
|
||||
|
@ -145,10 +145,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
|
|||
> assert extension == (False, None, None, None)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
|
||||
## Doc.has_extension {#has_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -162,10 +162,10 @@ Check whether an extension has been registered on the `Doc` class.
|
|||
> assert Doc.has_extension('has_city')
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------------------------ |
|
||||
| `name` | unicode | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| `name` | str | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
|
||||
## Doc.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
|
||||
|
||||
|
@ -180,10 +180,10 @@ Remove a previously registered extension.
|
|||
> assert not Doc.has_extension('has_city')
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
|
||||
## Doc.char_span {#char_span tag="method" new="2"}
|
||||
|
||||
|
@ -368,10 +368,10 @@ Save the current state to a directory.
|
|||
> doc.to_disk("/path/to/doc")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Doc.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -385,11 +385,11 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> doc = Doc(Vocab()).from_disk("/path/to/doc")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` object. |
|
||||
|
||||
## Doc.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -648,15 +648,15 @@ The L2 norm of the document's vector representation.
|
|||
|
||||
| Name | Type | Description |
|
||||
| --------------------------------------- | ------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `text` | unicode | A unicode representation of the document text. |
|
||||
| `text_with_ws` | unicode | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `text` | str | A unicode representation of the document text. |
|
||||
| `text_with_ws` | str | An alias of `Doc.text`, provided for duck-type compatibility with `Span` and `Token`. |
|
||||
| `mem` | `Pool` | The document's local memory heap, for all C data it owns. |
|
||||
| `vocab` | `Vocab` | The store of lexical types. |
|
||||
| `tensor` <Tag variant="new">2</Tag> | `ndarray` | Container for dense vector representations. |
|
||||
| `cats` <Tag variant="new">2</Tag> | dict | Maps a label to a score for categories applied to the document. The label is a string and the score should be a float. |
|
||||
| `user_data` | - | A generic storage area, for user custom data. |
|
||||
| `lang` <Tag variant="new">2.1</Tag> | int | Language of the document's vocabulary. |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | unicode | Language of the document's vocabulary. |
|
||||
| `lang_` <Tag variant="new">2.1</Tag> | str | Language of the document's vocabulary. |
|
||||
| `is_tagged` | bool | A flag indicating that the document has been part-of-speech tagged. Returns `True` if the `Doc` is empty. |
|
||||
| `is_parsed` | bool | A flag indicating that the document has been syntactically parsed. Returns `True` if the `Doc` is empty. |
|
||||
| `is_sentenced` | bool | A flag indicating that sentence boundaries have been applied to the document. Returns `True` if the `Doc` is empty. |
|
||||
|
|
|
@ -258,10 +258,10 @@ Serialize the pipe to disk.
|
|||
> entity_linker.to_disk("/path/to/entity_linker")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## EntityLinker.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -274,11 +274,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> entity_linker.from_disk("/path/to/entity_linker")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityLinker` | The modified `EntityLinker` object. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -230,9 +230,9 @@ Add a new label to the pipe.
|
|||
> ner.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ----------------- |
|
||||
| `label` | unicode | The label to add. |
|
||||
| Name | Type | Description |
|
||||
| ------- | ---- | ----------------- |
|
||||
| `label` | str | The label to add. |
|
||||
|
||||
## EntityRecognizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -245,10 +245,10 @@ Serialize the pipe to disk.
|
|||
> ner.to_disk("/path/to/ner")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## EntityRecognizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -263,7 +263,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `EntityRecognizer` | The modified `EntityRecognizer` object. |
|
||||
|
||||
|
|
|
@ -72,10 +72,10 @@ Whether a label is present in the patterns.
|
|||
> assert not "PERSON" in ruler
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------------- |
|
||||
| `label` | unicode | The label to check. |
|
||||
| **RETURNS** | bool | Whether the entity ruler contains the label. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------- |
|
||||
| `label` | str | The label to check. |
|
||||
| **RETURNS** | bool | Whether the entity ruler contains the label. |
|
||||
|
||||
## EntityRuler.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -83,8 +83,9 @@ Find matches in the `Doc` and add them to the `doc.ents`. Typically, this
|
|||
happens automatically after the component has been added to the pipeline using
|
||||
[`nlp.add_pipe`](/api/language#add_pipe). If the entity ruler was initialized
|
||||
with `overwrite_ents=True`, existing entities will be replaced if they overlap
|
||||
with the matches. When matches overlap in a Doc, the entity ruler prioritizes longer
|
||||
patterns over shorter, and if equal the match occuring first in the Doc is chosen.
|
||||
with the matches. When matches overlap in a Doc, the entity ruler prioritizes
|
||||
longer patterns over shorter, and if equal the match occurring first in the Doc
|
||||
is chosen.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -139,9 +140,9 @@ only the patterns are saved as JSONL. If a directory name is provided, a
|
|||
> ruler.to_disk("/path/to/entity_ruler") # saves patterns and config
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSONL file or directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## EntityRuler.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -158,10 +159,10 @@ configuration.
|
|||
> ruler.from_disk("/path/to/entity_ruler") # loads patterns and config
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ---------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | ---------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSONL file or directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `EntityRuler` | The modified `EntityRuler` object. |
|
||||
|
||||
## EntityRuler.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
|
|
@ -17,8 +17,8 @@ Create a `GoldCorpus`. IF the input data is an iterable, each item should be a
|
|||
[`gold.read_json_file`](https://github.com/explosion/spaCy/tree/master/spacy/gold.pyx)
|
||||
for further details.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------------------- | ------------------------------------------------------------ |
|
||||
| `train` | unicode / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
|
||||
| `dev` | unicode / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
|
||||
| **RETURNS** | `GoldCorpus` | The newly constructed object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------- | ------------------------------------------------------------ |
|
||||
| `train` | str / `Path` / iterable | Training data, as a path (file or directory) or iterable. |
|
||||
| `dev` | str / `Path` / iterable | Development data, as a path (file or directory) or iterable. |
|
||||
| **RETURNS** | `GoldCorpus` | The newly constructed object. |
|
||||
|
|
|
@ -60,7 +60,8 @@ Whether the provided syntactic annotations form a projective dependency tree.
|
|||
|
||||
Convert a list of Doc objects into the
|
||||
[JSON-serializable format](/api/annotation#json-input) used by the
|
||||
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a 'paragraph' in the output doc.
|
||||
[`spacy train`](/api/cli#train) command. Each input doc will be treated as a
|
||||
'paragraph' in the output doc.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -158,7 +159,7 @@ single-token entity.
|
|||
| ----------- | -------- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `doc` | `Doc` | The document that the entity offsets refer to. The output tags will refer to the token boundaries within the document. |
|
||||
| `entities` | iterable | A sequence of `(start, end, label)` triples. `start` and `end` should be character-offset integers denoting the slice into the original string. |
|
||||
| **RETURNS** | list | Unicode strings, describing the [BILUO](/api/annotation#biluo) tags. |
|
||||
| **RETURNS** | list     | Strings describing the [BILUO](/api/annotation#biluo) tags.                                                                                     |
|
||||
|
||||
### gold.offsets_from_biluo_tags {#offsets_from_biluo_tags tag="function"}
|
||||
|
||||
|
|
|
@ -1,16 +1,19 @@
|
|||
---
|
||||
title: KnowledgeBase
|
||||
teaser: A storage class for entities and aliases of a specific knowledge base (ontology)
|
||||
teaser:
|
||||
A storage class for entities and aliases of a specific knowledge base
|
||||
(ontology)
|
||||
tag: class
|
||||
source: spacy/kb.pyx
|
||||
new: 2.2
|
||||
---
|
||||
|
||||
The `KnowledgeBase` object provides a method to generate [`Candidate`](/api/kb/#candidate_init)
|
||||
objects, which are plausible external identifiers given a certain textual mention.
|
||||
Each such `Candidate` holds information from the relevant KB entities,
|
||||
such as its frequency in text and possible aliases.
|
||||
Each entity in the knowledge base also has a pretrained entity vector of a fixed size.
|
||||
The `KnowledgeBase` object provides a method to generate
|
||||
[`Candidate`](/api/kb/#candidate_init) objects, which are plausible external
|
||||
identifiers given a certain textual mention. Each such `Candidate` holds
|
||||
information from the relevant KB entities, such as its frequency in text and
|
||||
possible aliases. Each entity in the knowledge base also has a pretrained entity
|
||||
vector of a fixed size.
|
||||
|
||||
## KnowledgeBase.\_\_init\_\_ {#init tag="method"}
|
||||
|
||||
|
@ -24,25 +27,25 @@ Create the knowledge base.
|
|||
> kb = KnowledgeBase(vocab=vocab, entity_vector_length=64)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------------------- | ---------------- | ----------------------------------------- |
|
||||
| `vocab` | `Vocab` | A `Vocab` object. |
|
||||
| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
|
||||
| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------------- | --------------- | ---------------------------------------- |
|
||||
| `vocab` | `Vocab` | A `Vocab` object. |
|
||||
| `entity_vector_length` | int | Length of the fixed-size entity vectors. |
|
||||
| **RETURNS** | `KnowledgeBase` | The newly constructed object. |
|
||||
|
||||
## KnowledgeBase.entity_vector_length {#entity_vector_length tag="property"}
|
||||
|
||||
The length of the fixed-size entity vectors in the knowledge base.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------- |
|
||||
| **RETURNS** | int | Length of the fixed-size entity vectors. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ---------------------------------------- |
|
||||
| **RETURNS** | int | Length of the fixed-size entity vectors. |
|
||||
|
||||
## KnowledgeBase.add_entity {#add_entity tag="method"}
|
||||
|
||||
Add an entity to the knowledge base, specifying its corpus frequency
|
||||
and entity vector, which should be of length [`entity_vector_length`](/api/kb#entity_vector_length).
|
||||
Add an entity to the knowledge base, specifying its corpus frequency and entity
|
||||
vector, which should be of length
|
||||
[`entity_vector_length`](/api/kb#entity_vector_length).
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -51,16 +54,16 @@ and entity vector, which should be of length [`entity_vector_length`](/api/kb#en
|
|||
> kb.add_entity(entity="Q463035", freq=111, entity_vector=vector2)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------------- | ------------------------------------------------- |
|
||||
| `entity` | unicode | The unique entity identifier |
|
||||
| `freq` | float | The frequency of the entity in a typical corpus |
|
||||
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | ----------------------------------------------- |
|
||||
| `entity` | str | The unique entity identifier |
|
||||
| `freq` | float | The frequency of the entity in a typical corpus |
|
||||
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||
|
||||
## KnowledgeBase.set_entities {#set_entities tag="method"}
|
||||
|
||||
Define the full list of entities in the knowledge base, specifying the corpus frequency
|
||||
and entity vector for each entity.
|
||||
Define the full list of entities in the knowledge base, specifying the corpus
|
||||
frequency and entity vector for each entity.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -68,18 +71,19 @@ and entity vector for each entity.
|
|||
> kb.set_entities(entity_list=["Q42", "Q463035"], freq_list=[32, 111], vector_list=[vector1, vector2])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ------------- | ------------------------------------------------- |
|
||||
| `entity_list` | iterable | List of unique entity identifiers |
|
||||
| `freq_list` | iterable | List of entity frequencies |
|
||||
| `vector_list` | iterable | List of entity vectors |
|
||||
| Name | Type | Description |
|
||||
| ------------- | -------- | --------------------------------- |
|
||||
| `entity_list` | iterable | List of unique entity identifiers |
|
||||
| `freq_list` | iterable | List of entity frequencies |
|
||||
| `vector_list` | iterable | List of entity vectors |
|
||||
|
||||
## KnowledgeBase.add_alias {#add_alias tag="method"}
|
||||
|
||||
Add an alias or mention to the knowledge base, specifying its potential KB identifiers
|
||||
and their prior probabilities. The entity identifiers should refer to entities previously
|
||||
added with [`add_entity`](/api/kb#add_entity) or [`set_entities`](/api/kb#set_entities).
|
||||
The sum of the prior probabilities should not exceed 1.
|
||||
Add an alias or mention to the knowledge base, specifying its potential KB
|
||||
identifiers and their prior probabilities. The entity identifiers should refer
|
||||
to entities previously added with [`add_entity`](/api/kb#add_entity) or
|
||||
[`set_entities`](/api/kb#set_entities). The sum of the prior probabilities
|
||||
should not exceed 1.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -87,11 +91,11 @@ The sum of the prior probabilities should not exceed 1.
|
|||
> kb.add_alias(alias="Douglas", entities=["Q42", "Q463035"], probabilities=[0.6, 0.3])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------------- | -------------------------------------------------- |
|
||||
| `alias` | unicode | The textual mention or alias |
|
||||
| `entities` | iterable | The potential entities that the alias may refer to |
|
||||
| `probabilities`| iterable | The prior probabilities of each entity |
|
||||
| Name | Type | Description |
|
||||
| --------------- | -------- | -------------------------------------------------- |
|
||||
| `alias` | str | The textual mention or alias |
|
||||
| `entities` | iterable | The potential entities that the alias may refer to |
|
||||
| `probabilities` | iterable | The prior probabilities of each entity |
|
||||
|
||||
## KnowledgeBase.\_\_len\_\_ {#len tag="method"}
|
||||
|
||||
|
@ -117,9 +121,9 @@ Get a list of all entity IDs in the knowledge base.
|
|||
> all_entities = kb.get_entity_strings()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------- |
|
||||
| **RETURNS** | list | The list of entities in the knowledge base. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------- |
|
||||
| **RETURNS** | list | The list of entities in the knowledge base. |
|
||||
|
||||
## KnowledgeBase.get_size_aliases {#get_size_aliases tag="method"}
|
||||
|
||||
|
@ -131,9 +135,9 @@ Get the total number of aliases in the knowledge base.
|
|||
> total_aliases = kb.get_size_aliases()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------- |
|
||||
| **RETURNS** | int | The number of aliases in the knowledge base. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------- |
|
||||
| **RETURNS** | int | The number of aliases in the knowledge base. |
|
||||
|
||||
## KnowledgeBase.get_alias_strings {#get_alias_strings tag="method"}
|
||||
|
||||
|
@ -145,9 +149,9 @@ Get a list of all aliases in the knowledge base.
|
|||
> all_aliases = kb.get_alias_strings()
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------- |
|
||||
| **RETURNS** | list | The list of aliases in the knowledge base. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| **RETURNS** | list | The list of aliases in the knowledge base. |
|
||||
|
||||
## KnowledgeBase.get_candidates {#get_candidates tag="method"}
|
||||
|
||||
|
@ -160,10 +164,10 @@ of type [`Candidate`](/api/kb/#candidate_init).
|
|||
> candidates = kb.get_candidates("Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ------------- | -------------------------------------------------- |
|
||||
| `alias` | unicode | The textual mention or alias |
|
||||
| **RETURNS** | iterable | The list of relevant `Candidate` objects |
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ---------------------------------------- |
|
||||
| `alias` | str | The textual mention or alias |
|
||||
| **RETURNS** | iterable | The list of relevant `Candidate` objects |
|
||||
|
||||
## KnowledgeBase.get_vector {#get_vector tag="method"}
|
||||
|
||||
|
@ -175,15 +179,15 @@ Given a certain entity ID, retrieve its pretrained entity vector.
|
|||
> vector = kb.get_vector("Q42")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ------------- | -------------------------------------------------- |
|
||||
| `entity` | unicode | The entity ID |
|
||||
| **RETURNS** | vector | The entity vector |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ----------------- |
|
||||
| `entity` | str | The entity ID |
|
||||
| **RETURNS** | vector | The entity vector |
|
||||
|
||||
## KnowledgeBase.get_prior_prob {#get_prior_prob tag="method"}
|
||||
|
||||
Given a certain entity ID and a certain textual mention, retrieve
|
||||
the prior probability of the fact that the mention links to the entity ID.
|
||||
Given a certain entity ID and a certain textual mention, retrieve the prior
|
||||
probability that the mention links to the entity ID.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -191,11 +195,11 @@ the prior probability of the fact that the mention links to the entity ID.
|
|||
> probability = kb.get_prior_prob("Q42", "Douglas")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ------------- | --------------------------------------------------------------- |
|
||||
| `entity` | unicode | The entity ID |
|
||||
| `alias` | unicode | The textual mention or alias |
|
||||
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | -------------------------------------------------------------- |
|
||||
| `entity` | str | The entity ID |
|
||||
| `alias` | str | The textual mention or alias |
|
||||
| **RETURNS** | float | The prior probability of the `alias` referring to the `entity` |
|
||||
|
||||
## KnowledgeBase.dump {#dump tag="method"}
|
||||
|
||||
|
@ -207,14 +211,14 @@ Save the current state of the knowledge base to a directory.
|
|||
> kb.dump(loc)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `loc` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| ----- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `loc` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## KnowledgeBase.load_bulk {#load_bulk tag="method"}
|
||||
|
||||
Restore the state of the knowledge base from a given directory. Note that the [`Vocab`](/api/vocab)
|
||||
should also be the same as the one used to create the KB.
|
||||
Restore the state of the knowledge base from a given directory. Note that the
|
||||
[`Vocab`](/api/vocab) should also be the same as the one used to create the KB.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -226,18 +230,16 @@ should also be the same as the one used to create the KB.
|
|||
> kb.load_bulk("/path/to/kb")
|
||||
> ```
|
||||
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `loc` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | --------------- | -------------------------------------------------------------------------- |
|
||||
| `loc` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `KnowledgeBase` | The modified `KnowledgeBase` object. |
|
||||
|
||||
## Candidate.\_\_init\_\_ {#candidate_init tag="method"}
|
||||
|
||||
Construct a `Candidate` object. Usually this constructor is not called directly,
|
||||
but instead these objects are returned by the [`get_candidates`](/api/kb#get_candidates) method
|
||||
of a `KnowledgeBase`.
|
||||
but instead these objects are returned by the
|
||||
[`get_candidates`](/api/kb#get_candidates) method of a `KnowledgeBase`.
|
||||
|
||||
> #### Example
|
||||
>
|
||||
|
@ -257,12 +259,12 @@ of a `KnowledgeBase`.
|
|||
|
||||
## Candidate attributes {#candidate_attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------------- | ------------ | ------------------------------------------------------------------ |
|
||||
| `entity` | int | The entity's unique KB identifier |
|
||||
| `entity_` | unicode | The entity's unique KB identifier |
|
||||
| `alias` | int | The alias or textual mention |
|
||||
| `alias_` | unicode | The alias or textual mention |
|
||||
| `prior_prob` | long | The prior probability of the `alias` referring to the `entity` |
|
||||
| `entity_freq` | long | The frequency of the entity in a typical corpus |
|
||||
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||
| Name | Type | Description |
|
||||
| --------------- | ------ | -------------------------------------------------------------- |
|
||||
| `entity` | int | The entity's unique KB identifier |
|
||||
| `entity_` | str | The entity's unique KB identifier |
|
||||
| `alias` | int | The alias or textual mention |
|
||||
| `alias_` | str | The alias or textual mention |
|
||||
| `prior_prob` | float | The prior probability of the `alias` referring to the `entity` |
|
||||
| `entity_freq` | long | The frequency of the entity in a typical corpus |
|
||||
| `entity_vector` | vector | The pretrained vector of the entity |
|
||||
|
|
|
@ -49,11 +49,11 @@ contain arbitrary whitespace. Alignment into the original string is preserved.
|
|||
> assert (doc[0].text, doc[0].head.tag_) == ("An", "NN")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------------------------------------------------------------------------- |
|
||||
| `text` | unicode | The text to be processed. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Doc` | A container for accessing the annotations. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------------------- |
|
||||
| `text` | str | The text to be processed. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Doc` | A container for accessing the annotations. |
|
||||
|
||||
<Infobox title="Changed in v2.0" variant="warning">
|
||||
|
||||
|
@ -201,7 +201,7 @@ Create a pipeline component from a factory.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | ---------------------------------------------------------------------------------- |
|
||||
| `name` | unicode | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
|
||||
| `name` | str | Factory name to look up in [`Language.factories`](/api/language#class-attributes). |
|
||||
| `config` | dict | Configuration parameters to initialize component. |
|
||||
| **RETURNS** | callable | The pipeline component. |
|
||||
|
||||
|
@ -224,9 +224,9 @@ take a `Doc` object, modify it and return it. Only one of `before`, `after`,
|
|||
| Name | Type | Description |
|
||||
| ----------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `component` | callable | The pipeline component. |
|
||||
| `name` | unicode | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
|
||||
| `before` | unicode | Component name to insert component directly before. |
|
||||
| `after` | unicode | Component name to insert component directly after: |
|
||||
| `name` | str | Name of pipeline component. Overwrites existing `component.name` attribute if available. If no `name` is set and the component exposes no name attribute, `component.__name__` is used. An error is raised if the name already exists in the pipeline. |
|
||||
| `before` | str | Component name to insert component directly before. |
|
||||
| `after` | str | Component name to insert component directly after. |
|
||||
| `first` | bool | Insert component first / not first in the pipeline. |
|
||||
| `last` | bool | Insert component last / not last in the pipeline. |
|
||||
|
||||
|
@ -243,10 +243,10 @@ Check whether a component is present in the pipeline. Equivalent to
|
|||
> assert nlp.has_pipe("component")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the pipeline component to check. |
|
||||
| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------------- |
|
||||
| `name` | str | Name of the pipeline component to check. |
|
||||
| **RETURNS** | bool | Whether a component of that name exists in the pipeline. |
|
||||
|
||||
## Language.get_pipe {#get_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -261,7 +261,7 @@ Get a pipeline component for a given component name.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | -------------------------------------- |
|
||||
| `name` | unicode | Name of the pipeline component to get. |
|
||||
| `name` | str | Name of the pipeline component to get. |
|
||||
| **RETURNS** | callable | The pipeline component. |
|
||||
|
||||
## Language.replace_pipe {#replace_pipe tag="method" new="2"}
|
||||
|
@ -276,7 +276,7 @@ Replace a component in the pipeline.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | -------- | --------------------------------- |
|
||||
| `name` | unicode | Name of the component to replace. |
|
||||
| `name` | str | Name of the component to replace. |
|
||||
| `component` | callable | The pipeline component to insert. |
|
||||
|
||||
## Language.rename_pipe {#rename_pipe tag="method" new="2"}
|
||||
|
@ -292,10 +292,10 @@ added to the pipeline, you can also use the `name` argument on
|
|||
> nlp.rename_pipe("parser", "spacy_parser")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | -------------------------------- |
|
||||
| `old_name` | unicode | Name of the component to rename. |
|
||||
| `new_name` | unicode | New name of the component. |
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | -------------------------------- |
|
||||
| `old_name` | str | Name of the component to rename. |
|
||||
| `new_name` | str | New name of the component. |
|
||||
|
||||
## Language.remove_pipe {#remove_pipe tag="method" new="2"}
|
||||
|
||||
|
@ -309,10 +309,10 @@ component function.
|
|||
> assert name == "parser"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------------- |
|
||||
| `name` | unicode | Name of the component to remove. |
|
||||
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ----------------------------------------------------- |
|
||||
| `name` | str | Name of the component to remove. |
|
||||
| **RETURNS** | tuple | A `(name, component)` tuple of the removed component. |
|
||||
|
||||
## Language.select_pipes {#select_pipes tag="contextmanager, method" new="3"}
|
||||
|
||||
|
@ -342,12 +342,11 @@ latter case, all components not in the `enable` list, will be disabled.
|
|||
| Name | Type | Description |
|
||||
| ----------- | --------------- | ------------------------------------------------------------------------------------ |
|
||||
| `disable` | list | Names of pipeline components to disable. |
|
||||
| `disable` | unicode | Name of pipeline component to disable. |
|
||||
| `disable` | str | Name of pipeline component to disable. |
|
||||
| `enable` | list | Names of pipeline components that will not be disabled. |
|
||||
| `enable` | unicode | Name of pipeline component that will not be disabled. |
|
||||
| `enable` | str | Name of pipeline component that will not be disabled. |
|
||||
| **RETURNS** | `DisabledPipes` | The disabled pipes that can be restored by calling the object's `.restore()` method. |
|
||||
|
||||
|
||||
<Infobox title="Changed in v3.0" variant="warning">
|
||||
|
||||
As of spaCy v3.0, the `disable_pipes` method has been renamed to `select_pipes`:
|
||||
|
@ -370,10 +369,10 @@ the model**.
|
|||
> nlp.to_disk("/path/to/models")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Language.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -395,11 +394,11 @@ loaded object.
|
|||
> nlp = English().from_disk("/path/to/en_model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ----------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Language` | The modified `Language` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ----------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | Names of pipeline components or [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Language` | The modified `Language` object. |
|
||||
|
||||
<Infobox title="Changed in v2.0" variant="warning">
|
||||
|
||||
|
@ -480,11 +479,11 @@ per component.
|
|||
|
||||
## Class attributes {#class-attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------- | ------- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
||||
| `lang` | unicode | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
||||
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
|
||||
| Name | Type | Description |
|
||||
| -------------------------------------- | ----- | ----------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `Defaults` | class | Settings, data and factory methods for creating the `nlp` object and processing pipeline. |
|
||||
| `lang` | str | Two-letter language ID, i.e. [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes). |
|
||||
| `factories` <Tag variant="new">2</Tag> | dict | Factories that create pre-defined pipeline components, e.g. the tagger, parser or entity recognizer, keyed by their component name. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -63,8 +63,8 @@ Lemmatize a string.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | -------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | unicode | The string to lemmatize, e.g. the token text. |
|
||||
| `univ_pos` | unicode / int | The token's universal part-of-speech tag. |
|
||||
| `string` | str | The string to lemmatize, e.g. the token text. |
|
||||
| `univ_pos` | str / int | The token's universal part-of-speech tag. |
|
||||
| `morphology` | dict / `None` | Morphological features following the [Universal Dependencies](http://universaldependencies.org/) scheme. |
|
||||
| **RETURNS** | list | The available lemmas for the string. |
|
||||
|
||||
|
@ -82,11 +82,11 @@ original string is returned. Languages can provide a
|
|||
> assert lemmatizer.lookup("going") == "go"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | unicode | The string to look up. |
|
||||
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
|
||||
| **RETURNS** | unicode | The lemma if the string was found, otherwise the original string. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | str | The string to look up. |
|
||||
| `orth` | int | Optional hash of the string to look up. If not set, the string will be used and hashed. Defaults to `None`. |
|
||||
| **RETURNS** | str | The lemma if the string was found, otherwise the original string. |
|
||||
|
||||
## Lemmatizer.is_base_form {#is_base_form tag="method"}
|
||||
|
||||
|
@ -102,11 +102,11 @@ lemmatization entirely.
|
|||
> assert is_base_form == True
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------ | ------------- | --------------------------------------------------------------------------------------- |
|
||||
| `univ_pos` | unicode / int | The token's universal part-of-speech tag. |
|
||||
| `morphology` | dict | The token's morphological features. |
|
||||
| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
|
||||
| Name | Type | Description |
|
||||
| ------------ | --------- | --------------------------------------------------------------------------------------- |
|
||||
| `univ_pos` | str / int | The token's universal part-of-speech tag. |
|
||||
| `morphology` | dict | The token's morphological features. |
|
||||
| **RETURNS** | bool | Whether the token's part-of-speech tag and morphological features describe a base form. |
|
||||
|
||||
## Attributes {#attributes}
|
||||
|
||||
|
|
|
@ -125,21 +125,21 @@ The L2 norm of the lexeme's vector representation.
|
|||
| Name | Type | Description |
|
||||
| -------------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `vocab` | `Vocab` | The lexeme's vocabulary. |
|
||||
| `text` | unicode | Verbatim text content. |
|
||||
| `text` | str | Verbatim text content. |
|
||||
| `orth` | int | ID of the verbatim text content. |
|
||||
| `orth_` | unicode | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `orth_` | str | Verbatim text content (identical to `Lexeme.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `rank` | int | Sequential ID of the lexeme's lexical type, used to index into tables, e.g. for word vectors. |
|
||||
| `flags` | int | Container of the lexeme's binary flags. |
|
||||
| `norm` | int | The lexeme's norm, i.e. a normalized form of the lexeme text. |
|
||||
| `norm_` | unicode | The lexemes's norm, i.e. a normalized form of the lexeme text. |
|
||||
| `norm_` | str | The lexeme's norm, i.e. a normalized form of the lexeme text. |
|
||||
| `lower` | int | Lowercase form of the word. |
|
||||
| `lower_` | unicode | Lowercase form of the word. |
|
||||
| `lower_` | str | Lowercase form of the word. |
|
||||
| `shape` | int | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
||||
| `shape_` | unicode | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by d`, and sequences of the same character are truncated after length 4. For example,`"Xxxx"`or`"dd"`. |
|
||||
| `shape_` | str | Transform of the word's string, to show orthographic features. Alphabetic characters are replaced by `x` or `X`, and numeric characters are replaced by `d`, and sequences of the same character are truncated after length 4. For example, `"Xxxx"` or `"dd"`. |
|
||||
| `prefix` | int | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||
| `prefix_` | unicode | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||
| `prefix_` | str | Length-N substring from the start of the word. Defaults to `N=1`. |
|
||||
| `suffix` | int | Length-N substring from the end of the word. Defaults to `N=3`. |
|
||||
| `suffix_` | unicode | Length-N substring from the start of the word. Defaults to `N=3`. |
|
||||
| `suffix_` | str | Length-N substring from the end of the word. Defaults to `N=3`. |
|
||||
| `is_alpha` | bool | Does the lexeme consist of alphabetic characters? Equivalent to `lexeme.text.isalpha()`. |
|
||||
| `is_ascii` | bool | Does the lexeme consist of ASCII characters? Equivalent to `not any(ord(c) >= 128 for c in lexeme.text)`. |
|
||||
| `is_digit` | bool | Does the lexeme consist of digits? Equivalent to `lexeme.text.isdigit()`. |
|
||||
|
@ -159,7 +159,7 @@ The L2 norm of the lexeme's vector representation.
|
|||
| `is_oov` | bool | Is the lexeme out-of-vocabulary? |
|
||||
| `is_stop` | bool | Is the lexeme part of a "stop list"? |
|
||||
| `lang` | int | Language of the parent vocabulary. |
|
||||
| `lang_` | unicode | Language of the parent vocabulary. |
|
||||
| `lang_` | str | Language of the parent vocabulary. |
|
||||
| `prob` | float | Smoothed log probability estimate of the lexeme's word type (context-independent entry in the vocabulary). |
|
||||
| `cluster` | int | Brown cluster ID. |
|
||||
| `sentiment` | float | A scalar value indicating the positivity or negativity of the lexeme. |
|
||||
|
|
|
@ -56,10 +56,10 @@ Check if the lookups contain a table of a given name. Delegates to
|
|||
> assert "some_table" in lookups
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------- |
|
||||
| `name` | unicode | Name of the table. |
|
||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------- |
|
||||
| `name` | str | Name of the table. |
|
||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
|
||||
|
||||
## Lookups.tables {#tables tag="property"}
|
||||
|
||||
|
@ -91,7 +91,7 @@ exists.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ---------------------------------- |
|
||||
| `name` | unicode | Unique name of the table. |
|
||||
| `name` | str | Unique name of the table. |
|
||||
| `data` | dict | Optional data to add to the table. |
|
||||
| **RETURNS** | [`Table`](/api/lookups#table) | The newly added table. |
|
||||
|
||||
|
@ -110,7 +110,7 @@ Get a table from the lookups. Raises an error if the table doesn't exist.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ------------------ |
|
||||
| `name` | unicode | Name of the table. |
|
||||
| `name` | str | Name of the table. |
|
||||
| **RETURNS** | [`Table`](/api/lookups#table) | The table. |
|
||||
|
||||
## Lookups.remove_table {#remove_table tag="method"}
|
||||
|
@ -128,7 +128,7 @@ Remove a table from the lookups. Raises an error if the table doesn't exist.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------------------- | ---------------------------- |
|
||||
| `name` | unicode | Name of the table to remove. |
|
||||
| `name` | str | Name of the table to remove. |
|
||||
| **RETURNS** | [`Table`](/api/lookups#table) | The removed table. |
|
||||
|
||||
## Lookups.has_table {#has_table tag="method"}
|
||||
|
@ -144,10 +144,10 @@ Check if the lookups contain a table of a given name. Equivalent to
|
|||
> assert lookups.has_table("some_table")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------- |
|
||||
| `name` | unicode | Name of the table. |
|
||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------- |
|
||||
| `name` | str | Name of the table. |
|
||||
| **RETURNS** | bool | Whether a table of that name is in the lookups. |
|
||||
|
||||
## Lookups.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -191,9 +191,9 @@ which will be created if it doesn't exist.
|
|||
> lookups.to_disk("/path/to/lookups")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## Lookups.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -208,10 +208,10 @@ the file doesn't exist.
|
|||
> lookups.from_disk("/path/to/lookups")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Lookups` | The loaded lookups. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Lookups` | The loaded lookups. |
|
||||
|
||||
## Table {#table tag="class, ordereddict"}
|
||||
|
||||
|
@ -238,7 +238,7 @@ Initialize a new table.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ---------------------------------- |
|
||||
| `name` | unicode | Optional table name for reference. |
|
||||
| `name` | str | Optional table name for reference. |
|
||||
| **RETURNS** | `Table` | The newly constructed object. |
|
||||
|
||||
### Table.from_dict {#table.from_dict tag="classmethod"}
|
||||
|
@ -256,7 +256,7 @@ Initialize a new table from a dict.
|
|||
| Name | Type | Description |
|
||||
| ----------- | ------- | ---------------------------------- |
|
||||
| `data` | dict | The dictionary. |
|
||||
| `name` | unicode | Optional table name for reference. |
|
||||
| `name` | str | Optional table name for reference. |
|
||||
| **RETURNS** | `Table` | The newly constructed object. |
|
||||
|
||||
### Table.set {#table.set tag="method"}
|
||||
|
@ -273,10 +273,10 @@ Set a new key / value pair. String keys will be hashed. Same as
|
|||
> assert table["foo"] == "bar"
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------------- | ----------- |
|
||||
| `key` | unicode / int | The key. |
|
||||
| `value` | - | The value. |
|
||||
| Name | Type | Description |
|
||||
| ------- | --------- | ----------- |
|
||||
| `key` | str / int | The key. |
|
||||
| `value` | - | The value. |
|
||||
|
||||
### Table.to_bytes {#table.to_bytes tag="method"}
|
||||
|
||||
|
@ -313,6 +313,6 @@ Load a table from a bytestring.
|
|||
|
||||
| Name | Type | Description |
|
||||
| -------------- | --------------------------- | ----------------------------------------------------- |
|
||||
| `name` | unicode | Table name. |
|
||||
| `name` | str | Table name. |
|
||||
| `default_size` | int | Default size of bloom filters if no data is provided. |
|
||||
| `bloom` | `preshed.bloom.BloomFilter` | The bloom filters. |
|
||||
|
|
|
@ -125,10 +125,10 @@ Check whether the matcher contains rules for a match ID.
|
|||
> assert 'Rule' in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------------- |
|
||||
| `key` | unicode | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------- |
|
||||
| `key` | str | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
|
||||
## Matcher.add {#add tag="method" new="2"}
|
||||
|
||||
|
@ -153,7 +153,7 @@ overwritten.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | unicode | An ID for the thing you're matching. |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `*patterns` | list | Match pattern. A pattern consists of a list of dicts, where each dict describes a token. |
|
||||
|
||||
|
@ -188,9 +188,9 @@ exist.
|
|||
> assert "Rule" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ------- | ------------------------- |
|
||||
| `key` | unicode | The ID of the match rule. |
|
||||
| Name | Type | Description |
|
||||
| ----- | ---- | ------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
|
||||
## Matcher.get {#get tag="method" new="2"}
|
||||
|
||||
|
@ -204,7 +204,7 @@ Retrieve the pattern stored for a key. Returns the rule as an
|
|||
> on_match, patterns = matcher.get("Rule")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------------------------------------- |
|
||||
| `key` | unicode | The ID of the match rule. |
|
||||
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
| **RETURNS** | tuple | The rule, as an `(on_match, patterns)` tuple. |
|
||||
|
|
|
@ -133,10 +133,10 @@ Check whether the matcher contains rules for a match ID.
|
|||
> assert "OBAMA" in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ----------------------------------------------------- |
|
||||
| `key` | unicode | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ----------------------------------------------------- |
|
||||
| `key` | str | The match ID. |
|
||||
| **RETURNS** | bool | Whether the matcher contains rules for this match ID. |
|
||||
|
||||
## PhraseMatcher.add {#add tag="method"}
|
||||
|
||||
|
@ -162,7 +162,7 @@ overwritten.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------------------ | --------------------------------------------------------------------------------------------- |
|
||||
| `match_id` | unicode | An ID for the thing you're matching. |
|
||||
| `match_id` | str | An ID for the thing you're matching. |
|
||||
| `on_match` | callable or `None` | Callback function to act on matches. Takes the arguments `matcher`, `doc`, `i` and `matches`. |
|
||||
| `*docs` | `Doc` | `Doc` objects of the phrases to match. |
|
||||
|
||||
|
@ -198,6 +198,6 @@ does not exist.
|
|||
> assert "OBAMA" not in matcher
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----- | ------- | ------------------------- |
|
||||
| `key` | unicode | The ID of the match rule. |
|
||||
| Name | Type | Description |
|
||||
| ----- | ---- | ------------------------- |
|
||||
| `key` | str | The ID of the match rule. |
|
||||
|
|
|
@ -112,8 +112,8 @@ end of the pipeline and after all other components.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| `label` | unicode | The subtoken dependency label. Defaults to `"subtok"`. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------ |
|
||||
| `doc` | `Doc` | The `Doc` object to process, e.g. the `Doc` in the pipeline. |
|
||||
| `label` | str | The subtoken dependency label. Defaults to `"subtok"`. |
|
||||
| **RETURNS** | `Doc` | The modified `Doc` with merged subtokens. |
|
||||
|
|
|
@ -81,9 +81,9 @@ a file `sentencizer.json`. This also happens automatically when you save an
|
|||
> sentencizer.to_disk("/path/to/sentencizer.json")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | ---------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a file, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## Sentencizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -98,10 +98,10 @@ added to its pipeline.
|
|||
> sentencizer.from_disk("/path/to/sentencizer.json")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a JSON file. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Sentencizer` | The modified `Sentencizer` object. |
|
||||
|
||||
## Sentencizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
|
|
@ -110,7 +110,7 @@ For details, see the documentation on
|
|||
|
||||
| Name | Type | Description |
|
||||
| --------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
|
||||
| `name` | str | Name of the attribute to set by the extension. For example, `'my_attr'` will be available as `span._.my_attr`. |
|
||||
| `default` | - | Optional default value of the attribute if no getter or method is defined. |
|
||||
| `method` | callable | Set a custom method on the object, for example `span._.compare(other_span)`. |
|
||||
| `getter` | callable | Getter function that takes the object and returns an attribute value. Is called when the user accesses the `._` attribute. |
|
||||
|
@ -132,10 +132,10 @@ Look up a previously registered extension by name. Returns a 4-tuple
|
|||
> assert extension == (False, None, None, None)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | ------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the extension. |
|
||||
|
||||
## Span.has_extension {#has_extension tag="classmethod" new="2"}
|
||||
|
||||
|
@ -149,10 +149,10 @@ Check whether an extension has been registered on the `Span` class.
|
|||
> assert Span.has_extension("is_city")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------------------------ |
|
||||
| `name` | unicode | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------ |
|
||||
| `name` | str | Name of the extension to check. |
|
||||
| **RETURNS** | bool | Whether the extension has been registered. |
|
||||
|
||||
## Span.remove_extension {#remove_extension tag="classmethod" new="2.0.12"}
|
||||
|
||||
|
@ -167,10 +167,10 @@ Remove a previously registered extension.
|
|||
> assert not Span.has_extension("is_city")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------------------------------------------------------------- |
|
||||
| `name` | unicode | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------------------------------------- |
|
||||
| `name` | str | Name of the extension. |
|
||||
| **RETURNS** | tuple | A `(default, method, getter, setter)` tuple of the removed extension. |
|
||||
|
||||
## Span.char_span {#char_span tag="method" new="2.2.4"}
|
||||
|
||||
|
@ -497,16 +497,16 @@ The L2 norm of the span's vector representation.
|
|||
| `end` | int | The token offset for the end of the span. |
|
||||
| `start_char` | int | The character offset for the start of the span. |
|
||||
| `end_char` | int | The character offset for the end of the span. |
|
||||
| `text` | unicode | A unicode representation of the span text. |
|
||||
| `text_with_ws` | unicode | The text content of the span with a trailing whitespace character if the last token has one. |
|
||||
| `text`                                 | str          | A string representation of the span text.                                                                      |
|
||||
| `text_with_ws` | str | The text content of the span with a trailing whitespace character if the last token has one. |
|
||||
| `orth` | int | ID of the verbatim text content. |
|
||||
| `orth_` | unicode | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `orth_` | str | Verbatim text content (identical to `Span.text`). Exists mostly for consistency with the other attributes. |
|
||||
| `label` | int | The hash value of the span's label. |
|
||||
| `label_` | unicode | The span's label. |
|
||||
| `lemma_` | unicode | The span's lemma. |
|
||||
| `label_` | str | The span's label. |
|
||||
| `lemma_` | str | The span's lemma. |
|
||||
| `kb_id` | int | The hash value of the knowledge base ID referred to by the span. |
|
||||
| `kb_id_` | unicode | The knowledge base ID referred to by the span. |
|
||||
| `kb_id_` | str | The knowledge base ID referred to by the span. |
|
||||
| `ent_id` | int | The hash value of the named entity the token is an instance of. |
|
||||
| `ent_id_` | unicode | The string ID of the named entity the token is an instance of. |
|
||||
| `ent_id_` | str | The string ID of the named entity the token is an instance of. |
|
||||
| `sentiment` | float | A scalar value indicating the positivity or negativity of the span. |
|
||||
| `_` | `Underscore` | User space for adding custom [attribute extensions](/usage/processing-pipelines#custom-components-attributes). |
|
||||
|
|
|
@ -55,7 +55,7 @@ Retrieve a string from a given hash, or vice versa.
|
|||
| Name | Type | Description |
|
||||
| -------------- | ------------------------ | -------------------------- |
|
||||
| `string_or_id` | bytes, str or uint64     | The value to encode.       |
|
||||
| **RETURNS** | unicode or int | The value to be retrieved. |
|
||||
| **RETURNS** | str or int | The value to be retrieved. |
|
||||
|
||||
## StringStore.\_\_contains\_\_ {#contains tag="method"}
|
||||
|
||||
|
@ -69,10 +69,10 @@ Check whether a string is in the store.
|
|||
> assert not "cherry" in stringstore
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------- |
|
||||
| `string` | unicode | The string to check. |
|
||||
| **RETURNS** | bool | Whether the store contains the string. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------- |
|
||||
| `string` | str | The string to check. |
|
||||
| **RETURNS** | bool | Whether the store contains the string. |
|
||||
|
||||
## StringStore.\_\_iter\_\_ {#iter tag="method"}
|
||||
|
||||
|
@ -87,9 +87,9 @@ store will always include an empty string `''` at position `0`.
|
|||
> assert all_strings == ["apple", "orange"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ---------------------- |
|
||||
| **YIELDS** | unicode | A string in the store. |
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | ---------------------- |
|
||||
| **YIELDS** | str | A string in the store. |
|
||||
|
||||
## StringStore.add {#add tag="method" new="2"}
|
||||
|
||||
|
@ -106,10 +106,10 @@ Add a string to the `StringStore`.
|
|||
> assert stringstore["banana"] == banana_hash
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------ |
|
||||
| `string` | unicode | The string to add. |
|
||||
| **RETURNS** | uint64 | The string's hash value. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ------------------------ |
|
||||
| `string` | str | The string to add. |
|
||||
| **RETURNS** | uint64 | The string's hash value. |
|
||||
|
||||
## StringStore.to_disk {#to_disk tag="method" new="2"}
|
||||
|
||||
|
@ -121,9 +121,9 @@ Save the current state to a directory.
|
|||
> stringstore.to_disk("/path/to/strings")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## StringStore.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -136,10 +136,10 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> stringstore = StringStore().from_disk("/path/to/strings")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `StringStore` | The modified `StringStore` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `StringStore` | The modified `StringStore` object. |
|
||||
|
||||
## StringStore.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -185,7 +185,7 @@ Get a 64-bit hash for a given string.
|
|||
> assert hash_string("apple") == 8566208034543834098
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------- |
|
||||
| `string` | unicode | The string to hash. |
|
||||
| **RETURNS** | uint64 | The hash. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | ------------------- |
|
||||
| `string` | str | The string to hash. |
|
||||
| **RETURNS** | uint64 | The hash. |
|
||||
|
|
|
@ -229,10 +229,10 @@ Add a new label to the pipe.
|
|||
> tagger.add_label("MY_LABEL", {POS: 'NOUN'})
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------- | ------- | --------------------------------------------------------------- |
|
||||
| `label` | unicode | The label to add. |
|
||||
| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
|
||||
| Name | Type | Description |
|
||||
| -------- | ---- | --------------------------------------------------------------- |
|
||||
| `label` | str | The label to add. |
|
||||
| `values` | dict | Optional values to map to the label, e.g. a tag map dictionary. |
|
||||
|
||||
## Tagger.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -245,10 +245,10 @@ Serialize the pipe to disk.
|
|||
> tagger.to_disk("/path/to/tagger")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Tagger.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -261,11 +261,11 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
> tagger.from_disk("/path/to/tagger")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tagger` | The modified `Tagger` object. |
|
||||
|
||||
## Tagger.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
|
|
@ -44,7 +44,7 @@ shortcut for this and instantiate the component using its string name and
|
|||
| `vocab` | `Vocab` | The shared vocabulary. |
|
||||
| `model` | `thinc.neural.Model` / `True` | The model powering the pipeline component. If no model is supplied, the model is created when you call `begin_training`, `from_disk` or `from_bytes`. |
|
||||
| `exclusive_classes` | bool | Make categories mutually exclusive. Defaults to `False`. |
|
||||
| `architecture` | unicode | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
|
||||
| `architecture` | str | Model architecture to use, see [architectures](#architectures) for details. Defaults to `"ensemble"`. |
|
||||
| **RETURNS** | `TextCategorizer` | The newly constructed object. |
|
||||
|
||||
### Architectures {#architectures new="2.1"}
|
||||
|
@ -247,9 +247,9 @@ Add a new label to the pipe.
|
|||
> textcat.add_label("MY_LABEL")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------- | ------- | ----------------- |
|
||||
| `label` | unicode | The label to add. |
|
||||
| Name | Type | Description |
|
||||
| ------- | ---- | ----------------- |
|
||||
| `label` | str | The label to add. |
|
||||
|
||||
## TextCategorizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -262,10 +262,10 @@ Serialize the pipe to disk.
|
|||
> textcat.to_disk("/path/to/textcat")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## TextCategorizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -280,7 +280,7 @@ Load the pipe from disk. Modifies the object in place and returns it.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `TextCategorizer` | The modified `TextCategorizer` object. |
|
||||
|
||||
|
|
|
@ -34,15 +34,15 @@ the
|
|||
> tokenizer = nlp.Defaults.create_tokenizer(nlp)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | A storage container for lexical types. |
|
||||
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
|
||||
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
|
||||
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
|
||||
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
|
||||
| `token_match` | callable | A function matching the signature of `re.compile(string).match to find token matches. |
|
||||
| **RETURNS** | `Tokenizer` | The newly constructed object. |
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ----------- | ------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | A storage container for lexical types. |
|
||||
| `rules` | dict | Exceptions and special-cases for the tokenizer. |
|
||||
| `prefix_search` | callable | A function matching the signature of `re.compile(string).search` to match prefixes. |
|
||||
| `suffix_search` | callable | A function matching the signature of `re.compile(string).search` to match suffixes. |
|
||||
| `infix_finditer` | callable | A function matching the signature of `re.compile(string).finditer` to find infixes. |
|
||||
| `token_match`    | callable    | A function matching the signature of `re.compile(string).match` to find token matches. |
|
||||
| **RETURNS** | `Tokenizer` | The newly constructed object. |
|
||||
|
||||
## Tokenizer.\_\_call\_\_ {#call tag="method"}
|
||||
|
||||
|
@ -55,10 +55,10 @@ Tokenize a string.
|
|||
> assert len(tokens) == 4
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | --------------------------------------- |
|
||||
| `string` | unicode | The string to tokenize. |
|
||||
| **RETURNS** | `Doc` | A container for linguistic annotations. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ----- | --------------------------------------- |
|
||||
| `string` | str | The string to tokenize. |
|
||||
| **RETURNS** | `Doc` | A container for linguistic annotations. |
|
||||
|
||||
## Tokenizer.pipe {#pipe tag="method"}
|
||||
|
||||
|
@ -82,20 +82,20 @@ Tokenize a stream of texts.
|
|||
|
||||
Find internal split points of the string.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | unicode | The string to split. |
|
||||
| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `string` | str | The string to split. |
|
||||
| **RETURNS** | list | A list of `re.MatchObject` objects that have `.start()` and `.end()` methods, denoting the placement of internal segment separators, e.g. hyphens. |
|
||||
|
||||
## Tokenizer.find_prefix {#find_prefix tag="method"}
|
||||
|
||||
Find the length of a prefix that should be segmented from the string, or `None`
|
||||
if no prefix rules match.
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | ------------------------------------------------------ |
|
||||
| `string` | unicode | The string to segment. |
|
||||
| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | ------------------------------------------------------ |
|
||||
| `string` | str | The string to segment. |
|
||||
| **RETURNS** | int | The length of the prefix if present, otherwise `None`. |
|
||||
|
||||
## Tokenizer.find_suffix {#find_suffix tag="method"}
|
||||
|
||||
|
@ -104,7 +104,7 @@ if no suffix rules match.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------------------------------------ |
|
||||
| `string` | unicode | The string to segment. |
|
||||
| `string` | str | The string to segment. |
|
||||
| **RETURNS** | int / `None` | The length of the suffix if present, otherwise `None`. |
|
||||
|
||||
## Tokenizer.add_special_case {#add_special_case tag="method"}
|
||||
|
@ -125,7 +125,7 @@ and examples.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------------- | -------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
|
||||
| `string` | unicode | The string to specially tokenize. |
|
||||
| `string` | str | The string to specially tokenize. |
|
||||
| `token_attrs` | iterable | A sequence of dicts, where each dict describes a token and its attributes. The `ORTH` fields of the attributes must exactly match the string when they are concatenated. |
|
||||
|
||||
## Tokenizer.explain {#explain tag="method"}
|
||||
|
@ -142,10 +142,10 @@ produced are identical to `Tokenizer.__call__` except for whitespace tokens.
|
|||
> assert [t[1] for t in tok_exp] == ["(", "do", "n't", ")"]
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------------| -------- | --------------------------------------------------- |
|
||||
| `string` | unicode | The string to tokenize with the debugging tokenizer |
|
||||
| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | --------------------------------------------------- |
|
||||
| `string` | str | The string to tokenize with the debugging tokenizer |
|
||||
| **RETURNS** | list | A list of `(pattern_string, token_string)` tuples |
|
||||
|
||||
## Tokenizer.to_disk {#to_disk tag="method"}
|
||||
|
||||
|
@ -158,10 +158,10 @@ Serialize the tokenizer to disk.
|
|||
> tokenizer.to_disk("/path/to/tokenizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Tokenizer.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -174,11 +174,11 @@ Load the tokenizer from disk. Modifies the object in place and returns it.
|
|||
> tokenizer.from_disk("/path/to/tokenizer")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Tokenizer` | The modified `Tokenizer` object. |
|
||||
|
||||
## Tokenizer.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
@ -217,14 +217,14 @@ it.
|
|||
|
||||
## Attributes {#attributes}
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------- | --------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
|
||||
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
|
||||
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
|
||||
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
|
||||
| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
|
||||
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
|
||||
| Name | Type | Description |
|
||||
| ---------------- | ------- | -------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `vocab` | `Vocab` | The vocab object of the parent `Doc`. |
|
||||
| `prefix_search` | - | A function to find segment boundaries from the start of a string. Returns the length of the segment, or `None`. |
|
||||
| `suffix_search` | - | A function to find segment boundaries from the end of a string. Returns the length of the segment, or `None`. |
|
||||
| `infix_finditer` | - | A function to find internal segment separators, e.g. hyphens. Returns a (possibly empty) list of `re.MatchObject` objects. |
|
||||
| `token_match` | - | A function matching the signature of `re.compile(string).match` to find token matches. Returns an `re.MatchObject` or `None`. |
|
||||
| `rules` | dict | A dictionary of tokenizer exceptions and special cases. |
|
||||
|
||||
## Serialization fields {#serialization-fields}
|
||||
|
||||
|
|
|
@ -32,11 +32,11 @@ class. The data will be loaded in via
|
|||
> nlp = spacy.load("en_core_web_sm", disable=["parser", "tagger"])
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | --------------------------------------------------------------------------------- |
|
||||
| `name` | unicode / `Path` | Model to load, i.e. shortcut link, package name or path. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | --------------------------------------------------------------------------------- |
|
||||
| `name` | str / `Path` | Model to load, i.e. shortcut link, package name or path. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Language` | A `Language` object with the loaded model. |
|
||||
|
||||
Essentially, `spacy.load()` is a convenience wrapper that reads the language ID
|
||||
and pipeline components from a model's `meta.json`, initializes the `Language`
|
||||
|
@ -79,7 +79,7 @@ Create a blank model of a given language class. This function is the twin of
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------- | ------------------------------------------------------------------------------------------------ |
|
||||
| `name` | unicode | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
||||
| `name` | str | [ISO code](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) of the language class to load. |
|
||||
| `disable` | list | Names of pipeline components to [disable](/usage/processing-pipelines#disabling). |
|
||||
| **RETURNS** | `Language` | An empty `Language` object of the appropriate subclass. |
|
||||
|
||||
|
@ -98,10 +98,10 @@ meta data as a dictionary instead, you can use the `meta` attribute on your
|
|||
> spacy.info("de", markdown=True)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ---------- | ------- | ------------------------------------------------------------- |
|
||||
| `model` | unicode | A model, i.e. shortcut link, package name or path (optional). |
|
||||
| `markdown` | bool | Print information as Markdown. |
|
||||
| Name | Type | Description |
|
||||
| ---------- | ---- | ------------------------------------------------------------- |
|
||||
| `model` | str | A model, i.e. shortcut link, package name or path (optional). |
|
||||
| `markdown` | bool | Print information as Markdown. |
|
||||
|
||||
### spacy.explain {#spacy.explain tag="function"}
|
||||
|
||||
|
@ -122,10 +122,10 @@ list of available terms, see
|
|||
> # world NN noun, singular or mass
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------------------------- |
|
||||
| `term` | unicode | Term to explain. |
|
||||
| **RETURNS** | unicode | The explanation, or `None` if not found in the glossary. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------------- |
|
||||
| `term` | str | Term to explain. |
|
||||
| **RETURNS** | str | The explanation, or `None` if not found in the glossary. |
|
||||
|
||||
### spacy.prefer_gpu {#spacy.prefer_gpu tag="function" new="2.0.14"}
|
||||
|
||||
|
@ -189,13 +189,13 @@ browser. Will run a simple web server.
|
|||
| Name | Type | Description | Default |
|
||||
| --------- | ------------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ----------- |
|
||||
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
|
||||
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `page` | bool | Render markup as full HTML page. | `True` |
|
||||
| `minify` | bool | Minify HTML markup. | `False` |
|
||||
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
|
||||
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
|
||||
| `port` | int | Port to serve visualization. | `5000` |
|
||||
| `host` | unicode | Host to serve visualization. | `'0.0.0.0'` |
|
||||
| `host` | str | Host to serve visualization. | `'0.0.0.0'` |
|
||||
|
||||
### displacy.render {#displacy.render tag="method" new="2"}
|
||||
|
||||
|
@ -214,13 +214,13 @@ Render a dependency parse tree or named entity visualization.
|
|||
| Name | Type | Description | Default |
|
||||
| ----------- | ------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------- |
|
||||
| `docs` | list, `Doc`, `Span` | Document(s) to visualize. |
|
||||
| `style` | unicode | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `style` | str | Visualization style, `'dep'` or `'ent'`. | `'dep'` |
|
||||
| `page` | bool | Render markup as full HTML page. | `False` |
|
||||
| `minify` | bool | Minify HTML markup. | `False` |
|
||||
| `jupyter` | bool | Explicitly enable or disable "[Jupyter](http://jupyter.org/) mode" to return markup ready to be rendered in a notebook. Detected automatically if `None`. | `None` |
|
||||
| `options` | dict | [Visualizer-specific options](#displacy_options), e.g. colors. | `{}` |
|
||||
| `manual` | bool | Don't parse `Doc` and instead, expect a dict or list of dicts. [See here](/usage/visualizers#manual-usage) for formats and examples. | `False` |
|
||||
| **RETURNS** | unicode | Rendered HTML markup. |
|
||||
| **RETURNS** | str | Rendered HTML markup. |
|
||||
|
||||
### Visualizer options {#displacy_options}
|
||||
|
||||
|
@ -236,22 +236,22 @@ If a setting is not present in the options, the default value will be used.
|
|||
> displacy.serve(doc, style="dep", options=options)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------------------------------ | ------- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts. | `False` |
|
||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | unicode | Text color (HEX, RGB or color names). | `'#000000'` |
|
||||
| `bg` | unicode | Background color (HEX, RGB or color names). | `'#ffffff'` |
|
||||
| `font` | unicode | Font name or font family for all text. | `'Arial'` |
|
||||
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
|
||||
| `arrow_stroke` | int | Width of arrow path in px. | `2` |
|
||||
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
|
||||
| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
|
||||
| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
|
||||
| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
|
||||
| Name | Type | Description | Default |
|
||||
| ------------------------------------------ | ---- | --------------------------------------------------------------------------------------------------------------- | ----------------------- |
|
||||
| `fine_grained` | bool | Use fine-grained part-of-speech tags (`Token.tag_`) instead of coarse-grained tags (`Token.pos_`). | `False` |
|
||||
| `add_lemma` <Tag variant="new">2.2.4</Tag> | bool | Print the lemmas in a separate row below the token texts. | `False` |
|
||||
| `collapse_punct` | bool | Attach punctuation to tokens. Can make the parse more readable, as it prevents long arcs to attach punctuation. | `True` |
|
||||
| `collapse_phrases` | bool | Merge noun phrases into one token. | `False` |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | str | Text color (HEX, RGB or color names). | `'#000000'` |
|
||||
| `bg` | str | Background color (HEX, RGB or color names). | `'#ffffff'` |
|
||||
| `font` | str | Font name or font family for all text. | `'Arial'` |
|
||||
| `offset_x` | int | Spacing on left side of the SVG in px. | `50` |
|
||||
| `arrow_stroke` | int | Width of arrow path in px. | `2` |
|
||||
| `arrow_width` | int | Width of arrow head in px. | `10` / `8` (compact) |
|
||||
| `arrow_spacing` | int | Spacing between arrows in px to avoid overlaps. | `20` / `12` (compact) |
|
||||
| `word_spacing` | int | Vertical spacing between words and arcs in px. | `45` |
|
||||
| `distance` | int | Distance between words in px. | `175` / `150` (compact) |
|
||||
|
||||
#### Named Entity Visualizer options {#displacy_options-ent}
|
||||
|
||||
|
@ -263,11 +263,11 @@ If a setting is not present in the options, the default value will be used.
|
|||
> displacy.serve(doc, style="ent", options=options)
|
||||
> ```
|
||||
|
||||
| Name | Type | Description | Default |
|
||||
| --------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
|
||||
| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
|
||||
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
|
||||
| `template` <Tag variant="new">2.2</Tag> | unicode | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
|
||||
| Name | Type | Description | Default |
|
||||
| --------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------ |
|
||||
| `ents` | list | Entity types to highlight (`None` for all types). | `None` |
|
||||
| `colors` | dict | Color overrides. Entity types in uppercase should be mapped to color names or values. | `{}` |
|
||||
| `template` <Tag variant="new">2.2</Tag> | str | Optional template to overwrite the HTML used to render entity spans. Should be a format string and can use `{bg}`, `{text}` and `{label}`. | see [`templates.py`](https://github.com/explosion/spaCy/blob/master/spacy/displacy/templates.py) |
|
||||
|
||||
By default, displaCy comes with colors for all
|
||||
[entity types supported by spaCy](/api/annotation#named-entities). If you're
|
||||
|
@ -308,9 +308,9 @@ Set custom path to the data directory where spaCy looks for models.
|
|||
> # PosixPath('/custom/path')
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------- |
|
||||
| `path` | unicode / `Path` | Path to new data directory. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------- |
|
||||
| `path` | str / `Path` | Path to new data directory. |
|
||||
|
||||
### util.get_lang_class {#util.get_lang_class tag="function"}
|
||||
|
||||
|
@ -330,7 +330,7 @@ you can use the [`set_lang_class`](/api/top-level#util.set_lang_class) helper.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------- | -------------------------------------- |
|
||||
| `lang` | unicode | Two-letter language code, e.g. `'en'`. |
|
||||
| `lang` | str | Two-letter language code, e.g. `'en'`. |
|
||||
| **RETURNS** | `Language` | Language class. |
|
||||
|
||||
### util.set_lang_class {#util.set_lang_class tag="function"}
|
||||
|
@ -352,7 +352,7 @@ the two-letter language code.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------- | -------------------------------------- |
|
||||
| `name` | unicode | Two-letter language code, e.g. `'en'`. |
|
||||
| `name` | str | Two-letter language code, e.g. `'en'`. |
|
||||
| `cls` | `Language` | The language class, e.g. `English`. |
|
||||
|
||||
### util.lang_class_is_loaded {#util.lang_class_is_loaded tag="function" new="2.1"}
|
||||
|
@ -368,10 +368,10 @@ loaded lazily, to avoid expensive setup code associated with the language data.
|
|||
> assert util.lang_class_is_loaded("de") is False
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------- |
|
||||
| `name` | unicode | Two-letter language code, e.g. `'en'`. |
|
||||
| **RETURNS** | bool | Whether the class has been loaded. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------- |
|
||||
| `name` | str | Two-letter language code, e.g. `'en'`. |
|
||||
| **RETURNS** | bool | Whether the class has been loaded. |
|
||||
|
||||
### util.load_model {#util.load_model tag="function" new="2"}
|
||||
|
||||
|
@ -392,7 +392,7 @@ in via [`Language.from_disk()`](/api/language#from_disk).
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------- | -------------------------------------------------------- |
|
||||
| `name` | unicode | Package name, shortcut link or model path. |
|
||||
| `name` | str | Package name, shortcut link or model path. |
|
||||
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
|
||||
| **RETURNS** | `Language` | `Language` class with the loaded model. |
|
||||
|
||||
|
@ -411,7 +411,7 @@ it easy to test a new model that you haven't packaged yet.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------- | ---------------------------------------------------------------------------------------------------- |
|
||||
| `model_path` | unicode | Path to model data directory. |
|
||||
| `model_path` | str | Path to model data directory. |
|
||||
| `meta` | dict | Model meta data. If `False`, spaCy will try to load the meta from a meta.json in the same directory. |
|
||||
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
|
||||
| **RETURNS** | `Language` | `Language` class with the loaded model. |
|
||||
|
@ -432,7 +432,7 @@ A helper function to use in the `load()` method of a model package's
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---------- | -------------------------------------------------------- |
|
||||
| `init_file` | unicode | Path to model's `__init__.py`, i.e. `__file__`. |
|
||||
| `init_file` | str | Path to model's `__init__.py`, i.e. `__file__`. |
|
||||
| `**overrides` | - | Specific overrides, like pipeline components to disable. |
|
||||
| **RETURNS** | `Language` | `Language` class with the loaded model. |
|
||||
|
||||
|
@ -446,10 +446,10 @@ Get a model's meta.json from a directory path and validate its contents.
|
|||
> meta = util.get_model_meta("/path/to/model")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | ------------------------ |
|
||||
| `path` | unicode / `Path` | Path to model directory. |
|
||||
| **RETURNS** | dict | The model's meta data. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | ------------------------ |
|
||||
| `path` | str / `Path` | Path to model directory. |
|
||||
| **RETURNS** | dict | The model's meta data. |
|
||||
|
||||
### util.is_package {#util.is_package tag="function"}
|
||||
|
||||
|
@ -463,10 +463,10 @@ Check if string maps to a package installed via pip. Mainly used to validate
|
|||
> util.is_package("xyz") # False
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------------- |
|
||||
| `name` | unicode | Name of package. |
|
||||
| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------ | -------------------------------------------- |
|
||||
| `name` | str | Name of package. |
|
||||
| **RETURNS** | `bool` | `True` if installed package, `False` if not. |
|
||||
|
||||
### util.get_package_path {#util.get_package_path tag="function" new="2"}
|
||||
|
||||
|
@ -480,10 +480,10 @@ Get path to an installed package. Mainly used to resolve the location of
|
|||
> # /usr/lib/python3.6/site-packages/en_core_web_sm
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------- | -------------------------------- |
|
||||
| `package_name` | unicode | Name of installed package. |
|
||||
| **RETURNS** | `Path` | Path to model package directory. |
|
||||
| Name | Type | Description |
|
||||
| -------------- | ------ | -------------------------------- |
|
||||
| `package_name` | str | Name of installed package. |
|
||||
| **RETURNS** | `Path` | Path to model package directory. |
|
||||
|
||||
### util.is_in_jupyter {#util.is_in_jupyter tag="function" new="2"}
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ you can add vectors to later.
|
|||
| `data` | `ndarray[ndim=1, dtype='float32']` | The vector data. |
|
||||
| `keys` | iterable | A sequence of keys aligned with the data. |
|
||||
| `shape` | tuple | Size of the table as `(n_entries, n_columns)`, the number of entries and number of columns. Not required if you're initializing the object with `data` and `keys`. |
|
||||
| `name` | unicode | A name to identify the vectors table. |
|
||||
| `name` | str | A name to identify the vectors table. |
|
||||
| **RETURNS** | `Vectors` | The newly created object. |
|
||||
|
||||
## Vectors.\_\_getitem\_\_ {#getitem tag="method"}
|
||||
|
@ -140,7 +140,7 @@ mapping separately. If you need to manage the strings, you should use the
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------------------------- | ----------------------------------------------------- |
|
||||
| `key` | unicode / int | The key to add. |
|
||||
| `key` | str / int | The key to add. |
|
||||
| `vector` | `ndarray[ndim=1, dtype='float32']` | An optional vector to add for the key. |
|
||||
| `row` | int | An optional row number of a vector to map the key to. |
|
||||
| **RETURNS** | int | The row the vector was added to. |
|
||||
|
@ -227,7 +227,7 @@ Look up one or more keys by row, or vice versa.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------------------------------- | ------------------------------------------------------------------------ |
|
||||
| `key` | unicode / int | Find the row that the given key points to. Returns int, `-1` if missing. |
|
||||
| `key` | str / int | Find the row that the given key points to. Returns int, `-1` if missing. |
|
||||
| `keys` | iterable | Find rows that the keys point to. Returns `ndarray`. |
|
||||
| `row` | int | Find the first key that points to the row. Returns int. |
|
||||
| `rows` | iterable | Find the keys that point to the rows. Returns `ndarray`. |
|
||||
|
@ -337,9 +337,9 @@ Save the current state to a directory.
|
|||
>
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ------ | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| Name | Type | Description |
|
||||
| ------ | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
|
||||
## Vectors.from_disk {#from_disk tag="method"}
|
||||
|
||||
|
@ -352,10 +352,10 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> vectors.from_disk("/path/to/vectors")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Vectors` | The modified `Vectors` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| **RETURNS** | `Vectors` | The modified `Vectors` object. |
|
||||
|
||||
## Vectors.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
|
|
@ -27,7 +27,7 @@ Create the vocabulary.
|
|||
| `tag_map` | dict | A dictionary mapping fine-grained tags to coarse-grained parts-of-speech, and optionally morphological attributes. |
|
||||
| `lemmatizer` | object | A lemmatizer. Defaults to `None`. |
|
||||
| `strings` | `StringStore` / list | A [`StringStore`](/api/stringstore) that maps strings to hash values, and vice versa, or a list of strings. |
|
||||
| `vectors_name` <Tag variant="new">2.2</Tag> | unicode | A name to identify the vectors table. |
|
||||
| `vectors_name` <Tag variant="new">2.2</Tag> | str | A name to identify the vectors table. |
|
||||
| **RETURNS** | `Vocab` | The newly constructed object. |
|
||||
|
||||
## Vocab.\_\_len\_\_ {#len tag="method"}
|
||||
|
@ -91,10 +91,10 @@ given string, you need to look it up in
|
|||
> assert oov not in nlp.vocab
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------- | -------------------------------------------------- |
|
||||
| `string` | unicode | The ID string. |
|
||||
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---- | -------------------------------------------------- |
|
||||
| `string` | str | The ID string. |
|
||||
| **RETURNS** | bool | Whether the string has an entry in the vocabulary. |
|
||||
|
||||
## Vocab.add_flag {#add_flag tag="method"}
|
||||
|
||||
|
@ -117,7 +117,7 @@ using `token.check_flag(flag_id)`.
|
|||
|
||||
| Name | Type | Description |
|
||||
| ------------- | ---- | ----------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `flag_getter` | dict | A function `f(unicode) -> bool`, to get the flag value. |
|
||||
| `flag_getter` | callable | A function `f(str) -> bool`, to get the flag value. |
|
||||
| `flag_id` | int | An integer between 1 and 63 (inclusive), specifying the bit at which the flag will be stored. If `-1`, the lowest available bit will be chosen. |
|
||||
| **RETURNS** | int | The integer ID by which the flag value can be checked. |
|
||||
|
||||
|
@ -227,10 +227,10 @@ Save the current state to a directory.
|
|||
> nlp.vocab.to_disk("/path/to/vocab")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| --------- | ---------------- | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| Name | Type | Description |
|
||||
| --------- | ------------ | --------------------------------------------------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory, which will be created if it doesn't exist. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
|
||||
## Vocab.from_disk {#from_disk tag="method" new="2"}
|
||||
|
||||
|
@ -243,11 +243,11 @@ Loads state from a directory. Modifies the object in place and returns it.
|
|||
> vocab = Vocab().from_disk("/path/to/vocab")
|
||||
> ```
|
||||
|
||||
| Name | Type | Description |
|
||||
| ----------- | ---------------- | -------------------------------------------------------------------------- |
|
||||
| `path` | unicode / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
||||
| Name | Type | Description |
|
||||
| ----------- | ------------ | -------------------------------------------------------------------------- |
|
||||
| `path` | str / `Path` | A path to a directory. Paths may be either strings or `Path`-like objects. |
|
||||
| `exclude` | list | String names of [serialization fields](#serialization-fields) to exclude. |
|
||||
| **RETURNS** | `Vocab` | The modified `Vocab` object. |
|
||||
|
||||
## Vocab.to_bytes {#to_bytes tag="method"}
|
||||
|
||||
|
|
|
@ -327,11 +327,11 @@ displaCy in our [online demo](https://explosion.ai/demos/displacy)..
|
|||
### Disabling the parser {#disabling}
|
||||
|
||||
In the [default models](/models), the parser is loaded and enabled as part of
|
||||
the [standard processing pipeline](/usage/processing-pipelines). If you don't need
|
||||
any of the syntactic information, you should disable the parser. Disabling the
|
||||
parser will make spaCy load and run much faster. If you want to load the parser,
|
||||
but need to disable it for specific documents, you can also control its use on
|
||||
the `nlp` object.
|
||||
the [standard processing pipeline](/usage/processing-pipelines). If you don't
|
||||
need any of the syntactic information, you should disable the parser. Disabling
|
||||
the parser will make spaCy load and run much faster. If you want to load the
|
||||
parser, but need to disable it for specific documents, you can also control its
|
||||
use on the `nlp` object.
|
||||
|
||||
```python
|
||||
nlp = spacy.load("en_core_web_sm", disable=["parser"])
|
||||
|
@ -988,10 +988,10 @@ nlp = spacy.load("en_core_web_sm")
|
|||
nlp.tokenizer = my_tokenizer
|
||||
```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ------- | ------------------------- |
|
||||
| `text` | unicode | The raw text to tokenize. |
|
||||
| **RETURNS** | `Doc` | The tokenized document. |
|
||||
| Argument | Type | Description |
|
||||
| ----------- | ----- | ------------------------- |
|
||||
| `text` | str | The raw text to tokenize. |
|
||||
| **RETURNS** | `Doc` | The tokenized document. |
|
||||
|
||||
<Infobox title="Important note: using a custom tokenizer" variant="warning">
|
||||
|
||||
|
|
|
@ -272,16 +272,16 @@ doc = nlp("I won't have named entities")
|
|||
disabled.restore()
|
||||
```
|
||||
|
||||
If you want to disable all pipes except for one or a few, you can use the `enable`
|
||||
keyword. Just like the `disable` keyword, it takes a list of pipe names, or a string
|
||||
defining just one pipe.
|
||||
If you want to disable all pipes except for one or a few, you can use the
|
||||
`enable` keyword. Just like the `disable` keyword, it takes a list of pipe
|
||||
names, or a string defining just one pipe.
|
||||
|
||||
```python
|
||||
# Enable only the parser
|
||||
with nlp.select_pipes(enable="parser"):
|
||||
doc = nlp("I will only be parsed")
|
||||
```
|
||||
|
||||
|
||||
Finally, you can also use the [`remove_pipe`](/api/language#remove_pipe) method
|
||||
to remove pipeline components from an existing pipeline, the
|
||||
[`rename_pipe`](/api/language#rename_pipe) method to rename them, or the
|
||||
|
@ -349,12 +349,12 @@ last** in the pipeline, or define a **custom name**. If no name is set and no
|
|||
> nlp.add_pipe(my_component, before="parser")
|
||||
> ```
|
||||
|
||||
| Argument | Type | Description |
|
||||
| -------- | ------- | ------------------------------------------------------------------------ |
|
||||
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
|
||||
| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
|
||||
| `before` | unicode | String name of component to add the new component **before**. |
|
||||
| `after` | unicode | String name of component to add the new component **after**. |
|
||||
| Argument | Type | Description |
|
||||
| -------- | ---- | ------------------------------------------------------------------------ |
|
||||
| `last` | bool | If set to `True`, component is added **last** in the pipeline (default). |
|
||||
| `first` | bool | If set to `True`, component is added **first** in the pipeline. |
|
||||
| `before` | str | String name of component to add the new component **before**. |
|
||||
| `after` | str | String name of component to add the new component **after**. |
|
||||
|
||||
### Example: A simple pipeline component {#custom-components-simple}
|
||||
|
||||
|
|
|
@ -157,19 +157,19 @@ The available token pattern keys correspond to a number of
|
|||
[`Token` attributes](/api/token#attributes). The supported attributes for
|
||||
rule-based matching are:
|
||||
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------------------- | ------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `ORTH` | unicode | The exact verbatim text of a token. |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | unicode | The exact verbatim text of a token. |
|
||||
| `LOWER` | unicode | The lowercase form of the token text. |
|
||||
| `LENGTH` | int | The length of the token text. |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | unicode | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
||||
| `ENT_TYPE` | unicode | The token's entity label. |
|
||||
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
||||
| Attribute | Type | Description |
|
||||
| -------------------------------------- | ---- | ------------------------------------------------------------------------------------------------------ |
|
||||
| `ORTH` | str | The exact verbatim text of a token. |
|
||||
| `TEXT` <Tag variant="new">2.1</Tag> | str | The exact verbatim text of a token. |
|
||||
| `LOWER` | str | The lowercase form of the token text. |
|
||||
| `LENGTH` | int | The length of the token text. |
|
||||
| `IS_ALPHA`, `IS_ASCII`, `IS_DIGIT` | bool | Token text consists of alphabetic characters, ASCII characters, digits. |
|
||||
| `IS_LOWER`, `IS_UPPER`, `IS_TITLE` | bool | Token text is in lowercase, uppercase, titlecase. |
|
||||
| `IS_PUNCT`, `IS_SPACE`, `IS_STOP` | bool | Token is punctuation, whitespace, stop word. |
|
||||
| `LIKE_NUM`, `LIKE_URL`, `LIKE_EMAIL` | bool | Token text resembles a number, URL, email. |
|
||||
| `POS`, `TAG`, `DEP`, `LEMMA`, `SHAPE` | str | The token's simple and extended part-of-speech tag, dependency label, lemma, shape. |
|
||||
| `ENT_TYPE` | str | The token's entity label. |
|
||||
| `_` <Tag variant="new">2.1</Tag> | dict | Properties in [custom extension attributes](/usage/processing-pipelines#custom-components-attributes). |
|
||||
|
||||
<Accordion title="Does it matter if the attribute names are uppercase or lowercase?">
|
||||
|
||||
|
@ -1101,21 +1101,28 @@ powerful model packages with binary weights _and_ rules included!
|
|||
|
||||
### Using a large number of phrase patterns {#entityruler-large-phrase-patterns new="2.2.4"}
|
||||
|
||||
When using a large amount of **phrase patterns** (roughly > 10000) it's useful to understand how the `add_patterns` function of the EntityRuler works. For each **phrase pattern**,
|
||||
the EntityRuler calls the nlp object to construct a doc object. This happens in case you try
|
||||
to add the EntityRuler at the end of an existing pipeline with, for example, a POS tagger and want to
|
||||
extract matches based on the pattern's POS signature.
|
||||
When using a large number of **phrase patterns** (roughly > 10000), it's useful
|
||||
to understand how the `add_patterns` function of the EntityRuler works. For each
|
||||
**phrase pattern**, the EntityRuler calls the nlp object to construct a doc
|
||||
object. This is done in case you try to add the EntityRuler at the end of an
|
||||
existing pipeline with, for example, a POS tagger and want to extract matches
|
||||
based on the pattern's POS signature.
|
||||
|
||||
In this case you would pass a config value of `phrase_matcher_attr="POS"` for the EntityRuler.
|
||||
In this case you would pass a config value of `phrase_matcher_attr="POS"` for
|
||||
the EntityRuler.
|
||||
|
||||
Running the full language pipeline across every pattern in a large list scales linearly and can therefore take a long time on large amounts of phrase patterns.
|
||||
Running the full language pipeline across every pattern in a large list scales
|
||||
linearly and can therefore take a long time with large numbers of phrase patterns.
|
||||
|
||||
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use nlp.pipe on all phrase patterns resulting in about a 10x-20x speed up with 5,000-100,000 phrase patterns respectively.
|
||||
As of spaCy 2.2.4 the `add_patterns` function has been refactored to use
|
||||
`nlp.pipe` on all phrase patterns, resulting in about a 10x-20x speedup with
|
||||
5,000-100,000 phrase patterns respectively.
|
||||
|
||||
Even with this speedup (but especially if you're using an older version) the `add_patterns` function can still take a long time.
|
||||
Even with this speedup (but especially if you're using an older version) the
|
||||
`add_patterns` function can still take a long time.
|
||||
|
||||
An easy workaround to make this function run faster is disabling the other language pipes
|
||||
while adding the phrase patterns.
|
||||
An easy workaround to make this function run faster is disabling the other
|
||||
language pipes while adding the phrase patterns.
|
||||
|
||||
```python
|
||||
entityruler = EntityRuler(nlp)
|
||||
|
|
|
@ -94,8 +94,8 @@ docs = list(doc_bin.get_docs(nlp.vocab))
|
|||
|
||||
If `store_user_data` is set to `True`, the `Doc.user_data` will be serialized as
|
||||
well, which includes the values of
|
||||
[extension attributes](/usage/processing-pipelines#custom-components-attributes) (if
|
||||
they're serializable with msgpack).
|
||||
[extension attributes](/usage/processing-pipelines#custom-components-attributes)
|
||||
(if they're serializable with msgpack).
|
||||
|
||||
<Infobox title="Important note on serializing extension attributes" variant="warning">
|
||||
|
||||
|
@ -666,10 +666,10 @@ and lets you customize how the model should be initialized and loaded. You can
|
|||
define the language data to be loaded and the
|
||||
[processing pipeline](/usage/processing-pipelines) to execute.
|
||||
|
||||
| Setting | Type | Description |
|
||||
| ---------- | ------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | unicode | ID of the language class to initialize. |
|
||||
| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
|
||||
| Setting | Type | Description |
|
||||
| ---------- | ---- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `lang` | str | ID of the language class to initialize. |
|
||||
| `pipeline` | list | A list of strings mapping to the IDs of pipeline factories to apply in that order. If not set, spaCy's [default pipeline](/usage/processing-pipelines) will be used. |
|
||||
|
||||
The `load()` method that comes with our model package templates will take care
|
||||
of putting all this together and returning a `Language` object with the loaded
|
||||
|
|
|
@ -67,12 +67,12 @@ arcs.
|
|||
|
||||
</Infobox>
|
||||
|
||||
| Argument | Type | Description | Default |
|
||||
| --------- | ------- | ----------------------------------------------------------- | ----------- |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | unicode | Text color (HEX, RGB or color names). | `"#000000"` |
|
||||
| `bg` | unicode | Background color (HEX, RGB or color names). | `"#ffffff"` |
|
||||
| `font` | unicode | Font name or font family for all text. | `"Arial"` |
|
||||
| Argument | Type | Description | Default |
|
||||
| --------- | ---- | ----------------------------------------------------------- | ----------- |
|
||||
| `compact` | bool | "Compact mode" with square arrows that takes up less space. | `False` |
|
||||
| `color` | str | Text color (HEX, RGB or color names). | `"#000000"` |
|
||||
| `bg` | str | Background color (HEX, RGB or color names). | `"#ffffff"` |
|
||||
| `font` | str | Font name or font family for all text. | `"Arial"` |
|
||||
|
||||
For a list of all available options, see the
|
||||
[`displacy` API documentation](/api/top-level#displacy_options).
|
||||
|
|
Loading…
Reference in New Issue
Block a user