💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc. (See the sketch after this list of changes.)

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None
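The decorator pattern described above, as it appears in the new spacy/errors.py further down in this diff (the demo lookup at the end uses made-up values):

def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            # Look the raw message up on the wrapped class and prepend the
            # code only now, at retrieval time, so nothing else is mutated.
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()

@add_codes
class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")

# Errors.E001 -> "[E001] No component '{name}' found in pipeline. ..."
print(Errors.E001.format(name='ner', opts=['tagger', 'parser']))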
Ines Montani 2018-04-03 15:50:31 +02:00 committed by GitHub
parent abf8b16d71
commit 3141e04822
41 changed files with 652 additions and 443 deletions


@ -4,18 +4,14 @@ from __future__ import unicode_literals
from .cli.info import info as cli_info
from .glossary import explain
from .about import __version__
from .errors import Warnings, deprecation_warning
from . import util
def load(name, **overrides):
depr_path = overrides.get('path')
if depr_path not in (True, False, None):
util.deprecated(
"As of spaCy v2.0, the keyword argument `path=` is deprecated. "
"You can now call spacy.load with the path as its first argument, "
"and the model's meta.json will be used to determine the language "
"to load. For example:\nnlp = spacy.load('{}')".format(depr_path),
'error')
deprecation_warning(Warnings.W001.format(path=depr_path))
return util.load_model(name, **overrides)
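A minimal illustration of the new call path (the path value below is made up):

from spacy.errors import Warnings, deprecation_warning

# Replaces the old util.deprecated(...) call: emits a DeprecationWarning
# whose text starts with the "[W001]" code.
deprecation_warning(Warnings.W001.format(path='/tmp/en_model'))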


@ -23,6 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
import thinc.extra.load_nlp
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
from .errors import Errors
from . import util
@ -340,10 +341,10 @@ def _divide_array(X, size):
def get_col(idx):
assert idx >= 0, idx
if idx < 0:
raise IndexError(Errors.E066.format(value=idx))
def forward(X, drop=0.):
assert idx >= 0, idx
if isinstance(X, numpy.ndarray):
ops = NumpyOps()
else:
@ -351,7 +352,6 @@ def get_col(idx):
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
def backward(y, sgd=None):
assert idx >= 0, idx
dX = ops.allocate(X.shape)
dX[:, idx] += y
return dX


@ -11,7 +11,6 @@ __email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True
__docs_models__ = 'https://spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'

spacy/cli/_messages.py (new file, 73 lines)

@ -0,0 +1,73 @@
# coding: utf8
from __future__ import unicode_literals
class Messages(object):
M001 = ("Download successful but linking failed")
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
"don't have admin permissions?), but you can still load the "
"model via its full package name: nlp = spacy.load('{name}')")
M003 = ("Server error ({code}: {desc})")
M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy "
"installation (v{version}), and download it manually. For more "
"details, see the documentation: https://spacy.io/usage/models")
M005 = ("Compatibility error")
M006 = ("No compatible models found for v{version} of spaCy.")
M007 = ("No compatible model found for '{name}' (spaCy v{version}).")
M008 = ("Can't locate model data")
M009 = ("The data should be located in {path}")
M010 = ("Can't find the spaCy data path to create model symlink")
M011 = ("Make sure a directory `/data` exists within your spaCy "
"installation and try again. The data directory should be "
"located here:")
M012 = ("Link '{name}' already exists")
M013 = ("To overwrite an existing link, use the --force flag.")
M014 = ("Can't overwrite symlink '{name}'")
M015 = ("This can happen if your data directory contains a directory or "
"file of the same name.")
M016 = ("Error: Couldn't link model to '{name}'")
M017 = ("Creating a symlink in spacy/data failed. Make sure you have the "
"required permissions and try re-running the command as admin, or "
"use a virtualenv. You can still import the model as a module and "
"call its load() method, or create the symlink manually.")
M018 = ("Linking successful")
M019 = ("You can now load the model via spacy.load('{name}')")
M020 = ("Can't find model meta.json")
M021 = ("Couldn't fetch compatibility table.")
M022 = ("Can't find spaCy v{version} in compatibility table")
M023 = ("Installed models (spaCy v{version})")
M024 = ("No models found in your current environment.")
M025 = ("Use the following commands to update the model packages:")
M026 = ("The following models are not available for spaCy "
"v{version}: {models}")
M027 = ("You may also want to overwrite the incompatible links using the "
"`python -m spacy link` command with `--force`, or remove them "
"from the data directory. Data path: {path}")
M028 = ("Input file not found")
M029 = ("Output directory not found")
M030 = ("Unknown format")
M031 = ("Can't find converter for {converter}")
M032 = ("Generated output file {name}")
M033 = ("Created {n_docs} documents")
M034 = ("Evaluation data not found")
M035 = ("Visualization output directory not found")
M036 = ("Generated {n} parses as HTML")
M037 = ("Can't find words frequencies file")
M038 = ("Sucessfully compiled vocab")
M039 = ("{entries} entries, {vectors} vectors")
M040 = ("Output directory not found")
M041 = ("Loaded meta.json from file")
M042 = ("Successfully created package '{name}'")
M043 = ("To build the package, run `python setup.py sdist` in this "
"directory.")
M044 = ("Package directory already exists")
M045 = ("Please delete the directory and try again, or use the `--force` "
"flag to overwrite existing directories.")
M046 = ("Generating meta.json")
M047 = ("Enter the package settings for your model. The following "
"information will be read from your model data: pipeline, vectors.")
M048 = ("No '{key}' setting found in meta.json")
M049 = ("This setting is required to build your package.")
M050 = ("Training data not found")
M051 = ("Development data not found")
M052 = ("Not a valid meta.json format")
M053 = ("Expected dict but got: {meta_type}")
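The CLI commands below read these constants through util.prints; a short sketch of the pattern (the converter name is made up, and the real command also passes exits=1 to terminate the process):

from spacy.cli._messages import Messages
from spacy.util import prints

# Same pattern spacy.cli.convert uses for an unrecognised file extension.
prints(Messages.M031.format(converter='xyz'), title=Messages.M030)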


@ -5,6 +5,7 @@ import plac
from pathlib import Path
from .converters import conllu2json, iob2json, conll_ner2json
from ._messages import Messages
from ..util import prints
# Converters are matched by file extension. To add a converter, add a new
@ -32,14 +33,14 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto
input_path = Path(input_file)
output_path = Path(output_dir)
if not input_path.exists():
prints(input_path, title="Input file not found", exits=1)
prints(input_path, title=Messages.M028, exits=1)
if not output_path.exists():
prints(output_path, title="Output directory not found", exits=1)
prints(output_path, title=Messages.M029, exits=1)
if converter == 'auto':
converter = input_path.suffix[1:]
if converter not in CONVERTERS:
prints("Can't find converter for %s" % converter,
title="Unknown format", exits=1)
prints(Messages.M031.format(converter=converter),
title=Messages.M030, exits=1)
func = CONVERTERS[converter]
func(input_path, output_path,
n_sents=n_sents, use_morphology=morphology)


@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
@ -18,8 +19,8 @@ def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
def read_conll_ner(input_path):


@ -1,6 +1,7 @@
# coding: utf8
from __future__ import unicode_literals
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
@ -32,8 +33,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
def read_conllx(input_path, use_morphology=False, n=0):


@ -2,6 +2,7 @@
from __future__ import unicode_literals
from cytoolz import partition_all, concat
from .._messages import Messages
from ...compat import json_dumps, path2str
from ...util import prints
from ...gold import iob_to_biluo
@ -18,8 +19,8 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
output_file = output_path / output_filename
with output_file.open('w', encoding='utf-8') as f:
f.write(json_dumps(docs))
prints("Created %d documents" % len(docs),
title="Generated output file %s" % path2str(output_file))
prints(Messages.M033.format(n_docs=len(docs)),
title=Messages.M032.format(name=path2str(output_file)))
def read_iob(raw_sents):


@ -8,6 +8,7 @@ import sys
import ujson
from .link import link
from ._messages import Messages
from ..util import prints, get_package_path
from ..compat import url_read, HTTPError
from .. import about
@ -32,9 +33,7 @@ def download(model, direct=False):
version = get_version(model_name, compatibility)
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
v=version))
if dl != 0:
# if download subprocess doesn't return 0, exit with the respective
# exit code before doing anything else
if dl != 0: # if download subprocess doesn't return 0, exit
sys.exit(dl)
try:
# Get package path here because link uses
@ -48,22 +47,15 @@ def download(model, direct=False):
# Dirty, but since spacy.download and the auto-linking is
# mostly a convenience wrapper, it's best to show a success
# message and loading instructions, even if linking fails.
prints(
"Creating a shortcut link for 'en' didn't work (maybe "
"you don't have admin permissions?), but you can still "
"load the model via its full package name:",
"nlp = spacy.load('%s')" % model_name,
title="Download successful but linking failed")
prints(Messages.M001.format(name=model_name), title=Messages.M002)
def get_json(url, desc):
try:
data = url_read(url)
except HTTPError as e:
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
"installation (v%s), and download it manually.")
prints(msg % (desc, about.__version__), about.__docs_models__,
title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
prints(Messages.M004.format(desc, about.__version__),
title=Messages.M003.format(e.code, e.reason), exits=1)
return ujson.loads(data)
@ -73,17 +65,16 @@ def get_compatibility():
comp_table = get_json(about.__compatibility__, "compatibility table")
comp = comp_table['spacy']
if version not in comp:
prints("No compatible models found for v%s of spaCy." % version,
title="Compatibility error", exits=1)
prints(Messages.M006.format(version=version), title=Messages.M005,
exits=1)
return comp[version]
def get_version(model, comp):
model = model.rsplit('.dev', 1)[0]
if model not in comp:
version = about.__version__
msg = "No compatible model found for '%s' (spaCy v%s)."
prints(msg % (model, version), title="Compatibility error", exits=1)
prints(Messages.M007.format(name=model, version=about.__version__),
title=Messages.M005, exits=1)
return comp[model][0]


@ -4,6 +4,7 @@ from __future__ import unicode_literals, division, print_function
import plac
from timeit import default_timer as timer
from ._messages import Messages
from ..gold import GoldCorpus
from ..util import prints
from .. import util
@ -33,10 +34,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
data_path = util.ensure_path(data_path)
displacy_path = util.ensure_path(displacy_path)
if not data_path.exists():
prints(data_path, title="Evaluation data not found", exits=1)
prints(data_path, title=Messages.M034, exits=1)
if displacy_path and not displacy_path.exists():
prints(displacy_path, title="Visualization output directory not found",
exits=1)
prints(displacy_path, title=Messages.M035, exits=1)
corpus = GoldCorpus(data_path, data_path)
nlp = util.load_model(model)
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@ -52,8 +52,7 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
render_ents = 'ner' in nlp.meta.get('pipeline', [])
render_parses(docs, displacy_path, model_name=model,
limit=displacy_limit, deps=render_deps, ents=render_ents)
msg = "Generated %s parses as HTML" % displacy_limit
prints(displacy_path, title=msg)
prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
def render_parses(docs, output_path, model_name='', limit=250, deps=True,


@ -5,9 +5,10 @@ import plac
import platform
from pathlib import Path
from ._messages import Messages
from ..compat import path2str
from .. import about
from .. import util
from .. import about
@plac.annotations(
@ -25,7 +26,7 @@ def info(model=None, markdown=False):
model_path = util.get_data_path() / model
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
util.prints(meta_path, title="Can't find model meta.json", exits=1)
util.prints(meta_path, title=Messages.M020, exits=1)
meta = util.read_json(meta_path)
if model_path.resolve() != model_path:
meta['link'] = path2str(model_path)


@ -11,7 +11,9 @@ from preshed.counter import PreshCounter
import tarfile
import gzip
from ._messages import Messages
from ..vectors import Vectors
from ..errors import Warnings, user_warning
from ..util import prints, ensure_path, get_lang_class
try:
@ -37,16 +39,13 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=
and word vectors.
"""
if freqs_loc is not None and not freqs_loc.exists():
prints(freqs_loc, title="Can't find words frequencies file", exits=1)
prints(freqs_loc, title=Messages.M037, exits=1)
clusters_loc = ensure_path(clusters_loc)
vectors_loc = ensure_path(vectors_loc)
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
clusters = read_clusters(clusters_loc) if clusters_loc else {}
nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
if not output_dir.exists():
output_dir.mkdir()
nlp.to_disk(output_dir)
@ -69,7 +68,6 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
nlp = lang_class()
for lexeme in nlp.vocab:
lexeme.rank = 0
lex_added = 0
for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
lexeme = nlp.vocab[word]
@ -89,15 +87,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
lexeme = nlp.vocab[word]
lexeme.is_oov = False
lex_added += 1
if len(vectors_data):
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
if prune_vectors >= 1:
nlp.vocab.prune_vectors(prune_vectors)
vec_added = len(nlp.vocab.vectors)
prints("{} entries, {} vectors".format(lex_added, vec_added),
title="Sucessfully compiled vocab")
prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
title=Messages.M038)
return nlp
@ -145,7 +141,7 @@ def read_clusters(clusters_loc):
print("Reading clusters...")
clusters = {}
if ftfy is None:
print("Warning: No text fixing. Run pip install ftfy if necessary")
user_warning(Warnings.W004)
with clusters_loc.open() as f:
for line in tqdm(f):
try:


@ -4,6 +4,7 @@ from __future__ import unicode_literals
import plac
from pathlib import Path
from ._messages import Messages
from ..compat import symlink_to, path2str
from ..util import prints
from .. import util
@ -24,40 +25,29 @@ def link(origin, link_name, force=False, model_path=None):
else:
model_path = Path(origin) if model_path is None else Path(model_path)
if not model_path.exists():
prints("The data should be located in %s" % path2str(model_path),
title="Can't locate model data", exits=1)
prints(Messages.M009.format(path=path2str(model_path)),
title=Messages.M008, exits=1)
data_path = util.get_data_path()
if not data_path or not data_path.exists():
spacy_loc = Path(__file__).parent.parent
prints("Make sure a directory `/data` exists within your spaCy "
"installation and try again. The data directory should be "
"located here:", path2str(spacy_loc), exits=1,
title="Can't find the spaCy data path to create model symlink")
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
link_path = util.get_data_path() / link_name
if link_path.is_symlink() and not force:
prints("To overwrite an existing link, use the --force flag.",
title="Link %s already exists" % link_name, exits=1)
prints(Messages.M013, title=Messages.M012.format(name=link_name),
exits=1)
elif link_path.is_symlink(): # does a symlink exist?
# NB: It's important to check for is_symlink here and not for exists,
# because invalid/outdated symlinks would return False otherwise.
link_path.unlink()
elif link_path.exists(): # does it exist otherwise?
# NB: Check this last because valid symlinks also "exist".
prints("This can happen if your data directory contains a directory "
"or file of the same name.", link_path,
title="Can't overwrite symlink %s" % link_name, exits=1)
prints(Messages.M015, link_path,
title=Messages.M014.format(name=link_name), exits=1)
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
try:
symlink_to(link_path, model_path)
except:
# This is quite dirty, but just making sure other errors are caught.
prints("Creating a symlink in spacy/data failed. Make sure you have "
"the required permissions and try re-running the command as "
"admin, or use a virtualenv. You can still import the model as "
"a module and call its load() method, or create the symlink "
"manually.",
"%s --> %s" % (path2str(model_path), path2str(link_path)),
title="Error: Couldn't link model to '%s'" % link_name)
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
raise
prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
"You can now load the model via spacy.load('%s')" % link_name,
title="Linking successful")
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)


@ -5,6 +5,7 @@ import plac
import shutil
from pathlib import Path
from ._messages import Messages
from ..compat import path2str, json_dumps
from ..util import prints
from .. import util
@ -31,17 +32,17 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
output_path = util.ensure_path(output_dir)
meta_path = util.ensure_path(meta_path)
if not input_path or not input_path.exists():
prints(input_path, title="Model directory not found", exits=1)
prints(input_path, title=Messages.M008, exits=1)
if not output_path or not output_path.exists():
prints(output_path, title="Output directory not found", exits=1)
prints(output_path, title=Messages.M040, exits=1)
if meta_path and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=1)
prints(meta_path, title=Messages.M020, exits=1)
meta_path = meta_path or input_path / 'meta.json'
if meta_path.is_file():
meta = util.read_json(meta_path)
if not create_meta: # only print this if user doesn't want to overwrite
prints(meta_path, title="Loaded meta.json from file")
prints(meta_path, title=Messages.M041)
else:
meta = generate_meta(input_dir, meta)
meta = validate_meta(meta, ['lang', 'name', 'version'])
@ -57,9 +58,8 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
create_file(package_path / '__init__.py', TEMPLATE_INIT)
prints(main_path, "To build the package, run `python setup.py sdist` in "
"this directory.",
title="Successfully created package '%s'" % model_name_v)
prints(main_path, Messages.M043,
title=Messages.M042.format(name=model_name_v))
def create_dirs(package_path, force):
@ -67,10 +67,7 @@ def create_dirs(package_path, force):
if force:
shutil.rmtree(path2str(package_path))
else:
prints(package_path, "Please delete the directory and try again, "
"or use the --force flag to overwrite existing "
"directories.", title="Package directory already exists",
exits=1)
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
Path.mkdir(package_path, parents=True)
@ -97,9 +94,7 @@ def generate_meta(model_path, existing_meta):
meta['vectors'] = {'width': nlp.vocab.vectors_length,
'vectors': len(nlp.vocab.vectors),
'keys': nlp.vocab.vectors.n_keys}
prints("Enter the package settings for your model. The following "
"information will be read from your model data: pipeline, vectors.",
title="Generating meta.json")
prints(Messages.M047, title=Messages.M046)
for setting, desc, default in settings:
response = util.get_raw_input(desc, default)
meta[setting] = default if response == '' and default else response
@ -111,8 +106,7 @@ def generate_meta(model_path, existing_meta):
def validate_meta(meta, keys):
for key in keys:
if key not in meta or meta[key] == '':
prints("This setting is required to build your package.",
title='No "%s" setting found in meta.json' % key, exits=1)
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
return meta


@ -7,6 +7,7 @@ import tqdm
from thinc.neural._classes.model import Model
from timeit import default_timer as timer
from ._messages import Messages
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
from ..gold import GoldCorpus, minibatch
from ..util import prints
@ -54,15 +55,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
if not output_path.exists():
output_path.mkdir()
if not train_path.exists():
prints(train_path, title="Training data not found", exits=1)
prints(train_path, title=Messages.M050, exits=1)
if dev_path and not dev_path.exists():
prints(dev_path, title="Development data not found", exits=1)
prints(dev_path, title=Messages.M051, exits=1)
if meta_path is not None and not meta_path.exists():
prints(meta_path, title="meta.json not found", exits=1)
prints(meta_path, title=Messages.M020, exits=1)
meta = util.read_json(meta_path) if meta_path else {}
if not isinstance(meta, dict):
prints("Expected dict but got: {}".format(type(meta)),
title="Not a valid meta.json format", exits=1)
prints(Messages.M053.format(meta_type=type(meta)),
title=Messages.M052, exits=1)
meta.setdefault('lang', lang)
meta.setdefault('name', 'unnamed')


@ -6,6 +6,7 @@ from pathlib import Path
import sys
import ujson
from ._messages import Messages
from ..compat import path2str, locale_escape, url_read, HTTPError
from ..util import prints, get_data_path, read_json
from .. import about
@ -18,14 +19,13 @@ def validate():
try:
data = url_read(about.__compatibility__)
except HTTPError as e:
prints("Couldn't fetch compatibility table.",
title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
title = Messages.M003.format(code=e.code, desc=e.reason)
prints(Messages.M021, title=title, exits=1)
compat = ujson.loads(data)['spacy']
current_compat = compat.get(about.__version__)
if not current_compat:
prints(about.__compatibility__, exits=1,
title="Can't find spaCy v{} in compatibility table"
.format(about.__version__))
title=Messages.M022.format(version=about.__version__))
all_models = set()
for spacy_v, models in dict(compat).items():
all_models.update(models.keys())
@ -42,7 +42,7 @@ def validate():
update_models = [m for m in incompat_models if m in current_compat]
prints(path2str(Path(__file__).parent.parent),
title="Installed models (spaCy v{})".format(about.__version__))
title=Messages.M023.format(version=about.__version__))
if model_links or model_pkgs:
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
for name, data in model_pkgs.items():
@ -50,23 +50,16 @@ def validate():
for name, data in model_links.items():
print(get_model_row(current_compat, name, data, 'link'))
else:
prints("No models found in your current environment.", exits=0)
prints(Messages.M024, exits=0)
if update_models:
cmd = ' python -m spacy download {}'
print("\n Use the following commands to update the model packages:")
print("\n " + Messages.M025)
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
if na_models:
prints("The following models are not available for spaCy v{}: {}"
.format(about.__version__, ', '.join(na_models)))
prints(Messages.M025.format(version=about.__version__,
models=', '.join(na_models)))
if incompat_links:
prints("You may also want to overwrite the incompatible links using "
"the `python -m spacy link` command with `--force`, or remove "
"them from the data directory. Data path: {}"
.format(path2str(get_data_path())))
prints(Messages.M027.format(path=path2str(get_data_path())))
if incompat_models or incompat_links:
sys.exit(1)


@ -4,6 +4,7 @@ from __future__ import unicode_literals
from .render import DependencyRenderer, EntityRenderer
from ..tokens import Doc
from ..compat import b_to_str
from ..errors import Errors, Warnings, user_warning
from ..util import prints, is_in_jupyter
@ -27,7 +28,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
factories = {'dep': (DependencyRenderer, parse_deps),
'ent': (EntityRenderer, parse_ents)}
if style not in factories:
raise ValueError("Unknown style: %s" % style)
raise ValueError(Errors.E087.format(style=style))
if isinstance(docs, Doc) or isinstance(docs, dict):
docs = [docs]
renderer, converter = factories[style]
@ -57,12 +58,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
render(docs, style=style, page=page, minify=minify, options=options,
manual=manual)
httpd = simple_server.make_server('0.0.0.0', port, app)
prints("Using the '%s' visualizer" % style,
title="Serving on port %d..." % port)
prints("Using the '{}' visualizer".format(style),
title="Serving on port {}...".format(port))
try:
httpd.serve_forever()
except KeyboardInterrupt:
prints("Shutting down server on port %d." % port)
prints("Shutting down server on port {}.".format(port))
finally:
httpd.server_close()
@ -83,6 +84,8 @@ def parse_deps(orig_doc, options={}):
RETURNS (dict): Generated dependency parse keyed by words and arcs.
"""
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
if not doc.is_parsed:
user_warning(Warnings.W005)
if options.get('collapse_punct', True):
spans = []
for word in doc[:-1]:
@ -120,6 +123,8 @@ def parse_ents(doc, options={}):
"""
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
for ent in doc.ents]
if not ents:
user_warning(Warnings.W006)
title = (doc.user_data.get('title', None)
if hasattr(doc, 'user_data') else None)
return {'text': doc.text, 'ents': ents, 'title': title}
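A small sketch of what should now happen when displaCy gets an unparsed Doc (assuming spaCy at this commit; the sentence is arbitrary):

from spacy.lang.en import English
from spacy import displacy

nlp = English()                       # bare language class: no parser, no NER
doc = nlp(u'This is a sentence.')

# parse_deps sees doc.is_parsed == False and emits user_warning(Warnings.W005);
# rendering style='ent' on a Doc with no entities would emit W006 instead.
html = displacy.render(doc, style='dep', page=True)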

spacy/errors.py (new file, 297 lines)

@ -0,0 +1,297 @@
# coding: utf8
from __future__ import unicode_literals
import os
import warnings
import inspect
def add_codes(err_cls):
"""Add error codes to string messages via class attribute names."""
class ErrorsWithCodes(object):
def __getattribute__(self, code):
msg = getattr(err_cls, code)
return '[{code}] {msg}'.format(code=code, msg=msg)
return ErrorsWithCodes()
@add_codes
class Warnings(object):
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
"You can now call spacy.load with the path as its first argument, "
"and the model's meta.json will be used to determine the language "
"to load. For example:\nnlp = spacy.load('{path}')")
W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
"instead and pass in the strings as the `words` keyword argument, "
"for example:\nfrom spacy.tokens import Doc\n"
"doc = Doc(nlp.vocab, words=[...])")
W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
"the keyword arguments, for example tag=, lemma= or ent_type=.")
W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
"using ftfy.fix_text if necessary.")
W005 = ("Doc object not parsed. This means displaCy won't be able to "
"generate a dependency visualization for it. Make sure the Doc "
"was processed with a model that supports dependency parsing, and "
"not just a language class like `English()`. For more info, see "
"the docs:\nhttps://spacy.io/usage/models")
W006 = ("No entities to visualize found in Doc object. If this is "
"surprising to you, make sure the Doc was processed using a model "
"that supports named entity recognition, and check the `doc.ents` "
"property manually if necessary.")
@add_codes
class Errors(object):
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
"calls `nlp.create_pipe` with a component name that's not built "
"in - for example, when constructing the pipeline from a model's "
"meta.json. If you're using a custom component, you can write to "
"`Language.factories['{name}']` or remove it from the model meta "
"and add it via `nlp.add_pipe` instead.")
E003 = ("Not a valid pipeline component. Expected callable, but "
"got {component} (name: '{name}').")
E004 = ("If you meant to add a built-in component, use `create_pipe`: "
"`nlp.add_pipe(nlp.create_pipe('{component}'))`")
E005 = ("Pipeline component '{name}' returned None. If you're using a "
"custom component, maybe you forgot to return the processed Doc?")
E006 = ("Invalid constraints. You can only set one of the following: "
"before, after, first, last.")
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
E008 = ("Some current components would be lost when restoring previous "
"pipeline state. If you added components after calling "
"`nlp.disable_pipes()`, you should remove them explicitly with "
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
"the new components: {names}")
E009 = ("The `update` method expects same number of docs and golds, but "
"got: {n_docs} docs, {n_golds} golds.")
E010 = ("Word vectors set to length 0. This may be because you don't have "
"a model installed or loaded, or because your model doesn't "
"include word vectors. For more info, see the docs:\n"
"https://spacy.io/usage/models")
E011 = ("Unknown operator: '{op}'. Options: {opts}")
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
E013 = ("Error selecting action in matcher")
E014 = ("Uknown tag ID: {tag}")
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
"`force=True` to overwrite.")
E016 = ("MultitaskObjective target should be function or one of: dep, "
"tag, ent, dep_tag_offset, ent_tag.")
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
E018 = ("Can't retrieve string for hash '{hash_value}'.")
E019 = ("Can't create transition with unknown action ID: {action}. Action "
"IDs are enumerated in spacy/syntax/{src}.pyx.")
E020 = ("Could not find a gold-standard action to supervise the "
"dependency parser. The tree is non-projective (i.e. it has "
"crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
"The ArcEager transition system only supports projective trees. "
"To learn non-projective representations, transform the data "
"before training and after parsing. Either pass "
"`make_projective=True` to the GoldParse class, or use "
"spacy.syntax.nonproj.preprocess_training_data.")
E021 = ("Could not find a gold-standard action to supervise the "
"dependency parser. The GoldParse was projective. The transition "
"system has {n_actions} actions. State at failure: {state}")
E022 = ("Could not find a transition with the name '{name}' in the NER "
"model.")
E023 = ("Error cleaning up beam: The same state occurred twice at "
"memory address {addr} and position {i}.")
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
"this means the GoldParse was not correct. For example, are all "
"labels added to the model?")
E025 = ("String is too long: {length} characters. Max is 2**30.")
E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
"length {length}.")
E027 = ("Arguments 'words' and 'spaces' should be sequences of the same "
"length, or 'spaces' should be left default at None. spaces "
"should be a sequence of booleans, with True meaning that the "
"word owns a ' ' character following it.")
E028 = ("orths_and_spaces expects either a list of unicode string or a "
"list of (unicode, bool) tuples. Got bytes instance: {value}")
E029 = ("noun_chunks requires the dependency parse, which requires a "
"statistical model to be installed and loaded. For more info, see "
"the documentation:\nhttps://spacy.io/usage/models")
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set sentence "
"boundaries by setting doc[i].is_sent_start.")
E031 = ("Invalid token: empty string ('') at position {i}.")
E032 = ("Conflicting attributes specified in doc.from_array(): "
"(HEAD, SENT_START). The HEAD attribute currently sets sentence "
"boundaries implicitly, based on the tree structure. This means "
"the HEAD attribute would potentially override the sentence "
"boundaries set by SENT_START.")
E033 = ("Cannot load into non-empty Doc of length {length}.")
E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
"either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
"Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
E035 = ("Error creating span with start {start} and end {end} for Doc of "
"length {length}.")
E036 = ("Error calculating span: Can't find a token starting at character "
"offset {start}.")
E037 = ("Error calculating span: Can't find a token ending at character "
"offset {end}.")
E038 = ("Error finding sentence for span. Infinite loop detected.")
E039 = ("Array bounds exceeded while searching for root word. This likely "
"means the parse tree is in an invalid state. Please report this "
"issue here: http://github.com/explosion/spaCy/issues")
E040 = ("Attempt to access token at {i}, max length {max_length}.")
E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.")
E043 = ("Refusing to write to token.sent_start if its document is parsed, "
"because this may cause inconsistent state.")
E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
"None, True, False")
E045 = ("Possibly infinite loop encountered while looking for {attr}.")
E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did "
"you forget to call the `set_extension` method?")
E047 = ("Can't assign a value to unregistered extension attribute "
"'{name}'. Did you forget to call the `set_extension` method?")
E048 = ("Can't import language {lang} from spacy.lang.")
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
"installation and permissions, or use spacy.util.set_data_path "
"to customise the location if necessary.")
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
"link, a Python package or a valid path to a data directory.")
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
"it points to a valid package (not just a data directory).")
E052 = ("Can't find model directory: {path}")
E053 = ("Could not read meta.json from {path}")
E054 = ("No valid '{setting}' setting found in model meta.json.")
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
"original string.\nKey: {key}\nOrths: {orths}")
E057 = ("Stepped slices not supported in Span objects. Try: "
"list(tokens)[start:stop:step] instead.")
E058 = ("Could not retrieve vector for key {key}.")
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
"({rows}, {cols}).")
E061 = ("Bad file name: {filename}. Example of a valid file name: "
"'vectors.128.f.bin'")
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
"and 63 are occupied. You can replace one by specifying the "
"`flag_id` explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 "
"and 63 (inclusive).")
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
"mismatched to the string encoding table. The mismatched:\n"
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
E065 = ("Only one of the vector table's width and shape can be specified. "
"Got width {width} and shape {shape}.")
E066 = ("Error creating model helper for extracting columns. Can only "
"extract columns by positive integer. Got: {value}.")
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
"an entity) without a preceding 'B' (beginning of an entity). "
"Tag sequence:\n{tags}")
E068 = ("Invalid BILUO tag: '{tag}'.")
E069 = ("Invalid gold-standard parse tree. Found cycle between word "
"IDs: {cycle}")
E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
"does not align with number of annotations ({n_annots}).")
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
"match the one in the vocab ({vocab_orth}).")
E072 = ("Error serializing lexeme: expected data length {length}, "
"got {bad_length}.")
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
"are of length {length}. You can use `vocab.reset_vectors` to "
"clear the existing vectors and resize the table.")
E074 = ("Error interpreting compiled match pattern: patterns are expected "
"to end with the attribute {attr}. Got: {bad_attr}.")
E075 = ("Error accepting match: length ({length}) > maximum length "
"({max_len}).")
E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
"has {words} words.")
E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
"equal number of GoldParse objects ({n_golds}) in batch.")
E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
"not equal number of words in GoldParse ({words_gold}).")
E079 = ("Error computing states in beam: number of predicted beams "
"({pbeams}) does not equal number of gold beams ({gbeams}).")
E080 = ("Duplicate state found in beam: {key}.")
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
"does not equal number of losses ({losses}).")
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
"match.")
E083 = ("Error setting extension: only one of default, getter, setter and "
"method is allowed. {n_args} keyword arguments were specified.")
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
E085 = ("Can't create lexeme for string '{string}'.")
E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
"not match hash {hash_id} in StringStore.")
E087 = ("Unknown displaCy style: {style}.")
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
"v2.x parser and NER models require roughly 1GB of temporary "
"memory per 100,000 characters in the input. This means long "
"texts may cause memory allocation errors. If you're not using "
"the parser or NER, it's probably safe to increase the "
"`nlp.max_length` limit. The limit is in number of characters, so "
"you can check whether your inputs are too long by checking "
"`len(text)`.")
@add_codes
class TempErrors(object):
T001 = ("Max length currently 10 for phrase matching")
T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length "
"({max_len}). Length can be set on initialization, up to 10.")
T003 = ("Resizing pre-trained Tagger models is not currently supported.")
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
T005 = ("Currently history size is hard-coded to 0. Received: {value}.")
T006 = ("Currently history width is hard-coded to 0. Received: {value}.")
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
class ModelsWarning(UserWarning):
pass
WARNINGS = {
'user': UserWarning,
'deprecation': DeprecationWarning,
'models': ModelsWarning,
}
def _get_warn_types(arg):
if arg == '': # don't show any warnings
return []
if not arg or arg == 'all': # show all available warnings
return WARNINGS.keys()
return [w_type.strip() for w_type in arg.split(',')
if w_type.strip() in WARNINGS]
SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always')
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
def user_warning(message):
_warn(message, 'user')
def deprecation_warning(message):
_warn(message, 'deprecation')
def models_warning(message):
_warn(message, 'models')
def _warn(message, warn_type='user'):
"""
message (unicode): The message to display.
category (Warning): The Warning to show.
"""
if warn_type in SPACY_WARNING_TYPES:
category = WARNINGS[warn_type]
stack = inspect.stack()[-1]
with warnings.catch_warnings():
warnings.simplefilter(SPACY_WARNING_FILTER, category)
warnings.warn_explicit(message, category, stack[1], stack[2])
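How the module is meant to be driven, from code and from the environment (the shell line is only an example):

from spacy.errors import Warnings, user_warning, deprecation_warning

user_warning(Warnings.W004)            # issued as UserWarning
deprecation_warning(Warnings.W002)     # issued as DeprecationWarning

# Both env vars are read once when spacy.errors is imported, e.g.:
#   SPACY_WARNING_TYPES="user,deprecation" SPACY_WARNING_FILTER="once" python script.py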


@ -10,6 +10,7 @@ import itertools
from .syntax import nonproj
from .tokens import Doc
from .errors import Errors
from . import util
from .util import minibatch
@ -28,7 +29,8 @@ def tags_to_entities(tags):
elif tag == '-':
continue
elif tag.startswith('I'):
assert start is not None, tags[:i]
if start is None:
raise ValueError(Errors.E067.format(tags=tags[:i]))
continue
if tag.startswith('U'):
entities.append((tag[2:], i, i))
@ -38,7 +40,7 @@ def tags_to_entities(tags):
entities.append((tag[2:], start, i))
start = None
else:
raise Exception(tag)
raise ValueError(Errors.E068.format(tag=tag))
return entities
@ -238,7 +240,9 @@ class GoldCorpus(object):
@classmethod
def _make_golds(cls, docs, paragraph_tuples):
assert len(docs) == len(paragraph_tuples)
if len(docs) != len(paragraph_tuples):
raise ValueError(Errors.E070.format(n_docs=len(docs),
n_annots=len(paragraph_tuples)))
if len(docs) == 1:
return [GoldParse.from_annot_tuples(docs[0],
paragraph_tuples[0][0])]
@ -461,7 +465,7 @@ cdef class GoldParse:
cycle = nonproj.contains_cycle(self.heads)
if cycle is not None:
raise Exception("Cycle found: %s" % cycle)
raise ValueError(Errors.E069.format(cycle=cycle))
if make_projective:
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)


@ -28,6 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
from .lang.tokenizer_exceptions import TOKEN_MATCH
from .lang.tag_map import TAG_MAP
from .lang.lex_attrs import LEX_ATTRS, is_stop
from .errors import Errors
from . import util
from . import about
@ -217,8 +218,7 @@ class Language(object):
for pipe_name, component in self.pipeline:
if pipe_name == name:
return component
msg = "No component '{}' found in pipeline. Available names: {}"
raise KeyError(msg.format(name, self.pipe_names))
raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names))
def create_pipe(self, name, config=dict()):
"""Create a pipeline component from a factory.
@ -228,7 +228,7 @@ class Language(object):
RETURNS (callable): Pipeline component.
"""
if name not in self.factories:
raise KeyError("Can't find factory for '{}'.".format(name))
raise KeyError(Errors.E002.format(name=name))
factory = self.factories[name]
return factory(self, **config)
@ -253,12 +253,9 @@ class Language(object):
>>> nlp.add_pipe(component, name='custom_name', last=True)
"""
if not hasattr(component, '__call__'):
msg = ("Not a valid pipeline component. Expected callable, but "
"got {}. ".format(repr(component)))
msg = Errors.E003.format(component=repr(component), name=name)
if isinstance(component, basestring_) and component in self.factories:
msg += ("If you meant to add a built-in component, use "
"create_pipe: nlp.add_pipe(nlp.create_pipe('{}'))"
.format(component))
msg += Errors.E004.format(component=component)
raise ValueError(msg)
if name is None:
if hasattr(component, 'name'):
@ -271,11 +268,9 @@ class Language(object):
else:
name = repr(component)
if name in self.pipe_names:
raise ValueError("'{}' already exists in pipeline.".format(name))
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
msg = ("Invalid constraints. You can only set one of the "
"following: before, after, first, last.")
raise ValueError(msg)
raise ValueError(Errors.E006)
pipe = (name, component)
if last or not any([first, before, after]):
self.pipeline.append(pipe)
@ -286,9 +281,8 @@ class Language(object):
elif after and after in self.pipe_names:
self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
else:
msg = "Can't find '{}' in pipeline. Available names: {}"
unfound = before or after
raise ValueError(msg.format(unfound, self.pipe_names))
raise ValueError(Errors.E001.format(name=before or after,
opts=self.pipe_names))
def has_pipe(self, name):
"""Check if a component name is present in the pipeline. Equivalent to
@ -306,8 +300,7 @@ class Language(object):
component (callable): Pipeline component.
"""
if name not in self.pipe_names:
msg = "Can't find '{}' in pipeline. Available names: {}"
raise ValueError(msg.format(name, self.pipe_names))
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
self.pipeline[self.pipe_names.index(name)] = (name, component)
def rename_pipe(self, old_name, new_name):
@ -317,11 +310,9 @@ class Language(object):
new_name (unicode): New name of the component.
"""
if old_name not in self.pipe_names:
msg = "Can't find '{}' in pipeline. Available names: {}"
raise ValueError(msg.format(old_name, self.pipe_names))
raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names))
if new_name in self.pipe_names:
msg = "'{}' already exists in pipeline. Existing names: {}"
raise ValueError(msg.format(new_name, self.pipe_names))
raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names))
i = self.pipe_names.index(old_name)
self.pipeline[i] = (new_name, self.pipeline[i][1])
@ -332,8 +323,7 @@ class Language(object):
RETURNS (tuple): A `(name, component)` tuple of the removed component.
"""
if name not in self.pipe_names:
msg = "Can't find '{}' in pipeline. Available names: {}"
raise ValueError(msg.format(name, self.pipe_names))
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
return self.pipeline.pop(self.pipe_names.index(name))
def __call__(self, text, disable=[]):
@ -351,21 +341,17 @@ class Language(object):
('An', 'NN')
"""
if len(text) >= self.max_length:
msg = (
"Text of length {length} exceeds maximum of {max_length}. "
"The v2 parser and NER models require roughly 1GB of temporary "
"memory per 100,000 characters in the input. This means long "
"texts may cause memory allocation errors. If you're not using "
"the parser or NER, it's probably safe to increase the "
"nlp.max_length limit. The limit is in number of characters, "
"so you can check whether your inputs are too long by checking "
"len(text).")
raise ValueError(msg.format(length=len(text), max_length=self.max_length))
raise ValueError(Errors.E088.format(length=len(text),
max_length=self.max_length))
doc = self.make_doc(text)
for name, proc in self.pipeline:
if name in disable:
continue
if not hasattr(proc, '__call__'):
raise ValueError(Errors.E003.format(component=type(proc), name=name))
doc = proc(doc)
if doc is None:
raise ValueError(Errors.E005.format(name=name))
return doc
def disable_pipes(self, *names):
@ -407,8 +393,7 @@ class Language(object):
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds "
"Got: %d, %d" % (len(docs), len(golds)))
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
if len(docs) == 0:
return
if sgd is None:
@ -757,14 +742,7 @@ class DisabledPipes(list):
if unexpected:
# Don't change the pipeline if we're raising an error.
self.nlp.pipeline = current
msg = (
"Some current components would be lost when restoring "
"previous pipeline state. If you added components after "
"calling nlp.disable_pipes(), you should remove them "
"explicitly with nlp.remove_pipe() before the pipeline is "
"restore. Names of the new components: %s"
)
raise ValueError(msg % unexpected)
raise ValueError(Errors.E008.format(names=unexpected))
self[:] = []
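The net effect on the pipeline API, sketched for the new E003/E004 pair (the try/except is only there to show the coded message):

from spacy.lang.en import English

nlp = English()
try:
    # A string isn't callable, so this should raise "[E003] Not a valid
    # pipeline component..." plus the "[E004] ...use create_pipe" hint,
    # because 'sentencizer' is a built-in factory name.
    nlp.add_pipe('sentencizer')
except ValueError as e:
    print(e)

nlp.add_pipe(nlp.create_pipe('sentencizer'))   # the fix suggested by E004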


@ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
from .attrs cimport PROB
from .attrs import intify_attrs
from . import about
from .errors import Errors
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@ -37,7 +37,8 @@ cdef class Lexeme:
self.vocab = vocab
self.orth = orth
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
assert self.c.orth == orth
if self.c.orth != orth:
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
def __richcmp__(self, other, int op):
if other is None:
@ -129,20 +130,25 @@ cdef class Lexeme:
lex_data = Lexeme.c_to_bytes(self.c)
start = <const char*>&self.c.flags
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
if (end-start) != sizeof(lex_data.data):
raise ValueError(Errors.E072.format(length=end-start,
bad_length=sizeof(lex_data.data)))
byte_string = b'\0' * sizeof(lex_data.data)
byte_chars = <char*>byte_string
for i in range(sizeof(lex_data.data)):
byte_chars[i] = lex_data.data[i]
assert len(byte_string) == sizeof(lex_data.data), (len(byte_string),
sizeof(lex_data.data))
if len(byte_string) != sizeof(lex_data.data):
raise ValueError(Errors.E072.format(length=len(byte_string),
bad_length=sizeof(lex_data.data)))
return byte_string
def from_bytes(self, bytes byte_string):
# This method doesn't really have a use-case --- wrote it for testing.
# Possibly delete? It puts the Lexeme out of synch with the vocab.
cdef SerializedLexemeC lex_data
assert len(byte_string) == sizeof(lex_data.data)
if len(byte_string) != sizeof(lex_data.data):
raise ValueError(Errors.E072.format(length=len(byte_string),
bad_length=sizeof(lex_data.data)))
for i in range(len(byte_string)):
lex_data.data[i] = byte_string[i]
Lexeme.c_from_bytes(self.c, lex_data)
@ -169,16 +175,13 @@ cdef class Lexeme:
def __get__(self):
cdef int length = self.vocab.vectors_length
if length == 0:
raise ValueError(
"Word vectors set to length 0. This may be because you "
"don't have a model installed or loaded, or because your "
"model doesn't include word vectors. For more info, see "
"the documentation: \n%s\n" % about.__docs_models__
)
raise ValueError(Errors.E010)
return self.vocab.get_vector(self.c.orth)
def __set__(self, vector):
assert len(vector) == self.vocab.vectors_length
if len(vector) != self.vocab.vectors_length:
raise ValueError(Errors.E073.format(new_length=len(vector),
length=self.vocab.vectors_length))
self.vocab.set_vector(self.c.orth, vector)
property rank:


@ -16,6 +16,7 @@ from .typedefs cimport hash_t
from .structs cimport TokenC
from .tokens.doc cimport Doc, get_token_attr
from .vocab cimport Vocab
from .errors import Errors, TempErrors
from .attrs import IDS
from .attrs cimport attr_id_t, ID, NULL_ATTR
@ -109,7 +110,8 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
while pattern.nr_attr != 0:
pattern += 1
id_attr = pattern[0].attrs[0]
assert id_attr.attr == ID
if id_attr.attr != ID:
raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr))
return id_attr.value
@ -161,8 +163,8 @@ def _convert_strings(token_specs, string_store):
if value in operators:
ops = operators[value]
else:
msg = "Unknown operator '%s'. Options: %s"
raise KeyError(msg % (value, ', '.join(operators.keys())))
keys = ', '.join(operators.keys())
raise KeyError(Errors.E011.format(op=value, opts=keys))
if isinstance(attr, basestring):
attr = IDS.get(attr.upper())
if isinstance(value, basestring):
@ -264,9 +266,7 @@ cdef class Matcher:
"""
for pattern in patterns:
if len(pattern) == 0:
msg = ("Cannot add pattern for zero tokens to matcher.\n"
"key: {key}\n")
raise ValueError(msg.format(key=key))
raise ValueError(Errors.E012.format(key=key))
key = self._normalize_key(key)
for pattern in patterns:
specs = _convert_strings(pattern, self.vocab.strings)
@ -348,13 +348,12 @@ cdef class Matcher:
for state in partials:
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
raise ValueError(Errors.E013)
while action == ADVANCE_ZERO:
state.second += 1
action = get_action(state.second, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
raise ValueError(Errors.E013)
if action == REPEAT:
# Leave the state in the queue, and advance to next slot
# (i.e. we don't overwrite -- we want to greedily match
@ -380,7 +379,7 @@ cdef class Matcher:
for pattern in self.patterns:
action = get_action(pattern, token)
if action == PANIC:
raise Exception("Error selecting action in matcher")
raise ValueError(Errors.E013)
while action == ADVANCE_ZERO:
pattern += 1
action = get_action(pattern, token)
@ -447,7 +446,7 @@ def get_bilou(length):
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
I10_ENT, I10_ENT, L10_ENT]
else:
raise ValueError("Max length currently 10 for phrase matching")
raise ValueError(TempErrors.T001)
cdef class PhraseMatcher:
@ -506,11 +505,8 @@ cdef class PhraseMatcher:
cdef Doc doc
for doc in docs:
if len(doc) >= self.max_length:
msg = (
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
"Length can be set on initialization, up to 10."
)
raise ValueError(msg % (len(doc), self.max_length))
raise ValueError(TempErrors.T002.format(doc_len=len(doc),
max_len=self.max_length))
cdef hash_t ent_id = self.matcher._normalize_key(key)
self._callbacks[ent_id] = on_match
cdef int length
@ -562,7 +558,9 @@ cdef class PhraseMatcher:
yield doc
def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length
if (end - start) >= self.max_length:
raise ValueError(Errors.E075.format(length=end - start,
max_len=self.max_length))
cdef int i, j
for i in range(self.max_length):
self._phrase_key[i] = 0
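A sketch of one of the new coded errors from this file, the zero-token check in Matcher.add (the key and the empty pattern are illustrative):

from spacy.lang.en import English
from spacy.matcher import Matcher

nlp = English()
matcher = Matcher(nlp.vocab)
try:
    matcher.add('EMPTY', None, [])   # empty pattern -> ValueError "[E012] ..."
except ValueError as e:
    print(e)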


@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
from .parts_of_speech cimport SPACE
from .parts_of_speech import IDS as POS_IDS
from .lexeme cimport Lexeme
from .errors import Errors
def _normalize_props(props):
@ -93,7 +94,7 @@ cdef class Morphology:
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
if tag_id > self.n_tags:
raise ValueError("Unknown tag ID: %s" % tag_id)
raise ValueError(Errors.E014.format(tag=tag_id))
# TODO: It's pretty arbitrary to put this logic here. I guess the
# justification is that this is where the specific word and the tag
# interact. Still, we should have a better way to enforce this rule, or
@ -147,9 +148,7 @@ cdef class Morphology:
elif force:
memset(cached, 0, sizeof(cached[0]))
else:
raise ValueError(
"Conflicting morphology exception for (%s, %s). Use "
"force=True to overwrite." % (tag_str, orth_str))
raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
cached.tag = rich_tag
# TODO: Refactor this to take arbitrary attributes.


@ -33,6 +33,7 @@ from .parts_of_speech import X
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
from ._ml import link_vectors_to_models, zero_init, flatten
from ._ml import create_default_optimizer
from .errors import Errors, TempErrors
from . import util
@ -336,7 +337,8 @@ class Tensorizer(Pipe):
tensors (object): Vector representation for each token in the docs.
"""
for doc, tensor in zip(docs, tensors):
assert tensor.shape[0] == len(doc)
if tensor.shape[0] != len(doc):
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
doc.tensor = tensor
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
@ -550,9 +552,7 @@ class Tagger(Pipe):
# copy_array(larger.W[:smaller.nO], smaller.W)
# copy_array(larger.b[:smaller.nO], smaller.b)
# self.model._layers[-1] = larger
raise ValueError(
"Resizing pre-trained Tagger models is not "
"currently supported.")
raise ValueError(TempErrors.T003)
tag_map = dict(self.vocab.morphology.tag_map)
if values is None:
values = {POS: "X"}
@ -671,8 +671,7 @@ class MultitaskObjective(Tagger):
elif hasattr(target, '__call__'):
self.make_label = target
else:
raise ValueError("MultitaskObjective target should be function or "
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
raise ValueError(Errors.E016)
self.cfg = dict(cfg)
self.cfg.setdefault('cnn_maxout_pieces', 2)
@ -723,7 +722,9 @@ class MultitaskObjective(Tagger):
return tokvecs, scores
def get_loss(self, docs, golds, scores):
assert len(docs) == len(golds)
if len(docs) != len(golds):
raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs),
n_golds=len(golds)))
cdef int idx = 0
correct = numpy.zeros((scores.shape[0],), dtype='i')
guesses = scores.argmax(axis=1)

View File

@ -2,6 +2,7 @@
from __future__ import division, print_function, unicode_literals
from .gold import tags_to_entities
from .errors import Errors
class PRFScore(object):
@ -84,7 +85,8 @@ class Scorer(object):
}
def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold)
if len(tokens) != len(gold):
raise ValueError(Errors.E078.format(words_doc=len(tokens), words_gold=len(gold)))
gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1]

View File

@ -13,6 +13,7 @@ from .symbols import IDS as SYMBOLS_BY_STR
from .symbols import NAMES as SYMBOLS_BY_INT
from .typedefs cimport hash_t
from .compat import json_dumps
from .errors import Errors
from . import util
@ -59,7 +60,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
string.p[0] = length
memcpy(&string.p[1], chars, length)
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
else:
i = 0
@ -69,7 +69,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
string.p[i] = 255
string.p[n_length_bytes-1] = length % 255
memcpy(&string.p[n_length_bytes], chars, length)
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
return string
@ -115,7 +114,7 @@ cdef class StringStore:
self.hits.insert(key)
utf8str = <Utf8Str*>self._map.get(key)
if utf8str is NULL:
raise KeyError(string_or_id)
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
return decode_Utf8Str(utf8str)
@ -136,8 +135,7 @@ cdef class StringStore:
key = hash_utf8(string, len(string))
self._intern_utf8(string, len(string))
else:
raise TypeError(
"Can only add unicode or bytes. Got type: %s" % type(string))
raise TypeError(Errors.E017.format(value_type=type(string)))
return key
def __len__(self):

View File

@ -10,6 +10,7 @@ from thinc.extra.search cimport MaxViolation
from .transition_system cimport TransitionSystem, Transition
from ..gold cimport GoldParse
from ..errors import Errors
from .stateclass cimport StateC, StateClass
@ -220,7 +221,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
p_indices = []
g_indices = []
cdef Beam pbeam, gbeam
assert len(pbeams) == len(gbeams)
if len(pbeams) != len(gbeams):
raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
p_indices.append([])
g_indices.append([])
@ -228,7 +230,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
state = StateClass.borrow(<StateC*>pbeam.at(i))
if not state.is_final():
key = tuple([eg_id] + pbeam.histories[i])
assert key not in seen, (key, seen)
if key in seen:
raise ValueError(Errors.E080.format(key=key))
seen[key] = len(states)
p_indices[-1].append(len(states))
states.append(state)
@ -271,7 +274,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
for i in range(nr_step):
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
dtype='f'))
assert len(histories) == len(losses)
if len(histories) != len(losses):
raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
for eg_id, hists in enumerate(histories):
for loss, hist in zip(losses[eg_id], hists):
if loss == 0.0 or numpy.isnan(loss):

View File

@ -15,6 +15,7 @@ from .nonproj import is_nonproj_tree
from .transition_system cimport move_cost_func_t, label_cost_func_t
from ..gold cimport GoldParse, GoldParseC
from ..structs cimport TokenC
from ..errors import Errors
DEF NON_MONOTONIC = True
@ -455,7 +456,7 @@ cdef class ArcEager(TransitionSystem):
t.do = Break.transition
t.get_cost = Break.cost
else:
raise Exception(move)
raise ValueError(Errors.E019.format(action=move, src='arc_eager'))
return t
cdef int initialize_state(self, StateC* st) nogil:
@ -529,28 +530,11 @@ cdef class ArcEager(TransitionSystem):
if n_gold < 1:
# Check projectivity --- leading cause
if is_nonproj_tree(gold.heads):
raise ValueError(
"Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is "
"non-projective (i.e. it has crossing arcs -- see "
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
"transition system only supports projective trees. To "
"learn non-projective representations, transform the data "
"before training and after parsing. Either pass "
"make_projective=True to the GoldParse class, or use "
"spacy.syntax.nonproj.preprocess_training_data.")
raise ValueError(Errors.E020)
else:
print(gold.orig_annot)
print(gold.words)
print(gold.heads)
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the"
"dependency parser. The GoldParse was projective. The "
"transition system has %d actions. State at failure: %s"
% (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
failure_state = stcls.print_state(gold.words)
raise ValueError(Errors.E021.format(n_actions=self.n_moves,
state=failure_state))
def get_beam_annot(self, Beam beam):
length = (<StateC*>beam.at(0)).length

View File

@ -10,6 +10,7 @@ from ._state cimport StateC
from .transition_system cimport Transition
from .transition_system cimport do_func_t
from ..gold cimport GoldParseC, GoldParse
from ..errors import Errors
cdef enum:
@ -173,7 +174,7 @@ cdef class BiluoPushDown(TransitionSystem):
if self.c[i].move == move and self.c[i].label == label:
return self.c[i]
else:
raise KeyError(name)
raise KeyError(Errors.E022.format(name=name))
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
# TODO: Apparent Cython bug here when we try to use the Transition()
@ -208,7 +209,7 @@ cdef class BiluoPushDown(TransitionSystem):
t.do = Out.transition
t.get_cost = Out.cost
else:
raise Exception(move)
raise ValueError(Errors.E019.format(action=move, src='ner'))
return t
def add_action(self, int action, label_name):
@ -230,7 +231,6 @@ cdef class BiluoPushDown(TransitionSystem):
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1

View File

@ -34,6 +34,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
from ..compat import json_dumps, copy_array
from ..tokens.doc cimport Doc
from ..gold cimport GoldParse
from ..errors import Errors, TempErrors
from .. import util
from .stateclass cimport StateClass
from ._state cimport StateC
@ -242,7 +243,7 @@ cdef class Parser:
def Model(cls, nr_class, **cfg):
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
if depth != 1:
raise ValueError("Currently parser depth is hard-coded to 1.")
raise ValueError(TempErrors.T004.format(value=depth))
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
cfg.get('maxout_pieces', 2))
token_vector_width = util.env_opt('token_vector_width',
@ -252,9 +253,9 @@ cdef class Parser:
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
if hist_size != 0:
raise ValueError("Currently history size is hard-coded to 0")
raise ValueError(TempErrors.T005.format(value=hist_size))
if hist_width != 0:
raise ValueError("Currently history width is hard-coded to 0")
raise ValueError(TempErrors.T006.format(value=hist_width))
pretrained_vectors = cfg.get('pretrained_vectors', None)
tok2vec = Tok2Vec(token_vector_width, embed_size,
pretrained_vectors=pretrained_vectors)
@ -542,7 +543,9 @@ cdef class Parser:
def update(self, docs, golds, drop=0., sgd=None, losses=None):
if not any(self.moves.has_gold(gold) for gold in golds):
return None
assert len(docs) == len(golds)
if len(docs) != len(golds):
raise ValueError(Errors.E077.format(value='update', n_docs=len(docs),
n_golds=len(golds)))
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
return self.update_beam(docs, golds,
self.cfg['beam_width'], self.cfg['beam_density'],
@ -622,7 +625,6 @@ cdef class Parser:
if losses is not None and self.name not in losses:
losses[self.name] = 0.
lengths = [len(d) for d in docs]
assert min(lengths) >= 1
states = self.moves.init_batch(docs)
for gold in golds:
self.moves.preprocess_gold(gold)
@ -1021,15 +1023,11 @@ def _cleanup(Beam beam):
del state
seen.add(addr)
else:
print(i, addr)
print(seen)
raise Exception
raise ValueError(Errors.E023.format(addr=addr, i=i))
addr = <size_t>beam._states[i].content
if addr not in seen:
state = <StateC*>addr
del state
seen.add(addr)
else:
print(i, addr)
print(seen)
raise Exception
raise ValueError(Errors.E023.format(addr=addr, i=i))

View File

@ -10,6 +10,7 @@ from __future__ import unicode_literals
from copy import copy
from ..tokens.doc cimport Doc
from ..errors import Errors
DELIMITER = '||'
@ -131,7 +132,10 @@ cpdef deprojectivize(Doc doc):
def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
raise ValueError(Errors.E082.format(n_heads=len(heads),
n_proj_heads=len(proj_heads),
n_labels=len(labels)))
deco_labels = []
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:

View File

@ -12,6 +12,7 @@ from ..structs cimport TokenC
from .stateclass cimport StateClass
from ..typedefs cimport attr_t
from ..compat import json_dumps
from ..errors import Errors
from .. import util
@ -80,10 +81,7 @@ cdef class TransitionSystem:
action.do(state.c, action.label)
break
else:
print(gold.words)
print(gold.ner)
print(history)
raise ValueError("Could not find gold move")
raise ValueError(Errors.E024)
return history
cdef int initialize_state(self, StateC* state) nogil:
@ -130,17 +128,7 @@ cdef class TransitionSystem:
else:
costs[i] = 9000
if n_gold <= 0:
print(gold.words)
print(gold.ner)
print([gold.c.ner[i].clas for i in range(gold.length)])
print([gold.c.ner[i].move for i in range(gold.length)])
print([gold.c.ner[i].label for i in range(gold.length)])
print("Self labels",
[self.c[i].label for i in range(self.n_moves)])
raise ValueError(
"Could not find a gold-standard action to supervise "
"the entity recognizer. The transition system has "
"%d actions." % (self.n_moves))
raise ValueError(Errors.E024)
def get_class_name(self, int clas):
act = self.c[clas]
@ -162,7 +150,6 @@ cdef class TransitionSystem:
self._size *= 2
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
assert self.c[self.n_moves].label == label_id
self.n_moves += 1
return 1

View File

@ -13,6 +13,7 @@ cimport cython
from .tokens.doc cimport Doc
from .strings cimport hash_string
from .errors import Errors, Warnings, deprecation_warning
from . import util
@ -63,11 +64,7 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)
cpdef Doc tokens_from_list(self, list strings):
util.deprecated(
"Tokenizer.from_list is now deprecated. Create a new Doc "
"object instead and pass in the strings as the `words` keyword "
"argument, for example:\nfrom spacy.tokens import Doc\n"
"doc = Doc(nlp.vocab, words=[...])")
deprecation_warning(Warnings.W002)
return Doc(self.vocab, words=strings)
@cython.boundscheck(False)
@ -78,8 +75,7 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
msg = "String is too long: %d characters. Max is 2**30."
raise ValueError(msg % len(string))
raise ValueError(Errors.E025.format(length=len(string)))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:

View File

@ -31,7 +31,7 @@ from ..attrs cimport ENT_TYPE, SENT_START
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
from ..util import normalize_slice
from ..compat import is_config, copy_reg, pickle, basestring_
from .. import about
from ..errors import Errors, Warnings, deprecation_warning
from .. import util
from .underscore import Underscore
from ._retokenize import Retokenizer
@ -41,9 +41,9 @@ DEF PADDING = 5
cdef int bounds_check(int i, int length, int padding) except -1:
if (i + padding) < 0:
raise IndexError
raise IndexError(Errors.E026.format(i=i, length=length))
if (i - padding) >= length:
raise IndexError
raise IndexError(Errors.E026.format(i=i, length=length))
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
@ -98,7 +98,8 @@ cdef class Doc:
def set_extension(cls, name, default=None, method=None,
getter=None, setter=None):
nr_defined = sum(t is not None for t in (default, getter, setter, method))
assert nr_defined == 1
if nr_defined != 1:
raise ValueError(Errors.E083.format(n_args=nr_defined))
Underscore.doc_extensions[name] = (default, method, getter, setter)
@classmethod
@ -155,11 +156,7 @@ cdef class Doc:
if spaces is None:
spaces = [True] * len(words)
elif len(spaces) != len(words):
raise ValueError(
"Arguments 'words' and 'spaces' should be sequences of "
"the same length, or 'spaces' should be left default at "
"None. spaces should be a sequence of booleans, with True "
"meaning that the word owns a ' ' character following it.")
raise ValueError(Errors.E027)
orths_and_spaces = zip(words, spaces)
if orths_and_spaces is not None:
for orth_space in orths_and_spaces:
@ -167,10 +164,7 @@ cdef class Doc:
orth = orth_space
has_space = True
elif isinstance(orth_space, bytes):
raise ValueError(
"orths_and_spaces expects either List(unicode) or "
"List((unicode, bool)). "
"Got bytes instance: %s" % (str(orth_space)))
raise ValueError(Errors.E028.format(value=orth_space))
else:
orth, has_space = orth_space
# Note that we pass self.mem here --- we have ownership, if LexemeC
@ -504,11 +498,7 @@ cdef class Doc:
"""
def __get__(self):
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
raise ValueError(Errors.E029)
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
@ -533,12 +523,7 @@ cdef class Doc:
"""
def __get__(self):
if not self.is_sentenced:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
raise ValueError(Errors.E030)
if 'sents' in self.user_hooks:
yield from self.user_hooks['sents'](self)
else:
@ -568,7 +553,8 @@ cdef class Doc:
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
t.l_edge = self.length
t.r_edge = self.length
assert t.lex.orth != 0
if t.lex.orth == 0:
raise ValueError(Errors.E031.format(i=self.length))
t.spacy = has_space
self.length += 1
return t.idx + t.lex.length + t.spacy
@ -684,13 +670,7 @@ cdef class Doc:
def from_array(self, attrs, array):
if SENT_START in attrs and HEAD in attrs:
raise ValueError(
"Conflicting attributes specified in doc.from_array(): "
"(HEAD, SENT_START)\n"
"The HEAD attribute currently sets sentence boundaries "
"implicitly, based on the tree structure. This means the HEAD "
"attribute would potentially override the sentence boundaries "
"set by SENT_START.")
raise ValueError(Errors.E032)
cdef int i, col
cdef attr_id_t attr_id
cdef TokenC* tokens = self.c
@ -828,7 +808,7 @@ cdef class Doc:
RETURNS (Doc): Itself.
"""
if self.length != 0:
raise ValueError("Cannot load into non-empty Doc")
raise ValueError(Errors.E033.format(length=self.length))
deserializers = {
'text': lambda b: None,
'array_head': lambda b: None,
@ -916,10 +896,7 @@ cdef class Doc:
"""
cdef unicode tag, lemma, ent_type
if len(args) == 3:
util.deprecated(
"Positional arguments to Doc.merge are deprecated. Instead, "
"use the keyword arguments, for example tag=, lemma= or "
"ent_type=.")
deprecation_warning(Warnings.W003)
tag, lemma, ent_type = args
attributes[TAG] = tag
attributes[LEMMA] = lemma
@ -933,13 +910,9 @@ cdef class Doc:
if 'ent_type' in attributes:
attributes[ENT_TYPE] = attributes['ent_type']
elif args:
raise ValueError(
"Doc.merge received %d non-keyword arguments. Expected either "
"3 arguments (deprecated), or 0 (use keyword arguments). "
"Arguments supplied:\n%s\n"
"Keyword arguments: %s\n" % (len(args), repr(args),
repr(attributes)))
raise ValueError(Errors.E034.format(n_args=len(args),
args=repr(args),
kwargs=repr(attributes)))
# More deprecated attribute handling =/
if 'label' in attributes:
attributes['ent_type'] = attributes.pop('label')

View File

@ -16,7 +16,7 @@ from ..util import normalize_slice
from ..attrs cimport IS_PUNCT, IS_SPACE
from ..lexeme cimport Lexeme
from ..compat import is_config
from .. import about
from ..errors import Errors, TempErrors
from .underscore import Underscore
@ -48,8 +48,7 @@ cdef class Span:
RETURNS (Span): The newly constructed object.
"""
if not (0 <= start <= end <= len(doc)):
raise IndexError
raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
self.doc = doc
self.start = start
self.start_char = self.doc[start].idx if start < self.doc.length else 0
@ -58,7 +57,8 @@ cdef class Span:
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
else:
self.end_char = 0
assert label in doc.vocab.strings, label
if label not in doc.vocab.strings:
raise ValueError(Errors.E084.format(label=label))
self.label = label
self._vector = vector
self._vector_norm = vector_norm
@ -267,11 +267,10 @@ cdef class Span:
or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
start = token_by_start(self.doc.c, self.doc.length, self.start_char)
if self.start == -1:
raise IndexError("Error calculating span: Can't find start")
raise IndexError(Errors.E036.format(start=self.start_char))
end = token_by_end(self.doc.c, self.doc.length, self.end_char)
if end == -1:
raise IndexError("Error calculating span: Can't find end")
raise IndexError(Errors.E037.format(end=self.end_char))
self.start = start
self.end = end + 1
@ -293,7 +292,7 @@ cdef class Span:
root += root.head
n += 1
if n >= self.doc.length:
raise RuntimeError
raise RuntimeError(Errors.E038)
return self.doc[root.l_edge:root.r_edge + 1]
property has_vector:
@ -376,11 +375,7 @@ cdef class Span:
"""
def __get__(self):
if not self.doc.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
raise ValueError(Errors.E029)
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts
@ -526,9 +521,7 @@ cdef class Span:
return self.root.ent_id
def __set__(self, hash_t key):
raise NotImplementedError(
"Can't yet set ent_id from Span. Vote for this feature on "
"the issue tracker: http://github.com/explosion/spaCy/issues")
raise NotImplementedError(TempErrors.T007.format(attr='ent_id'))
property ent_id_:
"""RETURNS (unicode): The (string) entity ID."""
@ -536,9 +529,7 @@ cdef class Span:
return self.root.ent_id_
def __set__(self, hash_t key):
raise NotImplementedError(
"Can't yet set ent_id_ from Span. Vote for this feature on the "
"issue tracker: http://github.com/explosion/spaCy/issues")
raise NotImplementedError(TempErrors.T007.format(attr='ent_id_'))
property orth_:
"""Verbatim text content (identical to Span.text). Exists mostly for
@ -586,9 +577,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
token += token.head
n += 1
if n >= sent_length:
raise RuntimeError(
"Array bounds exceeded while searching for root word. This "
"likely means the parse tree is in an invalid state. Please "
"report this issue here: "
"http://github.com/explosion/spaCy/issues")
raise RuntimeError(Errors.E039)
return n

View File

@ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t
from ..parts_of_speech cimport univ_pos_t
from .doc cimport Doc
from ..lexeme cimport Lexeme
from ..errors import Errors
cdef class Token:
@ -17,8 +18,7 @@ cdef class Token:
@staticmethod
cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
if offset < 0 or offset >= doc.length:
msg = "Attempt to access token at %d, max length %d"
raise IndexError(msg % (offset, doc.length))
raise IndexError(Errors.E040.format(i=offset, max_length=doc.length))
cdef Token self = Token.__new__(Token, vocab, doc, offset)
return self

View File

@ -19,8 +19,8 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
from ..compat import is_config
from ..errors import Errors
from .. import util
from .. import about
from .underscore import Underscore
@ -106,7 +106,7 @@ cdef class Token:
elif op == 5:
return my >= their
else:
raise ValueError(op)
raise ValueError(Errors.E041.format(op=op))
@property
def _(self):
@ -135,8 +135,7 @@ cdef class Token:
RETURNS (Token): The token at position `self.doc[self.i+i]`.
"""
if self.i+i < 0 or (self.i+i >= len(self.doc)):
msg = "Error accessing doc[%d].nbor(%d), for doc of length %d"
raise IndexError(msg % (self.i, i, len(self.doc)))
raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
return self.doc[self.i+i]
def similarity(self, other):
@ -352,14 +351,7 @@ cdef class Token:
property sent_start:
def __get__(self):
# Raising a deprecation warning causes errors for autocomplete
#util.deprecated(
# "Token.sent_start is now deprecated. Use Token.is_sent_start "
# "instead, which returns a boolean value or None if the answer "
# "is unknown instead of a misleading 0 for False and 1 for "
# "True. It also fixes a quirk in the old logic that would "
# "always set the property to 0 for the first word of the "
# "document.")
# Raising a deprecation warning here causes errors for autocomplete
# Handle broken backwards compatibility case: doc[0].sent_start
# was False.
if self.i == 0:
@ -384,9 +376,7 @@ cdef class Token:
def __set__(self, value):
if self.doc.is_parsed:
raise ValueError(
"Refusing to write to token.sent_start if its document "
"is parsed, because this may cause inconsistent state.")
raise ValueError(Errors.E043)
if value is None:
self.c.sent_start = 0
elif value is True:
@ -394,8 +384,7 @@ cdef class Token:
elif value is False:
self.c.sent_start = -1
else:
raise ValueError("Invalid value for token.sent_start. Must be "
"one of: None, True, False")
raise ValueError(Errors.E044.format(value=value))
property lefts:
"""The leftward immediate children of the word, in the syntactic
@ -413,8 +402,7 @@ cdef class Token:
nr_iter += 1
# This is ugly, but it's a way to guard out infinite loops
if nr_iter >= 10000000:
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.lefts")
raise RuntimeError(Errors.E045.format(attr='token.lefts'))
property rights:
"""The rightward immediate children of the word, in the syntactic
@ -432,8 +420,7 @@ cdef class Token:
ptr -= 1
nr_iter += 1
if nr_iter >= 10000000:
raise RuntimeError("Possibly infinite loop encountered "
"while looking for token.rights")
raise RuntimeError(Errors.E045.format(attr='token.rights'))
tokens.reverse()
for t in tokens:
yield t

View File

@ -3,6 +3,8 @@ from __future__ import unicode_literals
import functools
from ..errors import Errors
class Underscore(object):
doc_extensions = {}
@ -23,7 +25,7 @@ class Underscore(object):
def __getattr__(self, name):
if name not in self._extensions:
raise AttributeError(name)
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
if getter is not None:
return getter(self._obj)
@ -34,7 +36,7 @@ class Underscore(object):
def __setattr__(self, name, value):
if name not in self._extensions:
raise AttributeError(name)
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
if setter is not None:
return setter(self._obj, value)
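
In user-facing terms, the underscore changes above mean that unregistered custom attributes now fail with a formatted AttributeError (E046 on read, E047 on write) rather than a bare attribute name, and, together with the Doc.set_extension check earlier in this diff (E083), registering an extension requires exactly one of default, method, getter or setter. A rough usage sketch ('is_greeting' is a made-up extension name):

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    # exactly one of default/method/getter/setter may be set; passing none or
    # several now raises ValueError(Errors.E083) instead of tripping an assert
    Doc.set_extension('is_greeting', default=False)

    doc = Doc(Vocab(), words=['hello', 'world'])
    print(doc._.is_greeting)      # False, the registered default
    # doc._.not_registered        # would raise AttributeError(Errors.E046)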

View File

@ -11,8 +11,6 @@ import sys
import textwrap
import random
from collections import OrderedDict
import inspect
import warnings
from thinc.neural._classes.model import Model
import functools
import cytoolz
@ -22,6 +20,7 @@ import numpy.random
from .symbols import ORTH
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
from .compat import import_file
from .errors import Errors
# Import these directly from Thinc, so that we're sure we always have the
# same version.
@ -50,8 +49,7 @@ def get_lang_class(lang):
try:
module = importlib.import_module('.lang.%s' % lang, 'spacy')
except ImportError:
msg = "Can't import language %s from spacy.lang."
raise ImportError(msg % lang)
raise ImportError(Errors.E048.format(lang=lang))
LANGUAGES[lang] = getattr(module, module.__all__[0])
return LANGUAGES[lang]
@ -108,7 +106,7 @@ def load_model(name, **overrides):
"""
data_path = get_data_path()
if not data_path or not data_path.exists():
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
raise IOError(Errors.E049.format(path=path2str(data_path)))
if isinstance(name, basestring_): # in data dir / shortcut
if name in set([d.name for d in data_path.iterdir()]):
return load_model_from_link(name, **overrides)
@ -118,7 +116,7 @@ def load_model(name, **overrides):
return load_model_from_path(Path(name), **overrides)
elif hasattr(name, 'exists'): # Path or Path-like to model data
return load_model_from_path(name, **overrides)
raise IOError("Can't find model '%s'" % name)
raise IOError(Errors.E050.format(name=name))
def load_model_from_link(name, **overrides):
@ -127,9 +125,7 @@ def load_model_from_link(name, **overrides):
try:
cls = import_file(name, path)
except AttributeError:
raise IOError(
"Cant' load '%s'. If you're using a shortcut link, make sure it "
"points to a valid package (not just a data directory)." % name)
raise IOError(Errors.E051.format(name=name))
return cls.load(**overrides)
@ -173,8 +169,7 @@ def load_model_from_init_py(init_file, **overrides):
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
data_path = model_path / data_dir
if not model_path.exists():
msg = "Can't find model directory: %s"
raise ValueError(msg % path2str(data_path))
raise IOError(Errors.E052.format(path=path2str(data_path)))
return load_model_from_path(data_path, meta, **overrides)
@ -186,16 +181,14 @@ def get_model_meta(path):
"""
model_path = ensure_path(path)
if not model_path.exists():
msg = "Can't find model directory: %s"
raise ValueError(msg % path2str(model_path))
raise IOError(Errors.E052.format(path=path2str(model_path)))
meta_path = model_path / 'meta.json'
if not meta_path.is_file():
raise IOError("Could not read meta.json from %s" % meta_path)
raise IOError(Errors.E053.format(path=meta_path))
meta = read_json(meta_path)
for setting in ['lang', 'name', 'version']:
if setting not in meta or not meta[setting]:
msg = "No valid '%s' setting found in model meta.json"
raise ValueError(msg % setting)
raise ValueError(Errors.E054.format(setting=setting))
return meta
@ -339,13 +332,10 @@ def update_exc(base_exceptions, *addition_dicts):
for orth, token_attrs in additions.items():
if not all(isinstance(attr[ORTH], unicode_)
for attr in token_attrs):
msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
raise ValueError(msg % (orth, token_attrs))
raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
if orth != described_orth:
msg = ("Invalid tokenizer exception: ORTH values combined "
"don't match original string. key='%s', orths='%s'")
raise ValueError(msg % (orth, described_orth))
raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
exc.update(additions)
exc = expand_exc(exc, "'", "")
return exc
@ -375,8 +365,7 @@ def expand_exc(excs, search, replace):
def normalize_slice(length, start, stop, step=None):
if not (step is None or step == 1):
raise ValueError("Stepped slices not supported in Span objects."
"Try: list(tokens)[start:stop:step] instead.")
raise ValueError(Errors.E057)
if start is None:
start = 0
elif start < 0:
@ -387,7 +376,6 @@ def normalize_slice(length, start, stop, step=None):
elif stop < 0:
stop += length
stop = min(length, max(start, stop))
assert 0 <= start <= stop <= length
return start, stop
@ -524,18 +512,6 @@ def from_disk(path, readers, exclude):
return path
def deprecated(message, filter='always'):
"""Show a deprecation warning.
message (unicode): The message to display.
filter (unicode): Filter value.
"""
stack = inspect.stack()[-1]
with warnings.catch_warnings():
warnings.simplefilter(filter, DeprecationWarning)
warnings.warn_explicit(message, DeprecationWarning, stack[1], stack[2])
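
With util.deprecated() removed here, deprecation warnings go through the deprecation_warning() helper imported from spacy.errors instead (see the tokenizer and Doc.merge hunks earlier in this diff). A plausible shape for such a helper, not necessarily the exact implementation added in spacy/errors.py:

    import warnings

    def deprecation_warning(message):
        # point the warning at the code that called the deprecated spaCy function,
        # roughly what the removed util.deprecated() did via warnings.warn_explicit
        # and stack inspection
        warnings.warn(message, DeprecationWarning, stacklevel=3)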
def print_table(data, title=None):
"""Print data in table format.

View File

@ -14,6 +14,7 @@ from thinc.neural._classes.model import Model
from .strings cimport StringStore, hash_string
from .compat import basestring_, path2str
from .errors import Errors
from . import util
from cython.operator cimport dereference as deref
@ -114,7 +115,7 @@ cdef class Vectors:
"""
i = self.key2row[key]
if i is None:
raise KeyError(key)
raise KeyError(Errors.E058.format(key=key))
else:
return self.data[i]
@ -215,7 +216,8 @@ cdef class Vectors:
RETURNS: The requested key, keys, row or rows.
"""
if sum(arg is None for arg in (key, keys, row, rows)) != 3:
raise ValueError("One (and only one) keyword arg must be set.")
bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows}
raise ValueError(Errors.E059.format(kwargs=bad_kwargs))
xp = get_array_module(self.data)
if key is not None:
if isinstance(key, basestring_):
@ -254,9 +256,9 @@ cdef class Vectors:
row = self.key2row[key]
elif row is None:
if self.is_full:
raise ValueError("Cannot add new key to vectors -- full")
raise ValueError(Errors.E060.format(rows=self.data.shape[0],
cols=self.data.shape[1]))
row = deref(self._unset.begin())
self.key2row[key] = row
if vector is not None:
self.data[row] = vector
@ -318,7 +320,7 @@ cdef class Vectors:
width = int(dims)
break
else:
raise IOError("Expected file named e.g. vectors.128.f.bin")
raise IOError(Errors.E061.format(filename=path))
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
dtype=dtype)
xp = get_array_module(self.data)

View File

@ -16,6 +16,7 @@ from .attrs cimport PROB, LANG, ORTH, TAG
from .structs cimport SerializedLexemeC
from .compat import copy_reg, basestring_
from .errors import Errors
from .lemmatizer import Lemmatizer
from .attrs import intify_attrs
from .vectors import Vectors
@ -100,15 +101,9 @@ cdef class Vocab:
flag_id = bit
break
else:
raise ValueError(
"Cannot find empty bit for new lexical flag. All bits "
"between 0 and 63 are occupied. You can replace one by "
"specifying the flag_id explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
raise ValueError(Errors.E062)
elif flag_id >= 64 or flag_id < 1:
raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between "
"1 and 63 (inclusive)" % flag_id)
raise ValueError(Errors.E063.format(value=flag_id))
for lex in self:
lex.set_flag(flag_id, flag_getter(lex.orth_))
self.lex_attr_getters[flag_id] = flag_getter
@ -127,8 +122,9 @@ cdef class Vocab:
cdef size_t addr
if lex != NULL:
if lex.orth != self.strings[string]:
raise LookupError.mismatched_strings(
lex.orth, self.strings[string], string)
raise KeyError(Errors.E064.format(string=lex.orth,
orth=self.strings[string],
orth_id=string))
return lex
else:
return self._new_lexeme(mem, string)
@ -171,7 +167,8 @@ cdef class Vocab:
if not is_oov:
key = hash_string(string)
self._add_lex_to_vocab(key, lex)
assert lex != NULL, string
if lex == NULL:
raise ValueError(Errors.E085.format(string=string))
return lex
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@ -254,7 +251,7 @@ cdef class Vocab:
width, you have to call this to change the size of the vectors.
"""
if width is not None and shape is not None:
raise ValueError("Only one of width and shape can be specified")
raise ValueError(Errors.E065.format(width=width, shape=shape))
elif shape is not None:
self.vectors = Vectors(shape=shape)
else:
@ -471,7 +468,10 @@ cdef class Vocab:
if ptr == NULL:
continue
py_str = self.strings[lexeme.orth]
assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
if self.strings[py_str] != lexeme.orth:
raise ValueError(Errors.E086.format(string=py_str,
orth_id=lexeme.orth,
hash_id=self.strings[py_str]))
key = hash_string(py_str)
self._by_hash.set(key, lexeme)
self._by_orth.set(lexeme.orth, lexeme)
@ -512,16 +512,3 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
class LookupError(Exception):
@classmethod
def mismatched_strings(cls, id_, id_string, original_string):
return cls(
"Error fetching a Lexeme from the Vocab. When looking up a "
"string, the lexeme returned had an orth ID that did not match "
"the query string. This means that the cached lexeme structs are "
"mismatched to the string encoding table. The mismatched:\n"
"Query string: {}\n"
"Orth cached: {}\n"
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))