💫 New system for error messages and warnings (#2163)

* Add spacy.errors module

* Update deprecation and user warnings

* Replace errors and asserts with new error message system

* Remove redundant asserts

* Fix whitespace

* Add messages for print/util.prints statements

* Fix typo

* Fix typos

* Move CLI messages to spacy.cli._messages

* Add decorator to display error code with message

An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.
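For illustration, a minimal sketch of how the retrieved string gets its code prefix (the `add_codes` decorator is the implementation added in spacy/errors.py below; the `Demo` class is just a toy stand-in):

def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()

@add_codes
class Demo(object):
    E001 = "No component '{name}' found in pipeline."

# The '[E001]' prefix is added only when the attribute is accessed:
print(Demo.E001.format(name='tagger'))
# [E001] No component 'tagger' found in pipeline.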

* Remove unused link in spacy.about

* Update errors for invalid pipeline components

* Improve error for unknown factories

* Add displaCy warnings

* Update formatting consistency

* Move error message to spacy.errors

* Update errors and check if doc returned by component is None
Ines Montani 2018-04-03 15:50:31 +02:00 committed by GitHub
parent abf8b16d71
commit 3141e04822
41 changed files with 652 additions and 443 deletions


@@ -4,18 +4,14 @@ from __future__ import unicode_literals
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
+from .errors import Warnings, deprecation_warning
 from . import util


 def load(name, **overrides):
     depr_path = overrides.get('path')
     if depr_path not in (True, False, None):
-        util.deprecated(
-            "As of spaCy v2.0, the keyword argument `path=` is deprecated. "
-            "You can now call spacy.load with the path as its first argument, "
-            "and the model's meta.json will be used to determine the language "
-            "to load. For example:\nnlp = spacy.load('{}')".format(depr_path),
-            'error')
+        deprecation_warning(Warnings.W001.format(path=depr_path))
     return util.load_model(name, **overrides)


@@ -23,6 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp

 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
+from .errors import Errors
 from . import util
@@ -174,7 +175,7 @@ class PrecomputableAffine(Model):
                 sgd(self._mem.weights, self._mem.gradient, key=self.id)
             return dXf.reshape((dXf.shape[0], self.nF, self.nI))
         return Yf, backward

     def _add_padding(self, Yf):
         Yf_padded = self.ops.xp.vstack((self.pad, Yf))
         return Yf_padded
@@ -340,10 +341,10 @@ def _divide_array(X, size):

 def get_col(idx):
-    assert idx >= 0, idx
+    if idx < 0:
+        raise IndexError(Errors.E066.format(value=idx))

     def forward(X, drop=0.):
-        assert idx >= 0, idx
         if isinstance(X, numpy.ndarray):
             ops = NumpyOps()
         else:
@@ -351,7 +352,6 @@ def get_col(idx):
         output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

         def backward(y, sgd=None):
-            assert idx >= 0, idx
             dX = ops.allocate(X.shape)
             dX[:, idx] += y
             return dX


@@ -11,7 +11,6 @@ __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
 __release__ = True

-__docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'

spacy/cli/_messages.py (new file, 73 lines)

# coding: utf8
from __future__ import unicode_literals


class Messages(object):
    M001 = ("Download successful but linking failed")
    M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
            "don't have admin permissions?), but you can still load the "
            "model via its full package name: nlp = spacy.load('{name}')")
    M003 = ("Server error ({code}: {desc})")
    M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy "
            "installation (v{version}), and download it manually. For more "
            "details, see the documentation: https://spacy.io/usage/models")
    M005 = ("Compatibility error")
    M006 = ("No compatible models found for v{version} of spaCy.")
    M007 = ("No compatible model found for '{name}' (spaCy v{version}).")
    M008 = ("Can't locate model data")
    M009 = ("The data should be located in {path}")
    M010 = ("Can't find the spaCy data path to create model symlink")
    M011 = ("Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be "
            "located here:")
    M012 = ("Link '{name}' already exists")
    M013 = ("To overwrite an existing link, use the --force flag.")
    M014 = ("Can't overwrite symlink '{name}'")
    M015 = ("This can happen if your data directory contains a directory or "
            "file of the same name.")
    M016 = ("Error: Couldn't link model to '{name}'")
    M017 = ("Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.")
    M018 = ("Linking successful")
    M019 = ("You can now load the model via spacy.load('{name}')")
    M020 = ("Can't find model meta.json")
    M021 = ("Couldn't fetch compatibility table.")
    M022 = ("Can't find spaCy v{version} in compatibility table")
    M023 = ("Installed models (spaCy v{version})")
    M024 = ("No models found in your current environment.")
    M025 = ("Use the following commands to update the model packages:")
    M026 = ("The following models are not available for spaCy "
            "v{version}: {models}")
    M027 = ("You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. Data path: {path}")
    M028 = ("Input file not found")
    M029 = ("Output directory not found")
    M030 = ("Unknown format")
    M031 = ("Can't find converter for {converter}")
    M032 = ("Generated output file {name}")
    M033 = ("Created {n_docs} documents")
    M034 = ("Evaluation data not found")
    M035 = ("Visualization output directory not found")
    M036 = ("Generated {n} parses as HTML")
    M037 = ("Can't find words frequencies file")
    M038 = ("Sucessfully compiled vocab")
    M039 = ("{entries} entries, {vectors} vectors")
    M040 = ("Output directory not found")
    M041 = ("Loaded meta.json from file")
    M042 = ("Successfully created package '{name}'")
    M043 = ("To build the package, run `python setup.py sdist` in this "
            "directory.")
    M044 = ("Package directory already exists")
    M045 = ("Please delete the directory and try again, or use the `--force` "
            "flag to overwrite existing directories.")
    M046 = ("Generating meta.json")
    M047 = ("Enter the package settings for your model. The following "
            "information will be read from your model data: pipeline, vectors.")
    M048 = ("No '{key}' setting found in meta.json")
    M049 = ("This setting is required to build your package.")
    M050 = ("Training data not found")
    M051 = ("Development data not found")
    M052 = ("Not a valid meta.json format")
    M053 = ("Expected dict but got: {meta_type}")


@@ -5,6 +5,7 @@ import plac
 from pathlib import Path

 from .converters import conllu2json, iob2json, conll_ner2json
+from ._messages import Messages
 from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new
@@ -32,14 +33,14 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto'
     input_path = Path(input_file)
     output_path = Path(output_dir)
     if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=1)
+        prints(input_path, title=Messages.M028, exits=1)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=1)
+        prints(output_path, title=Messages.M029, exits=1)
     if converter == 'auto':
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-        prints("Can't find converter for %s" % converter,
-               title="Unknown format", exits=1)
+        prints(Messages.M031.format(converter=converter),
+               title=Messages.M030, exits=1)
     func = CONVERTERS[converter]
     func(input_path, output_path,
          n_sents=n_sents, use_morphology=morphology)


@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .._messages import Messages
 from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo
@@ -18,8 +19,8 @@ def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    prints(Messages.M033.format(n_docs=len(docs)),
+           title=Messages.M032.format(name=path2str(output_file)))


 def read_conll_ner(input_path):


@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .._messages import Messages
 from ...compat import json_dumps, path2str
 from ...util import prints
@@ -32,8 +33,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    prints(Messages.M033.format(n_docs=len(docs)),
+           title=Messages.M032.format(name=path2str(output_file)))


 def read_conllx(input_path, use_morphology=False, n=0):


@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 from cytoolz import partition_all, concat

+from .._messages import Messages
 from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo
@@ -18,8 +19,8 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    prints(Messages.M033.format(n_docs=len(docs)),
+           title=Messages.M032.format(name=path2str(output_file)))


 def read_iob(raw_sents):


@@ -8,6 +8,7 @@ import sys
 import ujson

 from .link import link
+from ._messages import Messages
 from ..util import prints, get_package_path
 from ..compat import url_read, HTTPError
 from .. import about
@@ -32,9 +33,7 @@ def download(model, direct=False):
         version = get_version(model_name, compatibility)
         dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
                                                             v=version))
-        if dl != 0:
-            # if download subprocess doesn't return 0, exit with the respective
-            # exit code before doing anything else
+        if dl != 0:  # if download subprocess doesn't return 0, exit
             sys.exit(dl)
         try:
             # Get package path here because link uses
@@ -48,22 +47,15 @@
             # Dirty, but since spacy.download and the auto-linking is
             # mostly a convenience wrapper, it's best to show a success
             # message and loading instructions, even if linking fails.
-            prints(
-                "Creating a shortcut link for 'en' didn't work (maybe "
-                "you don't have admin permissions?), but you can still "
-                "load the model via its full package name:",
-                "nlp = spacy.load('%s')" % model_name,
-                title="Download successful but linking failed")
+            prints(Messages.M001.format(name=model_name), title=Messages.M002)

 def get_json(url, desc):
     try:
         data = url_read(url)
     except HTTPError as e:
-        msg = ("Couldn't fetch %s. Please find a model for your spaCy "
-               "installation (v%s), and download it manually.")
-        prints(msg % (desc, about.__version__), about.__docs_models__,
-               title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
+        prints(Messages.M004.format(desc=desc, version=about.__version__),
+               title=Messages.M003.format(code=e.code, desc=e.reason), exits=1)
     return ujson.loads(data)
@@ -73,17 +65,16 @@
     comp_table = get_json(about.__compatibility__, "compatibility table")
     comp = comp_table['spacy']
     if version not in comp:
-        prints("No compatible models found for v%s of spaCy." % version,
-               title="Compatibility error", exits=1)
+        prints(Messages.M006.format(version=version), title=Messages.M005,
+               exits=1)
     return comp[version]


 def get_version(model, comp):
     model = model.rsplit('.dev', 1)[0]
     if model not in comp:
-        version = about.__version__
-        msg = "No compatible model found for '%s' (spaCy v%s)."
-        prints(msg % (model, version), title="Compatibility error", exits=1)
+        prints(Messages.M007.format(name=model, version=about.__version__),
+               title=Messages.M005, exits=1)
     return comp[model][0]


@@ -4,6 +4,7 @@ from __future__ import unicode_literals, division, print_function
 import plac
 from timeit import default_timer as timer

+from ._messages import Messages
 from ..gold import GoldCorpus
 from ..util import prints
 from .. import util
@@ -33,10 +34,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
     data_path = util.ensure_path(data_path)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
-        prints(data_path, title="Evaluation data not found", exits=1)
+        prints(data_path, title=Messages.M034, exits=1)
     if displacy_path and not displacy_path.exists():
-        prints(displacy_path, title="Visualization output directory not found",
-               exits=1)
+        prints(displacy_path, title=Messages.M035, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@@ -52,8 +52,7 @@
         render_ents = 'ner' in nlp.meta.get('pipeline', [])
         render_parses(docs, displacy_path, model_name=model,
                       limit=displacy_limit, deps=render_deps, ents=render_ents)
-        msg = "Generated %s parses as HTML" % displacy_limit
-        prints(displacy_path, title=msg)
+        prints(displacy_path, title=Messages.M036.format(n=displacy_limit))


 def render_parses(docs, output_path, model_name='', limit=250, deps=True,


@@ -5,9 +5,10 @@ import plac
 import platform
 from pathlib import Path

+from ._messages import Messages
 from ..compat import path2str
-from .. import about
 from .. import util
+from .. import about


 @plac.annotations(
@@ -25,7 +26,7 @@ def info(model=None, markdown=False):
         model_path = util.get_data_path() / model
         meta_path = model_path / 'meta.json'
         if not meta_path.is_file():
-            util.prints(meta_path, title="Can't find model meta.json", exits=1)
+            util.prints(meta_path, title=Messages.M020, exits=1)
         meta = util.read_json(meta_path)
         if model_path.resolve() != model_path:
             meta['link'] = path2str(model_path)


@@ -11,7 +11,9 @@ from preshed.counter import PreshCounter
 import tarfile
 import gzip

+from ._messages import Messages
 from ..vectors import Vectors
+from ..errors import Warnings, user_warning
 from ..util import prints, ensure_path, get_lang_class

 try:
@@ -37,16 +39,13 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=
     and word vectors.
     """
     if freqs_loc is not None and not freqs_loc.exists():
-        prints(freqs_loc, title="Can't find words frequencies file", exits=1)
+        prints(freqs_loc, title=Messages.M037, exits=1)
     clusters_loc = ensure_path(clusters_loc)
     vectors_loc = ensure_path(vectors_loc)
-
     probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
     vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}
-
     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
-
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)
@@ -69,7 +68,6 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors):
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = 0
-
     lex_added = 0
     for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
         lexeme = nlp.vocab[word]
@@ -89,15 +87,13 @@
         lexeme = nlp.vocab[word]
         lexeme.is_oov = False
         lex_added += 1
-
     if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
     vec_added = len(nlp.vocab.vectors)
-    prints("{} entries, {} vectors".format(lex_added, vec_added),
-           title="Sucessfully compiled vocab")
+    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
+           title=Messages.M038)
     return nlp
@@ -145,7 +141,7 @@ def read_clusters(clusters_loc):
     print("Reading clusters...")
     clusters = {}
     if ftfy is None:
-        print("Warning: No text fixing. Run pip install ftfy if necessary")
+        user_warning(Warnings.W004)
     with clusters_loc.open() as f:
         for line in tqdm(f):
             try:


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path

+from ._messages import Messages
 from ..compat import symlink_to, path2str
 from ..util import prints
 from .. import util
@@ -24,40 +25,29 @@ def link(origin, link_name, force=False, model_path=None):
     else:
         model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
-        prints("The data should be located in %s" % path2str(model_path),
-               title="Can't locate model data", exits=1)
+        prints(Messages.M009.format(path=path2str(model_path)),
+               title=Messages.M008, exits=1)
     data_path = util.get_data_path()
     if not data_path or not data_path.exists():
         spacy_loc = Path(__file__).parent.parent
-        prints("Make sure a directory `/data` exists within your spaCy "
-               "installation and try again. The data directory should be "
-               "located here:", path2str(spacy_loc), exits=1,
-               title="Can't find the spaCy data path to create model symlink")
+        prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
     link_path = util.get_data_path() / link_name
     if link_path.is_symlink() and not force:
-        prints("To overwrite an existing link, use the --force flag.",
-               title="Link %s already exists" % link_name, exits=1)
+        prints(Messages.M013, title=Messages.M012.format(name=link_name),
+               exits=1)
     elif link_path.is_symlink():  # does a symlink exist?
         # NB: It's important to check for is_symlink here and not for exists,
         # because invalid/outdated symlinks would return False otherwise.
         link_path.unlink()
     elif link_path.exists():  # does it exist otherwise?
         # NB: Check this last because valid symlinks also "exist".
-        prints("This can happen if your data directory contains a directory "
-               "or file of the same name.", link_path,
-               title="Can't overwrite symlink %s" % link_name, exits=1)
+        prints(Messages.M015, link_path,
+               title=Messages.M014.format(name=link_name), exits=1)
+    msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
     try:
         symlink_to(link_path, model_path)
     except:
         # This is quite dirty, but just making sure other errors are caught.
-        prints("Creating a symlink in spacy/data failed. Make sure you have "
-               "the required permissions and try re-running the command as "
-               "admin, or use a virtualenv. You can still import the model as "
-               "a module and call its load() method, or create the symlink "
-               "manually.",
-               "%s --> %s" % (path2str(model_path), path2str(link_path)),
-               title="Error: Couldn't link model to '%s'" % link_name)
+        prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
         raise
-    prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
-           "You can now load the model via spacy.load('%s')" % link_name,
-           title="Linking successful")
+    prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)


@@ -5,6 +5,7 @@ import plac
 import shutil
 from pathlib import Path

+from ._messages import Messages
 from ..compat import path2str, json_dumps
 from ..util import prints
 from .. import util
@@ -31,17 +32,17 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
-        prints(input_path, title="Model directory not found", exits=1)
+        prints(input_path, title=Messages.M008, exits=1)
     if not output_path or not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=1)
+        prints(output_path, title=Messages.M040, exits=1)
     if meta_path and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=1)
+        prints(meta_path, title=Messages.M020, exits=1)

     meta_path = meta_path or input_path / 'meta.json'
     if meta_path.is_file():
         meta = util.read_json(meta_path)
         if not create_meta:  # only print this if user doesn't want to overwrite
-            prints(meta_path, title="Loaded meta.json from file")
+            prints(meta_path, title=Messages.M041)
         else:
             meta = generate_meta(input_dir, meta)
     meta = validate_meta(meta, ['lang', 'name', 'version'])
@@ -57,9 +58,8 @@
     create_file(main_path / 'setup.py', TEMPLATE_SETUP)
     create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
     create_file(package_path / '__init__.py', TEMPLATE_INIT)
-    prints(main_path, "To build the package, run `python setup.py sdist` in "
-           "this directory.",
-           title="Successfully created package '%s'" % model_name_v)
+    prints(main_path, Messages.M043,
+           title=Messages.M042.format(name=model_name_v))


 def create_dirs(package_path, force):
@@ -67,10 +67,7 @@ def create_dirs(package_path, force):
         if force:
             shutil.rmtree(path2str(package_path))
         else:
-            prints(package_path, "Please delete the directory and try again, "
-                   "or use the --force flag to overwrite existing "
-                   "directories.", title="Package directory already exists",
-                   exits=1)
+            prints(package_path, Messages.M045, title=Messages.M044, exits=1)
     Path.mkdir(package_path, parents=True)
@@ -97,9 +94,7 @@ def generate_meta(model_path, existing_meta):
     meta['vectors'] = {'width': nlp.vocab.vectors_length,
                        'vectors': len(nlp.vocab.vectors),
                        'keys': nlp.vocab.vectors.n_keys}
-    prints("Enter the package settings for your model. The following "
-           "information will be read from your model data: pipeline, vectors.",
-           title="Generating meta.json")
+    prints(Messages.M047, title=Messages.M046)
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
@@ -111,8 +106,7 @@
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
-            prints("This setting is required to build your package.",
-                   title='No "%s" setting found in meta.json' % key, exits=1)
+            prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
     return meta


@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer

+from ._messages import Messages
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
@@ -54,15 +55,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
-        prints(train_path, title="Training data not found", exits=1)
+        prints(train_path, title=Messages.M050, exits=1)
     if dev_path and not dev_path.exists():
-        prints(dev_path, title="Development data not found", exits=1)
+        prints(dev_path, title=Messages.M051, exits=1)
     if meta_path is not None and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=1)
+        prints(meta_path, title=Messages.M020, exits=1)
     meta = util.read_json(meta_path) if meta_path else {}
     if not isinstance(meta, dict):
-        prints("Expected dict but got: {}".format(type(meta)),
-               title="Not a valid meta.json format", exits=1)
+        prints(Messages.M053.format(meta_type=type(meta)),
+               title=Messages.M052, exits=1)
     meta.setdefault('lang', lang)
     meta.setdefault('name', 'unnamed')


@@ -6,6 +6,7 @@ from pathlib import Path
 import sys
 import ujson

+from ._messages import Messages
 from ..compat import path2str, locale_escape, url_read, HTTPError
 from ..util import prints, get_data_path, read_json
 from .. import about
@@ -18,14 +19,13 @@ def validate():
     try:
         data = url_read(about.__compatibility__)
     except HTTPError as e:
-        prints("Couldn't fetch compatibility table.",
-               title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
+        title = Messages.M003.format(code=e.code, desc=e.reason)
+        prints(Messages.M021, title=title, exits=1)
     compat = ujson.loads(data)['spacy']
     current_compat = compat.get(about.__version__)
     if not current_compat:
         prints(about.__compatibility__, exits=1,
-               title="Can't find spaCy v{} in compatibility table"
-               .format(about.__version__))
+               title=Messages.M022.format(version=about.__version__))
     all_models = set()
     for spacy_v, models in dict(compat).items():
         all_models.update(models.keys())
@@ -42,7 +42,7 @@ def validate():
     update_models = [m for m in incompat_models if m in current_compat]
     prints(path2str(Path(__file__).parent.parent),
-           title="Installed models (spaCy v{})".format(about.__version__))
+           title=Messages.M023.format(version=about.__version__))
     if model_links or model_pkgs:
         print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
         for name, data in model_pkgs.items():
@@ -50,23 +50,16 @@
             print(get_model_row(current_compat, name, data, 'package'))
         for name, data in model_links.items():
             print(get_model_row(current_compat, name, data, 'link'))
     else:
-        prints("No models found in your current environment.", exits=0)
+        prints(Messages.M024, exits=0)
     if update_models:
         cmd = '    python -m spacy download {}'
-        print("\n    Use the following commands to update the model packages:")
+        print("\n    " + Messages.M025)
         print('\n'.join([cmd.format(pkg) for pkg in update_models]))
     if na_models:
-        prints("The following models are not available for spaCy v{}: {}"
-               .format(about.__version__, ', '.join(na_models)))
+        prints(Messages.M026.format(version=about.__version__,
                                     models=', '.join(na_models)))
     if incompat_links:
-        prints("You may also want to overwrite the incompatible links using "
-               "the `python -m spacy link` command with `--force`, or remove "
-               "them from the data directory. Data path: {}"
-               .format(path2str(get_data_path())))
+        prints(Messages.M027.format(path=path2str(get_data_path())))
     if incompat_models or incompat_links:
         sys.exit(1)


@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc
 from ..compat import b_to_str
+from ..errors import Errors, Warnings, user_warning
 from ..util import prints, is_in_jupyter
@@ -27,7 +28,7 @@
     factories = {'dep': (DependencyRenderer, parse_deps),
                  'ent': (EntityRenderer, parse_ents)}
     if style not in factories:
-        raise ValueError("Unknown style: %s" % style)
+        raise ValueError(Errors.E087.format(style=style))
     if isinstance(docs, Doc) or isinstance(docs, dict):
         docs = [docs]
     renderer, converter = factories[style]
@@ -57,12 +58,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
     render(docs, style=style, page=page, minify=minify, options=options,
            manual=manual)
     httpd = simple_server.make_server('0.0.0.0', port, app)
-    prints("Using the '%s' visualizer" % style,
-           title="Serving on port %d..." % port)
+    prints("Using the '{}' visualizer".format(style),
+           title="Serving on port {}...".format(port))
     try:
         httpd.serve_forever()
     except KeyboardInterrupt:
-        prints("Shutting down server on port %d." % port)
+        prints("Shutting down server on port {}.".format(port))
     finally:
         httpd.server_close()
@@ -83,6 +84,8 @@ def parse_deps(orig_doc, options={}):
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
+    if not doc.is_parsed:
+        user_warning(Warnings.W005)
     if options.get('collapse_punct', True):
         spans = []
         for word in doc[:-1]:
@@ -120,6 +123,8 @@ def parse_ents(doc, options={}):
     """
     ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
             for ent in doc.ents]
+    if not ents:
+        user_warning(Warnings.W006)
     title = (doc.user_data.get('title', None)
              if hasattr(doc, 'user_data') else None)
     return {'text': doc.text, 'ents': ents, 'title': title}

spacy/errors.py (new file, 297 lines)

# coding: utf8
from __future__ import unicode_literals

import os
import warnings
import inspect


def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()

@add_codes
class Warnings(object):
    W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
            "You can now call spacy.load with the path as its first argument, "
            "and the model's meta.json will be used to determine the language "
            "to load. For example:\nnlp = spacy.load('{path}')")
    W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
            "instead and pass in the strings as the `words` keyword argument, "
            "for example:\nfrom spacy.tokens import Doc\n"
            "doc = Doc(nlp.vocab, words=[...])")
    W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
            "the keyword arguments, for example tag=, lemma= or ent_type=.")
    W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
            "using ftfy.fix_text if necessary.")
    W005 = ("Doc object not parsed. This means displaCy won't be able to "
            "generate a dependency visualization for it. Make sure the Doc "
            "was processed with a model that supports dependency parsing, and "
            "not just a language class like `English()`. For more info, see "
            "the docs:\nhttps://spacy.io/usage/models")
    W006 = ("No entities to visualize found in Doc object. If this is "
            "surprising to you, make sure the Doc was processed using a model "
            "that supports named entity recognition, and check the `doc.ents` "
            "property manually if necessary.")

@add_codes
class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
    E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
            "calls `nlp.create_pipe` with a component name that's not built "
            "in - for example, when constructing the pipeline from a model's "
            "meta.json. If you're using a custom component, you can write to "
            "`Language.factories['{name}']` or remove it from the model meta "
            "and add it via `nlp.add_pipe` instead.")
    E003 = ("Not a valid pipeline component. Expected callable, but "
            "got {component} (name: '{name}').")
    E004 = ("If you meant to add a built-in component, use `create_pipe`: "
            "`nlp.add_pipe(nlp.create_pipe('{component}'))`")
    E005 = ("Pipeline component '{name}' returned None. If you're using a "
            "custom component, maybe you forgot to return the processed Doc?")
    E006 = ("Invalid constraints. You can only set one of the following: "
            "before, after, first, last.")
    E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
    E008 = ("Some current components would be lost when restoring previous "
            "pipeline state. If you added components after calling "
            "`nlp.disable_pipes()`, you should remove them explicitly with "
            "`nlp.remove_pipe()` before the pipeline is restored. Names of "
            "the new components: {names}")
    E009 = ("The `update` method expects same number of docs and golds, but "
            "got: {n_docs} docs, {n_golds} golds.")
    E010 = ("Word vectors set to length 0. This may be because you don't have "
            "a model installed or loaded, or because your model doesn't "
            "include word vectors. For more info, see the docs:\n"
            "https://spacy.io/usage/models")
    E011 = ("Unknown operator: '{op}'. Options: {opts}")
    E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
    E013 = ("Error selecting action in matcher")
    E014 = ("Uknown tag ID: {tag}")
    E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
            "`force=True` to overwrite.")
    E016 = ("MultitaskObjective target should be function or one of: dep, "
            "tag, ent, dep_tag_offset, ent_tag.")
    E017 = ("Can only add unicode or bytes. Got type: {value_type}")
    E018 = ("Can't retrieve string for hash '{hash_value}'.")
    E019 = ("Can't create transition with unknown action ID: {action}. Action "
            "IDs are enumerated in spacy/syntax/{src}.pyx.")
    E020 = ("Could not find a gold-standard action to supervise the "
            "dependency parser. The tree is non-projective (i.e. it has "
            "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
            "The ArcEager transition system only supports projective trees. "
            "To learn non-projective representations, transform the data "
            "before training and after parsing. Either pass "
            "`make_projective=True` to the GoldParse class, or use "
            "spacy.syntax.nonproj.preprocess_training_data.")
    E021 = ("Could not find a gold-standard action to supervise the "
            "dependency parser. The GoldParse was projective. The transition "
            "system has {n_actions} actions. State at failure: {state}")
    E022 = ("Could not find a transition with the name '{name}' in the NER "
            "model.")
    E023 = ("Error cleaning up beam: The same state occurred twice at "
            "memory address {addr} and position {i}.")
    E024 = ("Could not find an optimal move to supervise the parser. Usually, "
            "this means the GoldParse was not correct. For example, are all "
            "labels added to the model?")
    E025 = ("String is too long: {length} characters. Max is 2**30.")
    E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
            "length {length}.")
    E027 = ("Arguments 'words' and 'spaces' should be sequences of the same "
            "length, or 'spaces' should be left default at None. spaces "
            "should be a sequence of booleans, with True meaning that the "
            "word owns a ' ' character following it.")
    E028 = ("orths_and_spaces expects either a list of unicode string or a "
            "list of (unicode, bool) tuples. Got bytes instance: {value}")
    E029 = ("noun_chunks requires the dependency parse, which requires a "
            "statistical model to be installed and loaded. For more info, see "
            "the documentation:\nhttps://spacy.io/usage/models")
    E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
            "component to the pipeline with: "
            "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
            "Alternatively, add the dependency parser, or set sentence "
            "boundaries by setting doc[i].is_sent_start.")
    E031 = ("Invalid token: empty string ('') at position {i}.")
    E032 = ("Conflicting attributes specified in doc.from_array(): "
            "(HEAD, SENT_START). The HEAD attribute currently sets sentence "
            "boundaries implicitly, based on the tree structure. This means "
            "the HEAD attribute would potentially override the sentence "
            "boundaries set by SENT_START.")
    E033 = ("Cannot load into non-empty Doc of length {length}.")
    E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
            "either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
            "Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
    E035 = ("Error creating span with start {start} and end {end} for Doc of "
            "length {length}.")
    E036 = ("Error calculating span: Can't find a token starting at character "
            "offset {start}.")
    E037 = ("Error calculating span: Can't find a token ending at character "
            "offset {end}.")
    E038 = ("Error finding sentence for span. Infinite loop detected.")
    E039 = ("Array bounds exceeded while searching for root word. This likely "
            "means the parse tree is in an invalid state. Please report this "
            "issue here: http://github.com/explosion/spaCy/issues")
    E040 = ("Attempt to access token at {i}, max length {max_length}.")
    E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
    E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.")
    E043 = ("Refusing to write to token.sent_start if its document is parsed, "
            "because this may cause inconsistent state.")
    E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
            "None, True, False")
    E045 = ("Possibly infinite loop encountered while looking for {attr}.")
    E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did "
            "you forget to call the `set_extension` method?")
    E047 = ("Can't assign a value to unregistered extension attribute "
            "'{name}'. Did you forget to call the `set_extension` method?")
    E048 = ("Can't import language {lang} from spacy.lang.")
    E049 = ("Can't find spaCy data directory: '{path}'. Check your "
            "installation and permissions, or use spacy.util.set_data_path "
            "to customise the location if necessary.")
    E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
            "link, a Python package or a valid path to a data directory.")
    E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
            "it points to a valid package (not just a data directory).")
    E052 = ("Can't find model directory: {path}")
    E053 = ("Could not read meta.json from {path}")
    E054 = ("No valid '{setting}' setting found in model meta.json.")
    E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
    E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
            "original string.\nKey: {key}\nOrths: {orths}")
    E057 = ("Stepped slices not supported in Span objects. Try: "
            "list(tokens)[start:stop:step] instead.")
    E058 = ("Could not retrieve vector for key {key}.")
    E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
    E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
            "({rows}, {cols}).")
    E061 = ("Bad file name: {filename}. Example of a valid file name: "
            "'vectors.128.f.bin'")
    E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
            "and 63 are occupied. You can replace one by specifying the "
            "`flag_id` explicitly, e.g. "
            "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
    E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 "
            "and 63 (inclusive).")
    E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
            "string, the lexeme returned had an orth ID that did not match "
            "the query string. This means that the cached lexeme structs are "
            "mismatched to the string encoding table. The mismatched:\n"
            "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
    E065 = ("Only one of the vector table's width and shape can be specified. "
            "Got width {width} and shape {shape}.")
    E066 = ("Error creating model helper for extracting columns. Can only "
            "extract columns by positive integer. Got: {value}.")
    E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
            "an entity) without a preceding 'B' (beginning of an entity). "
            "Tag sequence:\n{tags}")
    E068 = ("Invalid BILUO tag: '{tag}'.")
    E069 = ("Invalid gold-standard parse tree. Found cycle between word "
            "IDs: {cycle}")
    E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
            "does not align with number of annotations ({n_annots}).")
    E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
            "match the one in the vocab ({vocab_orth}).")
    E072 = ("Error serializing lexeme: expected data length {length}, "
            "got {bad_length}.")
    E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
            "are of length {length}. You can use `vocab.reset_vectors` to "
            "clear the existing vectors and resize the table.")
    E074 = ("Error interpreting compiled match pattern: patterns are expected "
            "to end with the attribute {attr}. Got: {bad_attr}.")
    E075 = ("Error accepting match: length ({length}) > maximum length "
            "(120,944).")
    E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
            "has {words} words.")
    E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
            "equal number of GoldParse objects ({n_golds}) in batch.")
    E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
            "not equal number of words in GoldParse ({words_gold}).")
    E079 = ("Error computing states in beam: number of predicted beams "
            "({pbeams}) does not equal number of gold beams ({gbeams}).")
    E080 = ("Duplicate state found in beam: {key}.")
    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
            "does not equal number of losses ({losses}).")
    E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
            "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
            "match.")
    E083 = ("Error setting extension: only one of default, getter, setter and "
            "method is allowed. {n_args} keyword arguments were specified.")
    E084 = ("Error assigning label ID {label} to span: not in StringStore.")
    E085 = ("Can't create lexeme for string '{string}'.")
    E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
            "not match hash {hash_id} in StringStore.")
    E087 = ("Unknown displaCy style: {style}.")
    E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
            "v2.x parser and NER models require roughly 1GB of temporary "
            "memory per 100,000 characters in the input. This means long "
            "texts may cause memory allocation errors. If you're not using "
            "the parser or NER, it's probably safe to increase the "
            "`nlp.max_length` limit. The limit is in number of characters, so "
            "you can check whether your inputs are too long by checking "
            "`len(text)`.")

@add_codes
class TempErrors(object):
    T001 = ("Max length currently 10 for phrase matching")
    T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length "
            "(120,944). Length can be set on initialization, up to 10.")
    T003 = ("Resizing pre-trained Tagger models is not currently supported.")
    T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
    T005 = ("Currently history size is hard-coded to 0. Received: {value}.")
    T006 = ("Currently history width is hard-coded to 0. Received: {value}.")
    T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
            "issue tracker: http://github.com/explosion/spaCy/issues")

class ModelsWarning(UserWarning):
    pass


WARNINGS = {
    'user': UserWarning,
    'deprecation': DeprecationWarning,
    'models': ModelsWarning,
}


def _get_warn_types(arg):
    if arg == '':  # don't show any warnings
        return []
    if not arg or arg == 'all':  # show all available warnings
        return WARNINGS.keys()
    return [w_type.strip() for w_type in arg.split(',')
            if w_type.strip() in WARNINGS]


SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always')
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))


def user_warning(message):
    _warn(message, 'user')


def deprecation_warning(message):
    _warn(message, 'deprecation')


def models_warning(message):
    _warn(message, 'models')


def _warn(message, warn_type='user'):
    """
    message (unicode): The message to display.
    category (Warning): The Warning to show.
    """
    if warn_type in SPACY_WARNING_TYPES:
        category = WARNINGS[warn_type]
        stack = inspect.stack()[-1]
        with warnings.catch_warnings():
            warnings.simplefilter(SPACY_WARNING_FILTER, category)
            warnings.warn_explicit(message, category, stack[1], stack[2])
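
A brief usage sketch (the filter values below are illustrative): both environment variables are read once, at module import time, so they have to be set before spaCy is imported:

import os
os.environ['SPACY_WARNING_TYPES'] = 'models,deprecation'  # drop 'user' warnings
os.environ['SPACY_WARNING_FILTER'] = 'once'  # show each warning only once

import spacy  # spacy.errors reads the variables at this point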


@@ -10,6 +10,7 @@ import itertools

 from .syntax import nonproj
 from .tokens import Doc
+from .errors import Errors
 from . import util
 from .util import minibatch
@@ -28,7 +29,8 @@ def tags_to_entities(tags):
         elif tag == '-':
             continue
         elif tag.startswith('I'):
-            assert start is not None, tags[:i]
+            if start is None:
+                raise ValueError(Errors.E067.format(tags=tags[:i]))
             continue
         if tag.startswith('U'):
             entities.append((tag[2:], i, i))
@@ -38,7 +40,7 @@ def tags_to_entities(tags):
             entities.append((tag[2:], start, i))
             start = None
         else:
-            raise Exception(tag)
+            raise ValueError(Errors.E068.format(tag=tag))
     return entities
@@ -238,7 +240,9 @@ class GoldCorpus(object):
     @classmethod
     def _make_golds(cls, docs, paragraph_tuples):
-        assert len(docs) == len(paragraph_tuples)
+        if len(docs) != len(paragraph_tuples):
+            raise ValueError(Errors.E070.format(n_docs=len(docs),
+                                                n_annots=len(paragraph_tuples)))
         if len(docs) == 1:
             return [GoldParse.from_annot_tuples(docs[0],
                                                 paragraph_tuples[0][0])]
@@ -461,7 +465,7 @@ cdef class GoldParse:
             cycle = nonproj.contains_cycle(self.heads)
             if cycle is not None:
-                raise Exception("Cycle found: %s" % cycle)
+                raise ValueError(Errors.E069.format(cycle=cycle))
             if make_projective:
                 proj_heads, _ = nonproj.projectivize(self.heads, self.labels)


@@ -28,6 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
 from .lang.tokenizer_exceptions import TOKEN_MATCH
 from .lang.tag_map import TAG_MAP
 from .lang.lex_attrs import LEX_ATTRS, is_stop
+from .errors import Errors
 from . import util
 from . import about
@@ -217,8 +218,7 @@ class Language(object):
         for pipe_name, component in self.pipeline:
             if pipe_name == name:
                 return component
-        msg = "No component '{}' found in pipeline. Available names: {}"
-        raise KeyError(msg.format(name, self.pipe_names))
+        raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names))

     def create_pipe(self, name, config=dict()):
         """Create a pipeline component from a factory.
@@ -228,7 +228,7 @@ class Language(object):
         RETURNS (callable): Pipeline component.
         """
         if name not in self.factories:
-            raise KeyError("Can't find factory for '{}'.".format(name))
+            raise KeyError(Errors.E002.format(name=name))
         factory = self.factories[name]
         return factory(self, **config)
@@ -253,12 +253,9 @@ class Language(object):
         >>> nlp.add_pipe(component, name='custom_name', last=True)
         """
         if not hasattr(component, '__call__'):
-            msg = ("Not a valid pipeline component. Expected callable, but "
-                   "got {}. ".format(repr(component)))
+            msg = Errors.E003.format(component=repr(component), name=name)
             if isinstance(component, basestring_) and component in self.factories:
-                msg += ("If you meant to add a built-in component, use "
-                        "create_pipe: nlp.add_pipe(nlp.create_pipe('{}'))"
-                        .format(component))
+                msg += Errors.E004.format(component=component)
             raise ValueError(msg)
         if name is None:
             if hasattr(component, 'name'):
@@ -271,11 +268,9 @@ class Language(object):
             else:
                 name = repr(component)
         if name in self.pipe_names:
-            raise ValueError("'{}' already exists in pipeline.".format(name))
+            raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
-            msg = ("Invalid constraints. You can only set one of the "
-                   "following: before, after, first, last.")
-            raise ValueError(msg)
+            raise ValueError(Errors.E006)
         pipe = (name, component)
         if last or not any([first, before, after]):
             self.pipeline.append(pipe)
@@ -286,9 +281,8 @@ class Language(object):
         elif after and after in self.pipe_names:
             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
         else:
-            msg = "Can't find '{}' in pipeline. Available names: {}"
-            unfound = before or after
-            raise ValueError(msg.format(unfound, self.pipe_names))
+            raise ValueError(Errors.E001.format(name=before or after,
+                                                opts=self.pipe_names))

     def has_pipe(self, name):
         """Check if a component name is present in the pipeline. Equivalent to
@@ -306,8 +300,7 @@ class Language(object):
         component (callable): Pipeline component.
         """
         if name not in self.pipe_names:
-            msg = "Can't find '{}' in pipeline. Available names: {}"
-            raise ValueError(msg.format(name, self.pipe_names))
+            raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
         self.pipeline[self.pipe_names.index(name)] = (name, component)

     def rename_pipe(self, old_name, new_name):
@@ -317,11 +310,9 @@ class Language(object):
         new_name (unicode): New name of the component.
         """
         if old_name not in self.pipe_names:
-            msg = "Can't find '{}' in pipeline. Available names: {}"
-            raise ValueError(msg.format(old_name, self.pipe_names))
+            raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names))
         if new_name in self.pipe_names:
-            msg = "'{}' already exists in pipeline. Existing names: {}"
-            raise ValueError(msg.format(new_name, self.pipe_names))
+            raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names))
         i = self.pipe_names.index(old_name)
         self.pipeline[i] = (new_name, self.pipeline[i][1])
@@ -332,8 +323,7 @@ class Language(object):
         RETURNS (tuple): A `(name, component)` tuple of the removed component.
         """
         if name not in self.pipe_names:
-            msg = "Can't find '{}' in pipeline. Available names: {}"
-            raise ValueError(msg.format(name, self.pipe_names))
+            raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
         return self.pipeline.pop(self.pipe_names.index(name))

     def __call__(self, text, disable=[]):
@@ -351,21 +341,17 @@ class Language(object):
             ('An', 'NN')
         """
         if len(text) >= self.max_length:
-            msg = (
-                "Text of length {length} exceeds maximum of {max_length}. "
-                "The v2 parser and NER models require roughly 1GB of temporary "
-                "memory per 100,000 characters in the input. This means long "
-                "texts may cause memory allocation errors. If you're not using "
-                "the parser or NER, it's probably safe to increase the "
-                "nlp.max_length limit. The limit is in number of characters, "
-                "so you can check whether your inputs are too long by checking "
-                "len(text).")
-            raise ValueError(msg.format(length=len(text), max_length=self.max_length))
+            raise ValueError(Errors.E088.format(length=len(text),
+                                                max_length=self.max_length))
         doc = self.make_doc(text)
         for name, proc in self.pipeline:
             if name in disable:
                 continue
+            if not hasattr(proc, '__call__'):
+                raise ValueError(Errors.E003.format(component=type(proc), name=name))
             doc = proc(doc)
+            if doc is None:
+                raise ValueError(Errors.E005.format(name=name))
         return doc

     def disable_pipes(self, *names):
@@ -407,8 +393,7 @@ class Language(object):
         >>> state = nlp.update(docs, golds, sgd=optimizer)
         """
         if len(docs) != len(golds):
-            raise IndexError("Update expects same number of docs and golds "
-                             "Got: %d, %d" % (len(docs), len(golds)))
+            raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
         if len(docs) == 0:
             return
         if sgd is None:
@@ -757,14 +742,7 @@ class DisabledPipes(list):
         if unexpected:
             # Don't change the pipeline if we're raising an error.
             self.nlp.pipeline = current
-            msg = (
-                "Some current components would be lost when restoring "
-                "previous pipeline state. If you added components after "
-                "calling nlp.disable_pipes(), you should remove them "
-                "explicitly with nlp.remove_pipe() before the pipeline is "
-                "restore. Names of the new components: %s"
-            )
-            raise ValueError(msg % unexpected)
+            raise ValueError(Errors.E008.format(names=unexpected))
         self[:] = []
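Taken together, the Language changes make pipeline misuse fail fast with coded errors: unknown component names raise E001, conflicting placement constraints E006, and a component that returns None is caught at call time with E005 instead of crashing in the next component. A rough sketch (the component and names are made up for illustration):

from spacy.lang.en import English

nlp = English()

def forgetful(doc):
    pass  # bug: never returns the doc

nlp.add_pipe(forgetful, name='forgetful')
try:
    nlp(u'hello world')                    # E005: 'forgetful' returned None
except ValueError as err:
    print(err)
try:
    nlp.add_pipe(forgetful, name='other', first=True, last=True)   # E006
except ValueError as err:
    print(err)
try:
    nlp.remove_pipe('does_not_exist')      # E001, lists the available names
except ValueError as err:
    print(err)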
View File
@@ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
 from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
 from .attrs cimport PROB
 from .attrs import intify_attrs
-from . import about
+from .errors import Errors

 memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
@@ -37,7 +37,8 @@ cdef class Lexeme:
         self.vocab = vocab
         self.orth = orth
         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
-        assert self.c.orth == orth
+        if self.c.orth != orth:
+            raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))

     def __richcmp__(self, other, int op):
         if other is None:
@@ -129,20 +130,25 @@ cdef class Lexeme:
         lex_data = Lexeme.c_to_bytes(self.c)
         start = <const char*>&self.c.flags
         end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
-        assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
+        if (end-start) != sizeof(lex_data.data):
+            raise ValueError(Errors.E072.format(length=end-start,
+                                                bad_length=sizeof(lex_data.data)))
         byte_string = b'\0' * sizeof(lex_data.data)
         byte_chars = <char*>byte_string
         for i in range(sizeof(lex_data.data)):
             byte_chars[i] = lex_data.data[i]
-        assert len(byte_string) == sizeof(lex_data.data), (len(byte_string),
-                                                           sizeof(lex_data.data))
+        if len(byte_string) != sizeof(lex_data.data):
+            raise ValueError(Errors.E072.format(length=len(byte_string),
+                                                bad_length=sizeof(lex_data.data)))
         return byte_string

     def from_bytes(self, bytes byte_string):
         # This method doesn't really have a use-case --- wrote it for testing.
         # Possibly delete? It puts the Lexeme out of synch with the vocab.
         cdef SerializedLexemeC lex_data
-        assert len(byte_string) == sizeof(lex_data.data)
+        if len(byte_string) != sizeof(lex_data.data):
+            raise ValueError(Errors.E072.format(length=len(byte_string),
+                                                bad_length=sizeof(lex_data.data)))
         for i in range(len(byte_string)):
             lex_data.data[i] = byte_string[i]
         Lexeme.c_from_bytes(self.c, lex_data)
@@ -169,16 +175,13 @@ cdef class Lexeme:
         def __get__(self):
             cdef int length = self.vocab.vectors_length
             if length == 0:
-                raise ValueError(
-                    "Word vectors set to length 0. This may be because you "
-                    "don't have a model installed or loaded, or because your "
-                    "model doesn't include word vectors. For more info, see "
-                    "the documentation: \n%s\n" % about.__docs_models__
-                )
+                raise ValueError(Errors.E010)
             return self.vocab.get_vector(self.c.orth)

         def __set__(self, vector):
-            assert len(vector) == self.vocab.vectors_length
+            if len(vector) != self.vocab.vectors_length:
+                raise ValueError(Errors.E073.format(new_length=len(vector),
+                                                    length=self.vocab.vectors_length))
             self.vocab.set_vector(self.c.orth, vector)

     property rank:
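For example, reading Lexeme.vector from a vocab without vectors now raises the short coded E010 instead of the long inline message (a sketch, assuming a blank vocab):

from spacy.vocab import Vocab

vocab = Vocab()
lex = vocab[u'apple']
try:
    lex.vector          # vocab.vectors_length is 0 here -> E010
except ValueError as err:
    print(err)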
View File
@@ -16,6 +16,7 @@ from .typedefs cimport hash_t
 from .structs cimport TokenC
 from .tokens.doc cimport Doc, get_token_attr
 from .vocab cimport Vocab
+from .errors import Errors, TempErrors

 from .attrs import IDS
 from .attrs cimport attr_id_t, ID, NULL_ATTR
@@ -109,7 +110,8 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
     while pattern.nr_attr != 0:
         pattern += 1
     id_attr = pattern[0].attrs[0]
-    assert id_attr.attr == ID
+    if id_attr.attr != ID:
+        raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr))
     return id_attr.value
@@ -161,8 +163,8 @@ def _convert_strings(token_specs, string_store):
             if value in operators:
                 ops = operators[value]
             else:
-                msg = "Unknown operator '%s'. Options: %s"
-                raise KeyError(msg % (value, ', '.join(operators.keys())))
+                keys = ', '.join(operators.keys())
+                raise KeyError(Errors.E011.format(op=value, opts=keys))
         if isinstance(attr, basestring):
             attr = IDS.get(attr.upper())
         if isinstance(value, basestring):
@@ -264,9 +266,7 @@ cdef class Matcher:
         """
         for pattern in patterns:
             if len(pattern) == 0:
-                msg = ("Cannot add pattern for zero tokens to matcher.\n"
-                       "key: {key}\n")
-                raise ValueError(msg.format(key=key))
+                raise ValueError(Errors.E012.format(key=key))
         key = self._normalize_key(key)
         for pattern in patterns:
             specs = _convert_strings(pattern, self.vocab.strings)
@@ -348,13 +348,12 @@ cdef class Matcher:
         for state in partials:
             action = get_action(state.second, token)
             if action == PANIC:
-                raise Exception("Error selecting action in matcher")
+                raise ValueError(Errors.E013)
             while action == ADVANCE_ZERO:
                 state.second += 1
                 action = get_action(state.second, token)
             if action == PANIC:
-                raise Exception("Error selecting action in matcher")
+                raise ValueError(Errors.E013)
             if action == REPEAT:
                 # Leave the state in the queue, and advance to next slot
                 # (i.e. we don't overwrite -- we want to greedily match
@@ -380,7 +379,7 @@ cdef class Matcher:
         for pattern in self.patterns:
             action = get_action(pattern, token)
             if action == PANIC:
-                raise Exception("Error selecting action in matcher")
+                raise ValueError(Errors.E013)
             while action == ADVANCE_ZERO:
                 pattern += 1
                 action = get_action(pattern, token)
@@ -447,7 +446,7 @@ def get_bilou(length):
         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
                 I10_ENT, I10_ENT, L10_ENT]
     else:
-        raise ValueError("Max length currently 10 for phrase matching")
+        raise ValueError(TempErrors.T001)

 cdef class PhraseMatcher:
@@ -506,11 +505,8 @@ cdef class PhraseMatcher:
         cdef Doc doc
         for doc in docs:
             if len(doc) >= self.max_length:
-                msg = (
-                    "Pattern length (%d) >= phrase_matcher.max_length (%d). "
-                    "Length can be set on initialization, up to 10."
-                )
-                raise ValueError(msg % (len(doc), self.max_length))
+                raise ValueError(TempErrors.T002.format(doc_len=len(doc),
+                                                        max_len=self.max_length))
         cdef hash_t ent_id = self.matcher._normalize_key(key)
         self._callbacks[ent_id] = on_match
         cdef int length
@@ -562,7 +558,9 @@ cdef class PhraseMatcher:
             yield doc

     def accept_match(self, Doc doc, int start, int end):
-        assert (end - start) < self.max_length
+        if (end - start) >= self.max_length:
+            raise ValueError(Errors.E075.format(length=end - start,
+                                                max_len=self.max_length))
         cdef int i, j
         for i in range(self.max_length):
             self._phrase_key[i] = 0
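A sketch of the two most user-facing matcher errors, E012 for a zero-token pattern and E011 for an unknown operator (the pattern contents are invented for illustration):

from spacy.matcher import Matcher
from spacy.vocab import Vocab

matcher = Matcher(Vocab())
try:
    matcher.add('EMPTY', None, [])          # zero-token pattern -> E012
except ValueError as err:
    print(err)
try:
    matcher.add('BAD_OP', None, [{'ORTH': u'hi', 'OP': '$'}])   # '$' -> E011
except KeyError as err:
    print(err)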
View File
@@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
 from .parts_of_speech cimport SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
+from .errors import Errors

 def _normalize_props(props):
@@ -93,7 +94,7 @@ cdef class Morphology:
     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
         if tag_id > self.n_tags:
-            raise ValueError("Unknown tag ID: %s" % tag_id)
+            raise ValueError(Errors.E014.format(tag=tag_id))
         # TODO: It's pretty arbitrary to put this logic here. I guess the
         # justification is that this is where the specific word and the tag
         # interact. Still, we should have a better way to enforce this rule, or
@@ -129,7 +130,7 @@ cdef class Morphology:
         tag (unicode): The part-of-speech tag to key the exception.
         orth (unicode): The word-form to key the exception.
         """
         # TODO: Currently we've assumed that we know the number of tags --
         # RichTagC is an array, and _cache is a PreshMapArray
         # This is really bad: it makes the morphology typed to the tagger
         # classes, which is all wrong.
@@ -147,9 +148,7 @@ cdef class Morphology:
         elif force:
             memset(cached, 0, sizeof(cached[0]))
         else:
-            raise ValueError(
-                "Conflicting morphology exception for (%s, %s). Use "
-                "force=True to overwrite." % (tag_str, orth_str))
+            raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
         cached.tag = rich_tag

     # TODO: Refactor this to take arbitrary attributes.
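E015 keeps its escape hatch: force=True still overwrites a conflicting exception. A sketch, under the assumptions that add_special_case takes (tag, orth, attrs, force) and that 'NN' is in the English tag map:

from spacy.lang.en import English

morph = English().vocab.morphology
morph.add_special_case(u'NN', u'dogs', {'lemma': u'dog'})
try:
    morph.add_special_case(u'NN', u'dogs', {'lemma': u'hound'})  # conflict -> E015
except ValueError as err:
    print(err)
morph.add_special_case(u'NN', u'dogs', {'lemma': u'hound'}, force=True)  # OK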
View File
@@ -33,6 +33,7 @@ from .parts_of_speech import X
 from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
 from ._ml import link_vectors_to_models, zero_init, flatten
 from ._ml import create_default_optimizer
+from .errors import Errors, TempErrors
 from . import util
@@ -169,7 +170,7 @@ class Pipe(object):
         problem.
         """
         raise NotImplementedError

     def create_optimizer(self):
         return create_default_optimizer(self.model.ops,
                                         **self.cfg.get('optimizer', {}))
@@ -336,7 +337,8 @@ class Tensorizer(Pipe):
         tensors (object): Vector representation for each token in the docs.
         """
         for doc, tensor in zip(docs, tensors):
-            assert tensor.shape[0] == len(doc)
+            if tensor.shape[0] != len(doc):
+                raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
             doc.tensor = tensor

     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
@@ -550,9 +552,7 @@ class Tagger(Pipe):
             # copy_array(larger.W[:smaller.nO], smaller.W)
             # copy_array(larger.b[:smaller.nO], smaller.b)
             # self.model._layers[-1] = larger
-            raise ValueError(
-                "Resizing pre-trained Tagger models is not "
-                "currently supported.")
+            raise ValueError(TempErrors.T003)
         tag_map = dict(self.vocab.morphology.tag_map)
         if values is None:
             values = {POS: "X"}
@@ -671,8 +671,7 @@ class MultitaskObjective(Tagger):
         elif hasattr(target, '__call__'):
             self.make_label = target
         else:
-            raise ValueError("MultitaskObjective target should be function or "
-                             "one of: dep, tag, ent, dep_tag_offset, ent_tag.")
+            raise ValueError(Errors.E016)
         self.cfg = dict(cfg)
         self.cfg.setdefault('cnn_maxout_pieces', 2)
@@ -723,7 +722,9 @@ class MultitaskObjective(Tagger):
         return tokvecs, scores

     def get_loss(self, docs, golds, scores):
-        assert len(docs) == len(golds)
+        if len(docs) != len(golds):
+            raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs),
+                                                n_golds=len(golds)))
         cdef int idx = 0
         correct = numpy.zeros((scores.shape[0],), dtype='i')
         guesses = scores.argmax(axis=1)
@@ -936,7 +937,7 @@ cdef class DependencyParser(Parser):
     @property
     def postprocesses(self):
         return [nonproj.deprojectivize]

     def add_multitask_objective(self, target):
         labeller = MultitaskObjective(self.vocab, target=target)
         self._multitasks.append(labeller)
@@ -957,7 +958,7 @@ cdef class EntityRecognizer(Parser):
     TransitionSystem = BiluoPushDown
     nr_feature = 6

     def add_multitask_objective(self, target):
         labeller = MultitaskObjective(self.vocab, target=target)
         self._multitasks.append(labeller)
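E016 replaces the inline list of valid MultitaskObjective targets. A sketch, assuming the class can be imported directly from spacy.pipeline (the call pattern mirrors add_multitask_objective above):

from spacy.pipeline import MultitaskObjective
from spacy.vocab import Vocab

try:
    MultitaskObjective(Vocab(), target='nonsense')   # neither callable nor a known name -> E016
except ValueError as err:
    print(err)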
View File
@@ -2,6 +2,7 @@
 from __future__ import division, print_function, unicode_literals

 from .gold import tags_to_entities
+from .errors import Errors

 class PRFScore(object):
@@ -84,7 +85,8 @@ class Scorer(object):
         }

     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
-        assert len(tokens) == len(gold)
+        if len(tokens) != len(gold):
+            raise ValueError(Errors.E078.format(words_doc=len(tokens), words_gold=len(gold)))
         gold_deps = set()
         gold_tags = set()
         gold_ents = set(tags_to_entities([annot[-1]
View File
@@ -13,6 +13,7 @@ from .symbols import IDS as SYMBOLS_BY_STR
 from .symbols import NAMES as SYMBOLS_BY_INT
 from .typedefs cimport hash_t
 from .compat import json_dumps
+from .errors import Errors
 from . import util
@@ -59,7 +60,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
         string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
         string.p[0] = length
         memcpy(&string.p[1], chars, length)
-        assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
         return string
     else:
         i = 0
@@ -69,7 +69,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
         string.p[i] = 255
         string.p[n_length_bytes-1] = length % 255
         memcpy(&string.p[n_length_bytes], chars, length)
-        assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
         return string
@@ -115,7 +114,7 @@ cdef class StringStore:
             self.hits.insert(key)
             utf8str = <Utf8Str*>self._map.get(key)
             if utf8str is NULL:
-                raise KeyError(string_or_id)
+                raise KeyError(Errors.E018.format(hash_value=string_or_id))
             else:
                 return decode_Utf8Str(utf8str)
@@ -136,8 +135,7 @@ cdef class StringStore:
             key = hash_utf8(string, len(string))
             self._intern_utf8(string, len(string))
         else:
-            raise TypeError(
-                "Can only add unicode or bytes. Got type: %s" % type(string))
+            raise TypeError(Errors.E017.format(value_type=type(string)))
         return key

     def __len__(self):
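A sketch of both StringStore errors side by side, E017 for a non-string type and E018 for a hash that was never interned (the hash value is arbitrary):

from spacy.strings import StringStore

store = StringStore()
key = store.add(u'hello')
assert store[key] == u'hello'
try:
    store.add(3.14)        # only unicode or bytes -> E017 (TypeError)
except TypeError as err:
    print(err)
try:
    store[123456789]       # unknown hash -> E018 (KeyError)
except KeyError as err:
    print(err)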
View File
@@ -10,6 +10,7 @@ from thinc.extra.search cimport MaxViolation
 from .transition_system cimport TransitionSystem, Transition
 from ..gold cimport GoldParse
+from ..errors import Errors
 from .stateclass cimport StateC, StateClass
@@ -220,7 +221,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
     p_indices = []
     g_indices = []
     cdef Beam pbeam, gbeam
-    assert len(pbeams) == len(gbeams)
+    if len(pbeams) != len(gbeams):
+        raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
         p_indices.append([])
         g_indices.append([])
@@ -228,7 +230,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
             state = StateClass.borrow(<StateC*>pbeam.at(i))
             if not state.is_final():
                 key = tuple([eg_id] + pbeam.histories[i])
-                assert key not in seen, (key, seen)
+                if key in seen:
+                    raise ValueError(Errors.E080.format(key=key))
                 seen[key] = len(states)
                 p_indices[-1].append(len(states))
                 states.append(state)
@@ -271,7 +274,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
     for i in range(nr_step):
         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
                                  dtype='f'))
-    assert len(histories) == len(losses)
+    if len(histories) != len(losses):
+        raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
     for eg_id, hists in enumerate(histories):
         for loss, hist in zip(losses[eg_id], hists):
             if loss == 0.0 or numpy.isnan(loss):
View File
@@ -15,6 +15,7 @@ from .nonproj import is_nonproj_tree
 from .transition_system cimport move_cost_func_t, label_cost_func_t
 from ..gold cimport GoldParse, GoldParseC
 from ..structs cimport TokenC
+from ..errors import Errors

 DEF NON_MONOTONIC = True
@@ -455,7 +456,7 @@ cdef class ArcEager(TransitionSystem):
             t.do = Break.transition
             t.get_cost = Break.cost
         else:
-            raise Exception(move)
+            raise ValueError(Errors.E019.format(action=move, src='arc_eager'))
         return t

     cdef int initialize_state(self, StateC* st) nogil:
@@ -529,28 +530,11 @@ cdef class ArcEager(TransitionSystem):
         if n_gold < 1:
             # Check projectivity --- leading cause
             if is_nonproj_tree(gold.heads):
-                raise ValueError(
-                    "Could not find a gold-standard action to supervise the "
-                    "dependency parser. Likely cause: the tree is "
-                    "non-projective (i.e. it has crossing arcs -- see "
-                    "spacy/syntax/nonproj.pyx for definitions). The ArcEager "
-                    "transition system only supports projective trees. To "
-                    "learn non-projective representations, transform the data "
-                    "before training and after parsing. Either pass "
-                    "make_projective=True to the GoldParse class, or use "
-                    "spacy.syntax.nonproj.preprocess_training_data.")
+                raise ValueError(Errors.E020)
             else:
-                print(gold.orig_annot)
-                print(gold.words)
-                print(gold.heads)
-                print(gold.labels)
-                print(gold.sent_starts)
-                raise ValueError(
-                    "Could not find a gold-standard action to supervise the"
-                    "dependency parser. The GoldParse was projective. The "
-                    "transition system has %d actions. State at failure: %s"
-                    % (self.n_moves, stcls.print_state(gold.words)))
-        assert n_gold >= 1
+                failure_state = stcls.print_state(gold.words)
+                raise ValueError(Errors.E021.format(n_actions=self.n_moves,
+                                                    state=failure_state))

     def get_beam_annot(self, Beam beam):
         length = (<StateC*>beam.at(0)).length
View File
@@ -10,6 +10,7 @@ from ._state cimport StateC
 from .transition_system cimport Transition
 from .transition_system cimport do_func_t
 from ..gold cimport GoldParseC, GoldParse
+from ..errors import Errors

 cdef enum:
@@ -173,7 +174,7 @@ cdef class BiluoPushDown(TransitionSystem):
             if self.c[i].move == move and self.c[i].label == label:
                 return self.c[i]
         else:
-            raise KeyError(name)
+            raise KeyError(Errors.E022.format(name=name))

     cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
         # TODO: Apparent Cython bug here when we try to use the Transition()
@@ -208,7 +209,7 @@ cdef class BiluoPushDown(TransitionSystem):
             t.do = Out.transition
             t.get_cost = Out.cost
         else:
-            raise Exception(move)
+            raise ValueError(Errors.E019.format(action=move, src='ner'))
         return t

     def add_action(self, int action, label_name):
@@ -230,7 +231,6 @@ cdef class BiluoPushDown(TransitionSystem):
             self._size *= 2
             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
-        assert self.c[self.n_moves].label == label_id
         self.n_moves += 1
         return 1
View File
@@ -34,6 +34,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
 from ..compat import json_dumps, copy_array
 from ..tokens.doc cimport Doc
 from ..gold cimport GoldParse
+from ..errors import Errors, TempErrors
 from .. import util
 from .stateclass cimport StateClass
 from ._state cimport StateC
@@ -242,7 +243,7 @@ cdef class Parser:
     def Model(cls, nr_class, **cfg):
         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
         if depth != 1:
-            raise ValueError("Currently parser depth is hard-coded to 1.")
+            raise ValueError(TempErrors.T004.format(value=depth))
         parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
                                             cfg.get('maxout_pieces', 2))
         token_vector_width = util.env_opt('token_vector_width',
@@ -252,9 +253,9 @@ cdef class Parser:
         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
         if hist_size != 0:
-            raise ValueError("Currently history size is hard-coded to 0")
+            raise ValueError(TempErrors.T005.format(value=hist_size))
         if hist_width != 0:
-            raise ValueError("Currently history width is hard-coded to 0")
+            raise ValueError(TempErrors.T006.format(value=hist_width))
         pretrained_vectors = cfg.get('pretrained_vectors', None)
         tok2vec = Tok2Vec(token_vector_width, embed_size,
                           pretrained_vectors=pretrained_vectors)
@@ -431,7 +432,7 @@ cdef class Parser:
                                      [len(doc) for doc in docs])
         return state_objs, tokvecs

     cdef void _parseC(self, StateC* state,
                       const float* feat_weights, const float* bias,
                       const float* hW, const float* hb,
                       int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
@@ -542,7 +543,9 @@ cdef class Parser:
     def update(self, docs, golds, drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
             return None
-        assert len(docs) == len(golds)
+        if len(docs) != len(golds):
+            raise ValueError(Errors.E077.format(value='update', n_docs=len(docs),
+                                                n_golds=len(golds)))
         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
             return self.update_beam(docs, golds,
                 self.cfg['beam_width'], self.cfg['beam_density'],
@@ -608,7 +611,7 @@ cdef class Parser:
                 break
         self._make_updates(d_tokvecs,
             bp_tokvecs, backprops, sgd, cuda_stream)

     def update_beam(self, docs, golds, width=None, density=None,
                     drop=0., sgd=None, losses=None):
         if not any(self.moves.has_gold(gold) for gold in golds):
@@ -622,7 +625,6 @@ cdef class Parser:
         if losses is not None and self.name not in losses:
             losses[self.name] = 0.
         lengths = [len(d) for d in docs]
-        assert min(lengths) >= 1
         states = self.moves.init_batch(docs)
         for gold in golds:
             self.moves.preprocess_gold(gold)
@@ -851,7 +853,7 @@ cdef class Parser:
     def add_multitask_objective(self, target):
         # Defined in subclasses, to avoid circular import
         raise NotImplementedError

     def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
         '''Setup models for secondary objectives, to benefit from multi-task
         learning. This method is intended to be overridden by subclasses.
@@ -1021,15 +1023,11 @@ def _cleanup(Beam beam):
             del state
             seen.add(addr)
         else:
-            print(i, addr)
-            print(seen)
-            raise Exception
+            raise ValueError(Errors.E023.format(addr=addr, i=i))
         addr = <size_t>beam._states[i].content
         if addr not in seen:
             state = <StateC*>addr
             del state
             seen.add(addr)
         else:
-            print(i, addr)
-            print(seen)
-            raise Exception
+            raise ValueError(Errors.E023.format(addr=addr, i=i))
View File
@@ -10,6 +10,7 @@ from __future__ import unicode_literals
 from copy import copy

 from ..tokens.doc cimport Doc
+from ..errors import Errors

 DELIMITER = '||'
@@ -131,7 +132,10 @@ cpdef deprojectivize(Doc doc):

 def _decorate(heads, proj_heads, labels):
     # uses decoration scheme HEAD from Nivre & Nilsson 2005
-    assert(len(heads) == len(proj_heads) == len(labels))
+    if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
+        raise ValueError(Errors.E082.format(n_heads=len(heads),
+                                            n_proj_heads=len(proj_heads),
+                                            n_labels=len(labels)))
     deco_labels = []
     for tokenid, head in enumerate(heads):
         if head != proj_heads[tokenid]:
View File
@@ -12,6 +12,7 @@ from ..structs cimport TokenC
 from .stateclass cimport StateClass
 from ..typedefs cimport attr_t
 from ..compat import json_dumps
+from ..errors import Errors
 from .. import util
@@ -80,10 +81,7 @@ cdef class TransitionSystem:
                 action.do(state.c, action.label)
                 break
             else:
-                print(gold.words)
-                print(gold.ner)
-                print(history)
-                raise ValueError("Could not find gold move")
+                raise ValueError(Errors.E024)
         return history

     cdef int initialize_state(self, StateC* state) nogil:
@@ -130,17 +128,7 @@ cdef class TransitionSystem:
             else:
                 costs[i] = 9000
         if n_gold <= 0:
-            print(gold.words)
-            print(gold.ner)
-            print([gold.c.ner[i].clas for i in range(gold.length)])
-            print([gold.c.ner[i].move for i in range(gold.length)])
-            print([gold.c.ner[i].label for i in range(gold.length)])
-            print("Self labels",
-                  [self.c[i].label for i in range(self.n_moves)])
-            raise ValueError(
-                "Could not find a gold-standard action to supervise "
-                "the entity recognizer. The transition system has "
-                "%d actions." % (self.n_moves))
+            raise ValueError(Errors.E024)

     def get_class_name(self, int clas):
         act = self.c[clas]
@@ -162,7 +150,6 @@ cdef class TransitionSystem:
             self._size *= 2
             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
-        assert self.c[self.n_moves].label == label_id
         self.n_moves += 1
         return 1
View File
@@ -13,6 +13,7 @@ cimport cython
 from .tokens.doc cimport Doc
 from .strings cimport hash_string
+from .errors import Errors, Warnings, deprecation_warning
 from . import util
@@ -63,11 +64,7 @@ cdef class Tokenizer:
         return (self.__class__, args, None, None)

     cpdef Doc tokens_from_list(self, list strings):
-        util.deprecated(
-            "Tokenizer.from_list is now deprecated. Create a new Doc "
-            "object instead and pass in the strings as the `words` keyword "
-            "argument, for example:\nfrom spacy.tokens import Doc\n"
-            "doc = Doc(nlp.vocab, words=[...])")
+        deprecation_warning(Warnings.W002)
         return Doc(self.vocab, words=strings)

     @cython.boundscheck(False)
@@ -78,8 +75,7 @@ cdef class Tokenizer:
         RETURNS (Doc): A container for linguistic annotations.
         """
         if len(string) >= (2 ** 30):
-            msg = "String is too long: %d characters. Max is 2**30."
-            raise ValueError(msg % len(string))
+            raise ValueError(Errors.E025.format(length=len(string)))
         cdef int length = len(string)
         cdef Doc doc = Doc(self.vocab)
         if length == 0:
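The deprecated Tokenizer.tokens_from_list keeps working but now warns through the shared deprecation_warning helper. A sketch, assuming the helper routes through Python's standard warnings machinery:

import warnings
from spacy.lang.en import English

nlp = English()
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    doc = nlp.tokenizer.tokens_from_list([u'hello', u'world'])
print(doc.text)              # u'hello world '
print(caught[0].message)     # the W002 message, pointing to Doc(words=...)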
View File
@@ -31,7 +31,7 @@ from ..attrs cimport ENT_TYPE, SENT_START
 from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
 from ..util import normalize_slice
 from ..compat import is_config, copy_reg, pickle, basestring_
-from .. import about
+from ..errors import Errors, Warnings, deprecation_warning
 from .. import util
 from .underscore import Underscore
 from ._retokenize import Retokenizer
@@ -41,9 +41,9 @@ DEF PADDING = 5

 cdef int bounds_check(int i, int length, int padding) except -1:
     if (i + padding) < 0:
-        raise IndexError
+        raise IndexError(Errors.E026.format(i=i, length=length))
     if (i - padding) >= length:
-        raise IndexError
+        raise IndexError(Errors.E026.format(i=i, length=length))

 cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
@@ -98,7 +98,8 @@ cdef class Doc:
     def set_extension(cls, name, default=None, method=None,
                       getter=None, setter=None):
         nr_defined = sum(t is not None for t in (default, getter, setter, method))
-        assert nr_defined == 1
+        if nr_defined != 1:
+            raise ValueError(Errors.E083.format(n_args=nr_defined))
         Underscore.doc_extensions[name] = (default, method, getter, setter)

     @classmethod
@@ -155,11 +156,7 @@ cdef class Doc:
         if spaces is None:
             spaces = [True] * len(words)
         elif len(spaces) != len(words):
-            raise ValueError(
-                "Arguments 'words' and 'spaces' should be sequences of "
-                "the same length, or 'spaces' should be left default at "
-                "None. spaces should be a sequence of booleans, with True "
-                "meaning that the word owns a ' ' character following it.")
+            raise ValueError(Errors.E027)
         orths_and_spaces = zip(words, spaces)
         if orths_and_spaces is not None:
             for orth_space in orths_and_spaces:
@@ -167,10 +164,7 @@ cdef class Doc:
                     orth = orth_space
                     has_space = True
                 elif isinstance(orth_space, bytes):
-                    raise ValueError(
-                        "orths_and_spaces expects either List(unicode) or "
-                        "List((unicode, bool)). "
-                        "Got bytes instance: %s" % (str(orth_space)))
+                    raise ValueError(Errors.E028.format(value=orth_space))
                 else:
                     orth, has_space = orth_space
                 # Note that we pass self.mem here --- we have ownership, if LexemeC
@@ -504,11 +498,7 @@ cdef class Doc:
         """
         def __get__(self):
             if not self.is_parsed:
-                raise ValueError(
-                    "noun_chunks requires the dependency parse, which "
-                    "requires a statistical model to be installed and loaded. "
-                    "For more info, see the "
-                    "documentation: \n%s\n" % about.__docs_models__)
+                raise ValueError(Errors.E029)
             # Accumulate the result before beginning to iterate over it. This
             # prevents the tokenisation from being changed out from under us
             # during the iteration. The tricky thing here is that Span accepts
@@ -533,12 +523,7 @@ cdef class Doc:
         """
         def __get__(self):
             if not self.is_sentenced:
-                raise ValueError(
-                    "Sentence boundaries unset. You can add the 'sentencizer' "
-                    "component to the pipeline with: "
-                    "nlp.add_pipe(nlp.create_pipe('sentencizer')) "
-                    "Alternatively, add the dependency parser, or set "
-                    "sentence boundaries by setting doc[i].sent_start")
+                raise ValueError(Errors.E030)
             if 'sents' in self.user_hooks:
                 yield from self.user_hooks['sents'](self)
             else:
@@ -568,7 +553,8 @@ cdef class Doc:
             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
         t.l_edge = self.length
         t.r_edge = self.length
-        assert t.lex.orth != 0
+        if t.lex.orth == 0:
+            raise ValueError(Errors.E031.format(i=self.length))
         t.spacy = has_space
         self.length += 1
         return t.idx + t.lex.length + t.spacy
@@ -684,13 +670,7 @@ cdef class Doc:
     def from_array(self, attrs, array):
         if SENT_START in attrs and HEAD in attrs:
-            raise ValueError(
-                "Conflicting attributes specified in doc.from_array(): "
-                "(HEAD, SENT_START)\n"
-                "The HEAD attribute currently sets sentence boundaries "
-                "implicitly, based on the tree structure. This means the HEAD "
-                "attribute would potentially override the sentence boundaries "
-                "set by SENT_START.")
+            raise ValueError(Errors.E032)
         cdef int i, col
         cdef attr_id_t attr_id
         cdef TokenC* tokens = self.c
@@ -828,7 +808,7 @@ cdef class Doc:
         RETURNS (Doc): Itself.
         """
         if self.length != 0:
-            raise ValueError("Cannot load into non-empty Doc")
+            raise ValueError(Errors.E033.format(length=self.length))
         deserializers = {
             'text': lambda b: None,
             'array_head': lambda b: None,
@@ -916,10 +896,7 @@ cdef class Doc:
         """
         cdef unicode tag, lemma, ent_type
         if len(args) == 3:
-            util.deprecated(
-                "Positional arguments to Doc.merge are deprecated. Instead, "
-                "use the keyword arguments, for example tag=, lemma= or "
-                "ent_type=.")
+            deprecation_warning(Warnings.W003)
             tag, lemma, ent_type = args
             attributes[TAG] = tag
             attributes[LEMMA] = lemma
@@ -933,13 +910,9 @@ cdef class Doc:
             if 'ent_type' in attributes:
                 attributes[ENT_TYPE] = attributes['ent_type']
         elif args:
-            raise ValueError(
-                "Doc.merge received %d non-keyword arguments. Expected either "
-                "3 arguments (deprecated), or 0 (use keyword arguments). "
-                "Arguments supplied:\n%s\n"
-                "Keyword arguments: %s\n" % (len(args), repr(args),
-                                             repr(attributes)))
+            raise ValueError(Errors.E034.format(n_args=len(args),
+                                                args=repr(args),
+                                                kwargs=repr(attributes)))
         # More deprecated attribute handling =/
         if 'label' in attributes:
             attributes['ent_type'] = attributes.pop('label')
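The Doc constructor change is among the most user-visible: mismatched words and spaces now raise the compact E027. A sketch:

from spacy.tokens import Doc
from spacy.vocab import Vocab

vocab = Vocab()
doc = Doc(vocab, words=[u'hello', u'world'], spaces=[True, False])  # OK
try:
    Doc(vocab, words=[u'hello', u'world'], spaces=[True])           # E027
except ValueError as err:
    print(err)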
View File
@@ -16,7 +16,7 @@ from ..util import normalize_slice
 from ..attrs cimport IS_PUNCT, IS_SPACE
 from ..lexeme cimport Lexeme
 from ..compat import is_config
-from .. import about
+from ..errors import Errors, TempErrors
 from .underscore import Underscore
@@ -48,8 +48,7 @@ cdef class Span:
         RETURNS (Span): The newly constructed object.
         """
         if not (0 <= start <= end <= len(doc)):
-            raise IndexError
+            raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
         self.doc = doc
         self.start = start
         self.start_char = self.doc[start].idx if start < self.doc.length else 0
@@ -58,7 +57,8 @@ cdef class Span:
             self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
         else:
             self.end_char = 0
-        assert label in doc.vocab.strings, label
+        if label not in doc.vocab.strings:
+            raise ValueError(Errors.E084.format(label=label))
         self.label = label
         self._vector = vector
         self._vector_norm = vector_norm
@@ -267,11 +267,10 @@ cdef class Span:
                 or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
             start = token_by_start(self.doc.c, self.doc.length, self.start_char)
             if self.start == -1:
-                raise IndexError("Error calculating span: Can't find start")
+                raise IndexError(Errors.E036.format(start=self.start_char))
             end = token_by_end(self.doc.c, self.doc.length, self.end_char)
             if end == -1:
-                raise IndexError("Error calculating span: Can't find end")
+                raise IndexError(Errors.E037.format(end=self.end_char))
             self.start = start
             self.end = end + 1
@@ -293,7 +292,7 @@ cdef class Span:
                 root += root.head
                 n += 1
                 if n >= self.doc.length:
-                    raise RuntimeError
+                    raise RuntimeError(Errors.E038)
         return self.doc[root.l_edge:root.r_edge + 1]

     property has_vector:
@@ -376,11 +375,7 @@ cdef class Span:
         """
         def __get__(self):
             if not self.doc.is_parsed:
-                raise ValueError(
-                    "noun_chunks requires the dependency parse, which "
-                    "requires a statistical model to be installed and loaded. "
-                    "For more info, see the "
-                    "documentation: \n%s\n" % about.__docs_models__)
+                raise ValueError(Errors.E029)
             # Accumulate the result before beginning to iterate over it. This
             # prevents the tokenisation from being changed out from under us
             # during the iteration. The tricky thing here is that Span accepts
@@ -526,9 +521,7 @@ cdef class Span:
             return self.root.ent_id

         def __set__(self, hash_t key):
-            raise NotImplementedError(
-                "Can't yet set ent_id from Span. Vote for this feature on "
-                "the issue tracker: http://github.com/explosion/spaCy/issues")
+            raise NotImplementedError(TempErrors.T007.format(attr='ent_id'))

     property ent_id_:
         """RETURNS (unicode): The (string) entity ID."""
@@ -536,9 +529,7 @@ cdef class Span:
             return self.root.ent_id_

         def __set__(self, hash_t key):
-            raise NotImplementedError(
-                "Can't yet set ent_id_ from Span. Vote for this feature on the "
-                "issue tracker: http://github.com/explosion/spaCy/issues")
+            raise NotImplementedError(TempErrors.T007.format(attr='ent_id_'))

     property orth_:
         """Verbatim text content (identical to Span.text). Exists mostly for
@@ -586,9 +577,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
         token += token.head
         n += 1
         if n >= sent_length:
-            raise RuntimeError(
-                "Array bounds exceeded while searching for root word. This "
-                "likely means the parse tree is in an invalid state. Please "
-                "report this issue here: "
-                "http://github.com/explosion/spaCy/issues")
+            raise RuntimeError(Errors.E039)
     return n
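Span boundary errors now report which indices were out of range. A sketch of E035:

from spacy.tokens import Doc, Span
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=[u'a', u'b', u'c'])
span = Span(doc, 0, 2)     # fine
try:
    Span(doc, 0, 99)       # end > len(doc) -> E035
except IndexError as err:
    print(err)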
View File
@@ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t
 from ..parts_of_speech cimport univ_pos_t
 from .doc cimport Doc
 from ..lexeme cimport Lexeme
+from ..errors import Errors

 cdef class Token:
@@ -17,8 +18,7 @@ cdef class Token:
     @staticmethod
     cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
         if offset < 0 or offset >= doc.length:
-            msg = "Attempt to access token at %d, max length %d"
-            raise IndexError(msg % (offset, doc.length))
+            raise IndexError(Errors.E040.format(i=offset, max_length=doc.length))
         cdef Token self = Token.__new__(Token, vocab, doc, offset)
         return self
View File
@@ -19,8 +19,8 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM
 from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
 from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
 from ..compat import is_config
+from ..errors import Errors
 from .. import util
-from .. import about
 from .underscore import Underscore
@@ -106,7 +106,7 @@ cdef class Token:
         elif op == 5:
             return my >= their
         else:
-            raise ValueError(op)
+            raise ValueError(Errors.E041.format(op=op))

     @property
     def _(self):
@@ -135,8 +135,7 @@ cdef class Token:
         RETURNS (Token): The token at position `self.doc[self.i+i]`.
         """
         if self.i+i < 0 or (self.i+i >= len(self.doc)):
-            msg = "Error accessing doc[%d].nbor(%d), for doc of length %d"
-            raise IndexError(msg % (self.i, i, len(self.doc)))
+            raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
         return self.doc[self.i+i]

     def similarity(self, other):
@@ -352,14 +351,7 @@ cdef class Token:
     property sent_start:
         def __get__(self):
-            # Raising a deprecation warning causes errors for autocomplete
-            #util.deprecated(
-            #    "Token.sent_start is now deprecated. Use Token.is_sent_start "
-            #    "instead, which returns a boolean value or None if the answer "
-            #    "is unknown instead of a misleading 0 for False and 1 for "
-            #    "True. It also fixes a quirk in the old logic that would "
-            #    "always set the property to 0 for the first word of the "
-            #    "document.")
+            # Raising a deprecation warning here causes errors for autocomplete
             # Handle broken backwards compatibility case: doc[0].sent_start
             # was False.
             if self.i == 0:
@@ -384,9 +376,7 @@ cdef class Token:
         def __set__(self, value):
             if self.doc.is_parsed:
-                raise ValueError(
-                    "Refusing to write to token.sent_start if its document "
-                    "is parsed, because this may cause inconsistent state.")
+                raise ValueError(Errors.E043)
             if value is None:
                 self.c.sent_start = 0
             elif value is True:
@@ -394,8 +384,7 @@ cdef class Token:
             elif value is False:
                 self.c.sent_start = -1
             else:
-                raise ValueError("Invalid value for token.sent_start. Must be "
-                                 "one of: None, True, False")
+                raise ValueError(Errors.E044.format(value=value))

     property lefts:
         """The leftward immediate children of the word, in the syntactic
@@ -413,8 +402,7 @@ cdef class Token:
                 nr_iter += 1
                 # This is ugly, but it's a way to guard out infinite loops
                 if nr_iter >= 10000000:
-                    raise RuntimeError("Possibly infinite loop encountered "
-                                       "while looking for token.lefts")
+                    raise RuntimeError(Errors.E045.format(attr='token.lefts'))

     property rights:
         """The rightward immediate children of the word, in the syntactic
@@ -432,8 +420,7 @@ cdef class Token:
                 ptr -= 1
                 nr_iter += 1
                 if nr_iter >= 10000000:
-                    raise RuntimeError("Possibly infinite loop encountered "
-                                       "while looking for token.rights")
+                    raise RuntimeError(Errors.E045.format(attr='token.rights'))
         tokens.reverse()
         for t in tokens:
             yield t
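Token navigation and sentence-boundary errors now carry positions and values. A sketch of E042 and E044:

from spacy.tokens import Doc
from spacy.vocab import Vocab

doc = Doc(Vocab(), words=[u'hello', u'world'])
try:
    doc[1].nbor(5)                  # doc[6] does not exist -> E042
except IndexError as err:
    print(err)
try:
    doc[1].sent_start = 'yes'       # must be None, True or False -> E044
except ValueError as err:
    print(err)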
View File
@@ -3,6 +3,8 @@ from __future__ import unicode_literals

 import functools

+from ..errors import Errors
+

 class Underscore(object):
     doc_extensions = {}
@@ -23,7 +25,7 @@ class Underscore(object):

     def __getattr__(self, name):
         if name not in self._extensions:
-            raise AttributeError(name)
+            raise AttributeError(Errors.E046.format(name=name))
         default, method, getter, setter = self._extensions[name]
         if getter is not None:
             return getter(self._obj)
@@ -34,7 +36,7 @@ class Underscore(object):

     def __setattr__(self, name, value):
         if name not in self._extensions:
-            raise AttributeError(name)
+            raise AttributeError(Errors.E047.format(name=name))
         default, method, getter, setter = self._extensions[name]
         if setter is not None:
             return setter(self._obj, value)
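Each entry in _extensions is a (default, method, getter, setter) tuple, and unknown names now raise through a template instead of the bare attribute name. A trimmed sketch of the lookup half (the method case is omitted, and the error text is a stand-in):

    class UnderscoreSketch(object):
        def __init__(self, obj, extensions):
            object.__setattr__(self, '_obj', obj)
            object.__setattr__(self, '_extensions', extensions)

        def __getattr__(self, name):
            # Only called for names not found normally, i.e. extensions.
            if name not in self._extensions:
                raise AttributeError('unknown extension attribute {}'.format(name))
            default, method, getter, setter = self._extensions[name]
            if getter is not None:
                return getter(self._obj)
            return default

    exts = {'is_greeting': (False, None, lambda obj: obj.startswith('hi'), None)}
    print(UnderscoreSketch('hi there', exts).is_greeting)  # True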

View File

@@ -11,8 +11,6 @@ import sys
 import textwrap
 import random
 from collections import OrderedDict
-import inspect
-import warnings
 from thinc.neural._classes.model import Model
 import functools
 import cytoolz
@@ -22,6 +20,7 @@ import numpy.random
 from .symbols import ORTH
 from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
 from .compat import import_file
+from .errors import Errors

 # Import these directly from Thinc, so that we're sure we always have the
 # same version.
@@ -50,8 +49,7 @@ def get_lang_class(lang):
     try:
         module = importlib.import_module('.lang.%s' % lang, 'spacy')
     except ImportError:
-        msg = "Can't import language %s from spacy.lang."
-        raise ImportError(msg % lang)
+        raise ImportError(Errors.E048.format(lang=lang))
     LANGUAGES[lang] = getattr(module, module.__all__[0])
     return LANGUAGES[lang]
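get_lang_class is a lazy registry: the language module is imported on first use and its first __all__ entry is cached. The same pattern in a standalone sketch; the cache-miss check is implied by the surrounding function, which is only partly shown in this hunk:

    import importlib

    LANGUAGES = {}

    def get_lang_class_sketch(lang):
        # Import spacy.lang.<code> once, then serve the cached class.
        if lang not in LANGUAGES:
            try:
                module = importlib.import_module('.lang.%s' % lang, 'spacy')
            except ImportError:
                raise ImportError("Can't import language %s from "
                                  "spacy.lang (E048)" % lang)
            LANGUAGES[lang] = getattr(module, module.__all__[0])
        return LANGUAGES[lang]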
@@ -108,7 +106,7 @@ def load_model(name, **overrides):
     """
     data_path = get_data_path()
     if not data_path or not data_path.exists():
-        raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
+        raise IOError(Errors.E049.format(path=path2str(data_path)))
     if isinstance(name, basestring_):  # in data dir / shortcut
         if name in set([d.name for d in data_path.iterdir()]):
             return load_model_from_link(name, **overrides)
@@ -118,7 +116,7 @@ def load_model(name, **overrides):
         return load_model_from_path(Path(name), **overrides)
     elif hasattr(name, 'exists'):  # Path or Path-like to model data
         return load_model_from_path(name, **overrides)
-    raise IOError("Can't find model '%s'" % name)
+    raise IOError(Errors.E050.format(name=name))
 def load_model_from_link(name, **overrides):
@@ -127,9 +125,7 @@ def load_model_from_link(name, **overrides):
     try:
         cls = import_file(name, path)
     except AttributeError:
-        raise IOError(
-            "Cant' load '%s'. If you're using a shortcut link, make sure it "
-            "points to a valid package (not just a data directory)." % name)
+        raise IOError(Errors.E051.format(name=name))
     return cls.load(**overrides)
@@ -173,8 +169,7 @@ def load_model_from_init_py(init_file, **overrides):
     data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
     data_path = model_path / data_dir
     if not model_path.exists():
-        msg = "Can't find model directory: %s"
-        raise ValueError(msg % path2str(data_path))
+        raise IOError(Errors.E052.format(path=path2str(data_path)))
     return load_model_from_path(data_path, meta, **overrides)
@@ -186,16 +181,14 @@ def get_model_meta(path):
     """
     model_path = ensure_path(path)
     if not model_path.exists():
-        msg = "Can't find model directory: %s"
-        raise ValueError(msg % path2str(model_path))
+        raise IOError(Errors.E052.format(path=path2str(model_path)))
     meta_path = model_path / 'meta.json'
     if not meta_path.is_file():
-        raise IOError("Could not read meta.json from %s" % meta_path)
+        raise IOError(Errors.E053.format(path=meta_path))
     meta = read_json(meta_path)
     for setting in ['lang', 'name', 'version']:
         if setting not in meta or not meta[setting]:
-            msg = "No valid '%s' setting found in model meta.json"
-            raise ValueError(msg % setting)
+            raise ValueError(Errors.E054.format(setting=setting))
     return meta
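The meta lookup fails in three distinct ways, each with its own code: missing directory (E052), missing meta.json (E053), and a missing or empty required setting (E054). The same sequence as a runnable sketch, with plain messages standing in for the templates:

    import json
    from pathlib import Path

    def get_model_meta_sketch(path):
        model_path = Path(path)
        if not model_path.exists():
            raise IOError("Can't find model directory: {} "
                          "(E052)".format(model_path))
        meta_path = model_path / 'meta.json'
        if not meta_path.is_file():
            raise IOError('Could not read meta.json from {} '
                          '(E053)'.format(meta_path))
        meta = json.loads(meta_path.read_text())
        for setting in ['lang', 'name', 'version']:
            if setting not in meta or not meta[setting]:
                raise ValueError("No valid '{}' setting found in model "
                                 "meta.json (E054)".format(setting))
        return meta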
@@ -339,13 +332,10 @@ def update_exc(base_exceptions, *addition_dicts):
         for orth, token_attrs in additions.items():
             if not all(isinstance(attr[ORTH], unicode_)
                        for attr in token_attrs):
-                msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
-                raise ValueError(msg % (orth, token_attrs))
+                raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
             described_orth = ''.join(attr[ORTH] for attr in token_attrs)
             if orth != described_orth:
-                msg = ("Invalid tokenizer exception: ORTH values combined "
-                       "don't match original string. key='%s', orths='%s'")
-                raise ValueError(msg % (orth, described_orth))
+                raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
         exc.update(additions)
     exc = expand_exc(exc, "'", "")
     return exc
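Both checks enforce one invariant: every sub-token's ORTH must be a string, and the ORTH values must concatenate back to the exception's key. A self-contained version, using a plain string key in place of the ORTH symbol:

    def check_exception(key, token_attrs, ORTH='orth'):
        if not all(isinstance(attr[ORTH], str) for attr in token_attrs):
            raise ValueError("Invalid ORTH value in exception: key='{}', "
                             "orths='{}' (E055)".format(key, token_attrs))
        described_orth = ''.join(attr[ORTH] for attr in token_attrs)
        if key != described_orth:
            raise ValueError("ORTH values combined don't match original "
                             "string: key='{}', orths='{}' "
                             "(E056)".format(key, described_orth))

    check_exception("don't", [{'orth': 'do'}, {'orth': "n't"}])    # passes
    # check_exception("don't", [{'orth': 'do'}, {'orth': 'not'}])  # raises E056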
@@ -375,8 +365,7 @@ def expand_exc(excs, search, replace):

 def normalize_slice(length, start, stop, step=None):
     if not (step is None or step == 1):
-        raise ValueError("Stepped slices not supported in Span objects."
-                         "Try: list(tokens)[start:stop:step] instead.")
+        raise ValueError(Errors.E057)
     if start is None:
         start = 0
     elif start < 0:
@@ -387,7 +376,6 @@ def normalize_slice(length, start, stop, step=None):
     elif stop < 0:
         stop += length
     stop = min(length, max(start, stop))
-    assert 0 <= start <= stop <= length
     return start, stop
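The removed assert was redundant: once start and stop are clamped, 0 <= start <= stop <= length holds by construction. The full function, reconstructed for illustration; the start-clamping and stop-default lines fall between the hunks shown and are assumed:

    def normalize_slice_sketch(length, start, stop, step=None):
        if not (step is None or step == 1):
            raise ValueError('Stepped slices not supported in Span objects. '
                             'Try: list(tokens)[start:stop:step] instead. (E057)')
        if start is None:
            start = 0
        elif start < 0:
            start += length
        start = min(length, max(0, start))   # assumed clamp, elided in the diff
        if stop is None:                     # assumed default, elided in the diff
            stop = length
        elif stop < 0:
            stop += length
        stop = min(length, max(start, stop))
        return start, stop

    assert normalize_slice_sketch(5, -2, None) == (3, 5)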
@@ -524,18 +512,6 @@ def from_disk(path, readers, exclude):
     return path


-def deprecated(message, filter='always'):
-    """Show a deprecation warning.
-
-    message (unicode): The message to display.
-    filter (unicode): Filter value.
-    """
-    stack = inspect.stack()[-1]
-    with warnings.catch_warnings():
-        warnings.simplefilter(filter, DeprecationWarning)
-        warnings.warn_explicit(message, DeprecationWarning, stack[1], stack[2])


 def print_table(data, title=None):
     """Print data in table format.

View File

@@ -14,6 +14,7 @@ from thinc.neural._classes.model import Model

 from .strings cimport StringStore, hash_string
 from .compat import basestring_, path2str
+from .errors import Errors
 from . import util

 from cython.operator cimport dereference as deref
@@ -114,7 +115,7 @@ cdef class Vectors:
         """
         i = self.key2row[key]
         if i is None:
-            raise KeyError(key)
+            raise KeyError(Errors.E058.format(key=key))
         else:
             return self.data[i]
@@ -215,7 +216,8 @@ cdef class Vectors:
         RETURNS: The requested key, keys, row or rows.
         """
         if sum(arg is None for arg in (key, keys, row, rows)) != 3:
-            raise ValueError("One (and only one) keyword arg must be set.")
+            bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows}
+            raise ValueError(Errors.E059.format(kwargs=bad_kwargs))
         xp = get_array_module(self.data)
         if key is not None:
             if isinstance(key, basestring_):
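The condition reads as "exactly one of four kwargs is set": with four arguments, exactly three must be None. The new code also reports the offending values through E059 rather than a bare sentence:

    def check_one_of(key=None, keys=None, row=None, rows=None):
        if sum(arg is None for arg in (key, keys, row, rows)) != 3:
            bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows}
            raise ValueError('One (and only one) keyword arg must be set. '
                             'Got: {} (E059)'.format(bad_kwargs))

    check_one_of(key='cat')           # exactly one set: fine
    # check_one_of(key='cat', row=0)  # raises E059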
@@ -254,9 +256,9 @@ cdef class Vectors:
             row = self.key2row[key]
         elif row is None:
             if self.is_full:
-                raise ValueError("Cannot add new key to vectors -- full")
+                raise ValueError(Errors.E060.format(rows=self.data.shape[0],
+                                                    cols=self.data.shape[1]))
             row = deref(self._unset.begin())
         self.key2row[key] = row
         if vector is not None:
             self.data[row] = vector
@@ -318,7 +320,7 @@ cdef class Vectors:
                     width = int(dims)
                     break
             else:
-                raise IOError("Expected file named e.g. vectors.128.f.bin")
+                raise IOError(Errors.E061.format(filename=path))
         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
                                                              dtype=dtype)
         xp = get_array_module(self.data)
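The loader expects vector files named vectors.<dims>.<dtype>.bin, and a miss now reports the search path through E061. A sketch of the naming-convention check; the real code scans a directory rather than matching a single name:

    import re

    def parse_vectors_filename(filename):
        match = re.match(r'vectors\.(\d+)\.(\w+)\.bin$', filename)
        if match is None:
            raise IOError('Expected file named e.g. vectors.128.f.bin, '
                          'got {} (E061)'.format(filename))
        dims, dtype = match.groups()
        return int(dims), dtype

    assert parse_vectors_filename('vectors.128.f.bin') == (128, 'f')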

View File

@@ -16,6 +16,7 @@ from .attrs cimport PROB, LANG, ORTH, TAG
 from .structs cimport SerializedLexemeC

 from .compat import copy_reg, basestring_
+from .errors import Errors
 from .lemmatizer import Lemmatizer
 from .attrs import intify_attrs
 from .vectors import Vectors
@@ -100,15 +101,9 @@ cdef class Vocab:
                     flag_id = bit
                     break
             else:
-                raise ValueError(
-                    "Cannot find empty bit for new lexical flag. All bits "
-                    "between 0 and 63 are occupied. You can replace one by "
-                    "specifying the flag_id explicitly, e.g. "
-                    "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
+                raise ValueError(Errors.E062)
         elif flag_id >= 64 or flag_id < 1:
-            raise ValueError(
-                "Invalid value for flag_id: %d. Flag IDs must be between "
-                "1 and 63 (inclusive)" % flag_id)
+            raise ValueError(Errors.E063.format(value=flag_id))
         for lex in self:
             lex.set_flag(flag_id, flag_getter(lex.orth_))
         self.lex_attr_getters[flag_id] = flag_getter
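add_flag has two failure modes, now split across E062 and E063: no free bit left when flag_id is auto-assigned, and an out-of-range explicit flag_id. The slot logic in isolation, with plain messages standing in for the templates:

    def pick_flag_id(occupied_bits, flag_id=None):
        # Lexical flags share one 64-bit field; user flags may use bits 1-63.
        if flag_id is None:
            for bit in range(1, 64):
                if bit not in occupied_bits:
                    return bit
            raise ValueError('Cannot find empty bit for new lexical '
                             'flag (E062)')
        if flag_id >= 64 or flag_id < 1:
            raise ValueError('Invalid value for flag_id: {}. Flag IDs must '
                             'be between 1 and 63 (inclusive) '
                             '(E063)'.format(flag_id))
        return flag_id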
@@ -127,8 +122,9 @@ cdef class Vocab:
         cdef size_t addr
         if lex != NULL:
             if lex.orth != self.strings[string]:
-                raise LookupError.mismatched_strings(
-                    lex.orth, self.strings[string], string)
+                raise KeyError(Errors.E064.format(string=lex.orth,
+                                                  orth=self.strings[string],
+                                                  orth_id=string))
             return lex
         else:
             return self._new_lexeme(mem, string)
@@ -171,7 +167,8 @@ cdef class Vocab:
         if not is_oov:
             key = hash_string(string)
             self._add_lex_to_vocab(key, lex)
-        assert lex != NULL, string
+        if lex == NULL:
+            raise ValueError(Errors.E085.format(string=string))
         return lex

     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
@@ -254,7 +251,7 @@ cdef class Vocab:
         width, you have to call this to change the size of the vectors.
         """
         if width is not None and shape is not None:
-            raise ValueError("Only one of width and shape can be specified")
+            raise ValueError(Errors.E065.format(width=width, shape=shape))
         elif shape is not None:
             self.vectors = Vectors(shape=shape)
         else:
@@ -471,7 +468,10 @@ cdef class Vocab:
             if ptr == NULL:
                 continue
             py_str = self.strings[lexeme.orth]
-            assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
+            if self.strings[py_str] != lexeme.orth:
+                raise ValueError(Errors.E086.format(string=py_str,
+                                                    orth_id=lexeme.orth,
+                                                    hash_id=self.strings[py_str]))
             key = hash_string(py_str)
             self._by_hash.set(key, lexeme)
             self._by_orth.set(lexeme.orth, lexeme)
@@ -512,16 +512,3 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,

 copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)


-class LookupError(Exception):
-    @classmethod
-    def mismatched_strings(cls, id_, id_string, original_string):
-        return cls(
-            "Error fetching a Lexeme from the Vocab. When looking up a "
-            "string, the lexeme returned had an orth ID that did not match "
-            "the query string. This means that the cached lexeme structs are "
-            "mismatched to the string encoding table. The mismatched:\n"
-            "Query string: {}\n"
-            "Orth cached: {}\n"
-            "Orth ID: {}".format(repr(original_string), repr(id_string), id_))