💫 New system for error messages and warnings (#2163)

* Add spacy.errors module
* Update deprecation and user warnings
* Replace errors and asserts with new error message system
* Remove redundant asserts
* Fix whitespace
* Add messages for print/util.prints statements
* Fix typo
* Fix typos
* Move CLI messages to spacy.cli._messages
* Add decorator to display error code with message

  An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about
* Update errors for invalid pipeline components
* Improve error for unknown factories
* Add displaCy warnings
* Update formatting consistency
* Move error message to spacy.errors
* Update errors and check if doc returned by component is None
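The decorator described above is easiest to see end to end. The sketch below restates the `add_codes` helper and the `E001` message exactly as they appear in the new `spacy/errors.py` further down in this diff; the call site and the rendered `KeyError` text are illustrative only.

```python
# Sketch of the pattern this commit introduces in spacy/errors.py: add_codes
# wraps the message class so that attribute lookup returns the message
# prefixed with its own code. Only the retrieved string is modified, so the
# raw class attributes and tracebacks stay untouched.
def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")


# Illustrative call site, as in Language.get_pipe below:
#     raise KeyError(Errors.E001.format(name='ner', opts=['tagger', 'parser']))
# which renders as:
#     KeyError: "[E001] No component 'ner' found in pipeline. Available names: ['tagger', 'parser']"
```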
This commit is contained in:
parent abf8b16d71
commit 3141e04822
				|  | @ -4,18 +4,14 @@ from __future__ import unicode_literals | |||
| from .cli.info import info as cli_info | ||||
| from .glossary import explain | ||||
| from .about import __version__ | ||||
| from .errors import Warnings, deprecation_warning | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
| def load(name, **overrides): | ||||
|     depr_path = overrides.get('path') | ||||
|     if depr_path not in (True, False, None): | ||||
|         util.deprecated( | ||||
|             "As of spaCy v2.0, the keyword argument `path=` is deprecated. " | ||||
|             "You can now call spacy.load with the path as its first argument, " | ||||
|             "and the model's meta.json will be used to determine the language " | ||||
|             "to load. For example:\nnlp = spacy.load('{}')".format(depr_path), | ||||
|             'error') | ||||
|         deprecation_warning(Warnings.W001.format(path=depr_path)) | ||||
|     return util.load_model(name, **overrides) | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -23,6 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed | |||
| import thinc.extra.load_nlp | ||||
| 
 | ||||
| from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -340,10 +341,10 @@ def _divide_array(X, size): | |||
| 
 | ||||
| 
 | ||||
| def get_col(idx): | ||||
|     assert idx >= 0, idx | ||||
|     if idx < 0: | ||||
|         raise IndexError(Errors.E066.format(value=idx)) | ||||
| 
 | ||||
|     def forward(X, drop=0.): | ||||
|         assert idx >= 0, idx | ||||
|         if isinstance(X, numpy.ndarray): | ||||
|             ops = NumpyOps() | ||||
|         else: | ||||
|  | @ -351,7 +352,6 @@ def get_col(idx): | |||
|         output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype) | ||||
| 
 | ||||
|         def backward(y, sgd=None): | ||||
|             assert idx >= 0, idx | ||||
|             dX = ops.allocate(X.shape) | ||||
|             dX[:, idx] += y | ||||
|             return dX | ||||
|  |  | |||
|  | @ -11,7 +11,6 @@ __email__ = 'contact@explosion.ai' | |||
| __license__ = 'MIT' | ||||
| __release__ = True | ||||
| 
 | ||||
| __docs_models__ = 'https://spacy.io/usage/models' | ||||
| __download_url__ = 'https://github.com/explosion/spacy-models/releases/download' | ||||
| __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json' | ||||
| __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json' | ||||
|  |  | |||
							
								
								
									
spacy/cli/_messages.py (new file, +73 lines)
							|  | @ -0,0 +1,73 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| class Messages(object): | ||||
|     M001 = ("Download successful but linking failed") | ||||
|     M002 = ("Creating a shortcut link for 'en' didn't work (maybe you " | ||||
|             "don't have admin permissions?), but you can still load the " | ||||
|             "model via its full package name: nlp = spacy.load('{name}')") | ||||
|     M003 = ("Server error ({code}: {desc})") | ||||
|     M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy " | ||||
|             "installation (v{version}), and download it manually. For more " | ||||
|             "details, see the documentation: https://spacy.io/usage/models") | ||||
|     M005 = ("Compatibility error") | ||||
|     M006 = ("No compatible models found for v{version} of spaCy.") | ||||
|     M007 = ("No compatible model found for '{name}' (spaCy v{version}).") | ||||
|     M008 = ("Can't locate model data") | ||||
|     M009 = ("The data should be located in {path}") | ||||
|     M010 = ("Can't find the spaCy data path to create model symlink") | ||||
|     M011 = ("Make sure a directory `/data` exists within your spaCy " | ||||
|             "installation and try again. The data directory should be " | ||||
|             "located here:") | ||||
|     M012 = ("Link '{name}' already exists") | ||||
|     M013 = ("To overwrite an existing link, use the --force flag.") | ||||
|     M014 = ("Can't overwrite symlink '{name}'") | ||||
|     M015 = ("This can happen if your data directory contains a directory or " | ||||
|             "file of the same name.") | ||||
|     M016 = ("Error: Couldn't link model to '{name}'") | ||||
|     M017 = ("Creating a symlink in spacy/data failed. Make sure you have the " | ||||
|             "required permissions and try re-running the command as admin, or " | ||||
|             "use a virtualenv. You can still import the model as a module and " | ||||
|             "call its load() method, or create the symlink manually.") | ||||
|     M018 = ("Linking successful") | ||||
|     M019 = ("You can now load the model via spacy.load('{name}')") | ||||
|     M020 = ("Can't find model meta.json") | ||||
|     M021 = ("Couldn't fetch compatibility table.") | ||||
|     M022 = ("Can't find spaCy v{version} in compatibility table") | ||||
|     M023 = ("Installed models (spaCy v{version})") | ||||
|     M024 = ("No models found in your current environment.") | ||||
|     M025 = ("Use the following commands to update the model packages:") | ||||
|     M026 = ("The following models are not available for spaCy " | ||||
|             "v{version}: {models}") | ||||
|     M027 = ("You may also want to overwrite the incompatible links using the " | ||||
|             "`python -m spacy link` command with `--force`, or remove them " | ||||
|             "from the data directory. Data path: {path}") | ||||
|     M028 = ("Input file not found") | ||||
|     M029 = ("Output directory not found") | ||||
|     M030 = ("Unknown format") | ||||
|     M031 = ("Can't find converter for {converter}") | ||||
|     M032 = ("Generated output file {name}") | ||||
|     M033 = ("Created {n_docs} documents") | ||||
|     M034 = ("Evaluation data not found") | ||||
|     M035 = ("Visualization output directory not found") | ||||
|     M036 = ("Generated {n} parses as HTML") | ||||
|     M037 = ("Can't find words frequencies file") | ||||
|     M038 = ("Successfully compiled vocab") | ||||
|     M039 = ("{entries} entries, {vectors} vectors") | ||||
|     M040 = ("Output directory not found") | ||||
|     M041 = ("Loaded meta.json from file") | ||||
|     M042 = ("Successfully created package '{name}'") | ||||
|     M043 = ("To build the package, run `python setup.py sdist` in this " | ||||
|             "directory.") | ||||
|     M044 = ("Package directory already exists") | ||||
|     M045 = ("Please delete the directory and try again, or use the `--force` " | ||||
|             "flag to overwrite existing directories.") | ||||
|     M046 = ("Generating meta.json") | ||||
|     M047 = ("Enter the package settings for your model. The following " | ||||
|            "information will be read from your model data: pipeline, vectors.") | ||||
|     M048 = ("No '{key}' setting found in meta.json") | ||||
|     M049 = ("This setting is required to build your package.") | ||||
|     M050 = ("Training data not found") | ||||
|     M051 = ("Development data not found") | ||||
|     M052 = ("Not a valid meta.json format") | ||||
|     M053 = ("Expected dict but got: {meta_type}") | ||||
|  | @ -5,6 +5,7 @@ import plac | |||
| from pathlib import Path | ||||
| 
 | ||||
| from .converters import conllu2json, iob2json, conll_ner2json | ||||
| from ._messages import Messages | ||||
| from ..util import prints | ||||
| 
 | ||||
| # Converters are matched by file extension. To add a converter, add a new | ||||
|  | @ -32,14 +33,14 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto | |||
|     input_path = Path(input_file) | ||||
|     output_path = Path(output_dir) | ||||
|     if not input_path.exists(): | ||||
|         prints(input_path, title="Input file not found", exits=1) | ||||
|         prints(input_path, title=Messages.M028, exits=1) | ||||
|     if not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|         prints(output_path, title=Messages.M029, exits=1) | ||||
|     if converter == 'auto': | ||||
|         converter = input_path.suffix[1:] | ||||
|     if converter not in CONVERTERS: | ||||
|             prints("Can't find converter for %s" % converter, | ||||
|                 title="Unknown format", exits=1) | ||||
|             prints(Messages.M031.format(converter=converter), | ||||
|                    title=Messages.M030, exits=1) | ||||
|     func = CONVERTERS[converter] | ||||
|     func(input_path, output_path, | ||||
|          n_sents=n_sents, use_morphology=morphology) | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .._messages import Messages | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
|  | @ -18,8 +19,8 @@ def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False): | |||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
|     prints(Messages.M033.format(n_docs=len(docs)), | ||||
|            title=Messages.M032.format(name=path2str(output_file))) | ||||
| 
 | ||||
| 
 | ||||
| def read_conll_ner(input_path): | ||||
|  |  | |||
|  | @ -1,6 +1,7 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| from .._messages import Messages | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| 
 | ||||
|  | @ -32,8 +33,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False): | |||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
|     prints(Messages.M033.format(n_docs=len(docs)), | ||||
|            title=Messages.M032.format(name=path2str(output_file))) | ||||
| 
 | ||||
| 
 | ||||
| def read_conllx(input_path, use_morphology=False, n=0): | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import unicode_literals | ||||
| from cytoolz import partition_all, concat | ||||
| 
 | ||||
| from .._messages import Messages | ||||
| from ...compat import json_dumps, path2str | ||||
| from ...util import prints | ||||
| from ...gold import iob_to_biluo | ||||
|  | @ -18,8 +19,8 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): | |||
|     output_file = output_path / output_filename | ||||
|     with output_file.open('w', encoding='utf-8') as f: | ||||
|         f.write(json_dumps(docs)) | ||||
|     prints("Created %d documents" % len(docs), | ||||
|            title="Generated output file %s" % path2str(output_file)) | ||||
|     prints(Messages.M033.format(n_docs=len(docs)), | ||||
|            title=Messages.M032.format(name=path2str(output_file))) | ||||
| 
 | ||||
| 
 | ||||
| def read_iob(raw_sents): | ||||
|  |  | |||
|  | @ -8,6 +8,7 @@ import sys | |||
| import ujson | ||||
| 
 | ||||
| from .link import link | ||||
| from ._messages import Messages | ||||
| from ..util import prints, get_package_path | ||||
| from ..compat import url_read, HTTPError | ||||
| from .. import about | ||||
|  | @ -32,9 +33,7 @@ def download(model, direct=False): | |||
|         version = get_version(model_name, compatibility) | ||||
|         dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name, | ||||
|                                                             v=version)) | ||||
|         if dl != 0: | ||||
|             # if download subprocess doesn't return 0, exit with the respective | ||||
|             # exit code before doing anything else | ||||
|         if dl != 0:  # if download subprocess doesn't return 0, exit | ||||
|             sys.exit(dl) | ||||
|         try: | ||||
|             # Get package path here because link uses | ||||
|  | @ -48,22 +47,15 @@ def download(model, direct=False): | |||
|             # Dirty, but since spacy.download and the auto-linking is | ||||
|             # mostly a convenience wrapper, it's best to show a success | ||||
|             # message and loading instructions, even if linking fails. | ||||
|             prints( | ||||
|                 "Creating a shortcut link for 'en' didn't work (maybe " | ||||
|                 "you don't have admin permissions?), but you can still " | ||||
|                 "load the model via its full package name:", | ||||
|                 "nlp = spacy.load('%s')" % model_name, | ||||
|                 title="Download successful but linking failed") | ||||
|             prints(Messages.M002.format(name=model_name), title=Messages.M001) | ||||
| 
 | ||||
| 
 | ||||
| def get_json(url, desc): | ||||
|     try: | ||||
|         data = url_read(url) | ||||
|     except HTTPError as e: | ||||
|         msg = ("Couldn't fetch %s. Please find a model for your spaCy " | ||||
|                "installation (v%s), and download it manually.") | ||||
|         prints(msg % (desc, about.__version__), about.__docs_models__, | ||||
|                title="Server error (%d: %s)" % (e.code, e.reason), exits=1) | ||||
|         prints(Messages.M004.format(desc=desc, version=about.__version__), | ||||
|                title=Messages.M003.format(code=e.code, desc=e.reason), exits=1) | ||||
|     return ujson.loads(data) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -73,17 +65,16 @@ def get_compatibility(): | |||
|     comp_table = get_json(about.__compatibility__, "compatibility table") | ||||
|     comp = comp_table['spacy'] | ||||
|     if version not in comp: | ||||
|         prints("No compatible models found for v%s of spaCy." % version, | ||||
|                title="Compatibility error", exits=1) | ||||
|         prints(Messages.M006.format(version=version), title=Messages.M005, | ||||
|                exits=1) | ||||
|     return comp[version] | ||||
| 
 | ||||
| 
 | ||||
| def get_version(model, comp): | ||||
|     model = model.rsplit('.dev', 1)[0] | ||||
|     if model not in comp: | ||||
|         version = about.__version__ | ||||
|         msg = "No compatible model found for '%s' (spaCy v%s)." | ||||
|         prints(msg % (model, version), title="Compatibility error", exits=1) | ||||
|         prints(Messages.M007.format(name=model, version=about.__version__), | ||||
|                title=Messages.M005, exits=1) | ||||
|     return comp[model][0] | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals, division, print_function | |||
| import plac | ||||
| from timeit import default_timer as timer | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..gold import GoldCorpus | ||||
| from ..util import prints | ||||
| from .. import util | ||||
|  | @ -33,10 +34,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None | |||
|     data_path = util.ensure_path(data_path) | ||||
|     displacy_path = util.ensure_path(displacy_path) | ||||
|     if not data_path.exists(): | ||||
|         prints(data_path, title="Evaluation data not found", exits=1) | ||||
|         prints(data_path, title=Messages.M034, exits=1) | ||||
|     if displacy_path and not displacy_path.exists(): | ||||
|         prints(displacy_path, title="Visualization output directory not found", | ||||
|                exits=1) | ||||
|         prints(displacy_path, title=Messages.M035, exits=1) | ||||
|     corpus = GoldCorpus(data_path, data_path) | ||||
|     nlp = util.load_model(model) | ||||
|     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc)) | ||||
|  | @ -52,8 +52,7 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None | |||
|         render_ents = 'ner' in nlp.meta.get('pipeline', []) | ||||
|         render_parses(docs, displacy_path, model_name=model, | ||||
|                       limit=displacy_limit, deps=render_deps, ents=render_ents) | ||||
|         msg = "Generated %s parses as HTML" % displacy_limit | ||||
|         prints(displacy_path, title=msg) | ||||
|         prints(displacy_path, title=Messages.M036.format(n=displacy_limit)) | ||||
| 
 | ||||
| 
 | ||||
| def render_parses(docs, output_path, model_name='', limit=250, deps=True, | ||||
|  |  | |||
|  | @ -5,9 +5,10 @@ import plac | |||
| import platform | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import path2str | ||||
| from .. import about | ||||
| from .. import util | ||||
| from .. import about | ||||
| 
 | ||||
| 
 | ||||
| @plac.annotations( | ||||
|  | @ -25,7 +26,7 @@ def info(model=None, markdown=False): | |||
|             model_path = util.get_data_path() / model | ||||
|         meta_path = model_path / 'meta.json' | ||||
|         if not meta_path.is_file(): | ||||
|             util.prints(meta_path, title="Can't find model meta.json", exits=1) | ||||
|             util.prints(meta_path, title=Messages.M020, exits=1) | ||||
|         meta = util.read_json(meta_path) | ||||
|         if model_path.resolve() != model_path: | ||||
|             meta['link'] = path2str(model_path) | ||||
|  |  | |||
|  | @ -11,7 +11,9 @@ from preshed.counter import PreshCounter | |||
| import tarfile | ||||
| import gzip | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..vectors import Vectors | ||||
| from ..errors import Warnings, user_warning | ||||
| from ..util import prints, ensure_path, get_lang_class | ||||
| 
 | ||||
| try: | ||||
|  | @ -37,16 +39,13 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc= | |||
|     and word vectors. | ||||
|     """ | ||||
|     if freqs_loc is not None and not freqs_loc.exists(): | ||||
|         prints(freqs_loc, title="Can't find words frequencies file", exits=1) | ||||
|         prints(freqs_loc, title=Messages.M037, exits=1) | ||||
|     clusters_loc = ensure_path(clusters_loc) | ||||
|     vectors_loc = ensure_path(vectors_loc) | ||||
| 
 | ||||
|     probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20) | ||||
|     vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None) | ||||
|     clusters = read_clusters(clusters_loc) if clusters_loc else {} | ||||
| 
 | ||||
|     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors) | ||||
| 
 | ||||
|     if not output_dir.exists(): | ||||
|         output_dir.mkdir() | ||||
|     nlp.to_disk(output_dir) | ||||
|  | @ -69,7 +68,6 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru | |||
|     nlp = lang_class() | ||||
|     for lexeme in nlp.vocab: | ||||
|         lexeme.rank = 0 | ||||
| 
 | ||||
|     lex_added = 0 | ||||
|     for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))): | ||||
|         lexeme = nlp.vocab[word] | ||||
|  | @ -89,15 +87,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru | |||
|             lexeme = nlp.vocab[word] | ||||
|             lexeme.is_oov = False | ||||
|             lex_added += 1 | ||||
| 
 | ||||
|     if len(vectors_data): | ||||
|         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys) | ||||
|     if prune_vectors >= 1: | ||||
|         nlp.vocab.prune_vectors(prune_vectors) | ||||
|     vec_added = len(nlp.vocab.vectors) | ||||
| 
 | ||||
|     prints("{} entries, {} vectors".format(lex_added, vec_added), | ||||
|            title="Sucessfully compiled vocab") | ||||
|     prints(Messages.M039.format(entries=lex_added, vectors=vec_added), | ||||
|            title=Messages.M038) | ||||
|     return nlp | ||||
| 
 | ||||
| 
 | ||||
|  | @ -145,7 +141,7 @@ def read_clusters(clusters_loc): | |||
|     print("Reading clusters...") | ||||
|     clusters = {} | ||||
|     if ftfy is None: | ||||
|         print("Warning: No text fixing. Run pip install ftfy if necessary") | ||||
|         user_warning(Warnings.W004) | ||||
|     with clusters_loc.open() as f: | ||||
|         for line in tqdm(f): | ||||
|             try: | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | |||
| import plac | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import symlink_to, path2str | ||||
| from ..util import prints | ||||
| from .. import util | ||||
|  | @ -24,40 +25,29 @@ def link(origin, link_name, force=False, model_path=None): | |||
|     else: | ||||
|         model_path = Path(origin) if model_path is None else Path(model_path) | ||||
|     if not model_path.exists(): | ||||
|         prints("The data should be located in %s" % path2str(model_path), | ||||
|                title="Can't locate model data", exits=1) | ||||
|         prints(Messages.M009.format(path=path2str(model_path)), | ||||
|                title=Messages.M008, exits=1) | ||||
|     data_path = util.get_data_path() | ||||
|     if not data_path or not data_path.exists(): | ||||
|         spacy_loc = Path(__file__).parent.parent | ||||
|         prints("Make sure a directory `/data` exists within your spaCy " | ||||
|                "installation and try again. The data directory should be " | ||||
|                "located here:", path2str(spacy_loc), exits=1, | ||||
|                title="Can't find the spaCy data path to create model symlink") | ||||
|         prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1) | ||||
|     link_path = util.get_data_path() / link_name | ||||
|     if link_path.is_symlink() and not force: | ||||
|         prints("To overwrite an existing link, use the --force flag.", | ||||
|                title="Link %s already exists" % link_name, exits=1) | ||||
|         prints(Messages.M013, title=Messages.M012.format(name=link_name), | ||||
|                exits=1) | ||||
|     elif link_path.is_symlink():  # does a symlink exist? | ||||
|         # NB: It's important to check for is_symlink here and not for exists, | ||||
|         # because invalid/outdated symlinks would return False otherwise. | ||||
|         link_path.unlink() | ||||
|     elif link_path.exists(): # does it exist otherwise? | ||||
|         # NB: Check this last because valid symlinks also "exist". | ||||
|         prints("This can happen if your data directory contains a directory " | ||||
|                "or file of the same name.", link_path, | ||||
|                title="Can't overwrite symlink %s" % link_name, exits=1) | ||||
|         prints(Messages.M015, link_path, | ||||
|                title=Messages.M014.format(name=link_name), exits=1) | ||||
|     msg = "%s --> %s" % (path2str(model_path), path2str(link_path)) | ||||
|     try: | ||||
|         symlink_to(link_path, model_path) | ||||
|     except: | ||||
|         # This is quite dirty, but just making sure other errors are caught. | ||||
|         prints("Creating a symlink in spacy/data failed. Make sure you have " | ||||
|                "the required permissions and try re-running the command as " | ||||
|                "admin, or use a virtualenv. You can still import the model as " | ||||
|                "a module and call its load() method, or create the symlink " | ||||
|                "manually.", | ||||
|                "%s --> %s" % (path2str(model_path), path2str(link_path)), | ||||
|                title="Error: Couldn't link model to '%s'" % link_name) | ||||
|         prints(Messages.M017, msg, title=Messages.M016.format(name=link_name)) | ||||
|         raise | ||||
|     prints("%s --> %s" % (path2str(model_path), path2str(link_path)), | ||||
|            "You can now load the model via spacy.load('%s')" % link_name, | ||||
|            title="Linking successful") | ||||
|     prints(msg, Messages.M019.format(name=link_name), title=Messages.M018) | ||||
|  |  | |||
|  | @ -5,6 +5,7 @@ import plac | |||
| import shutil | ||||
| from pathlib import Path | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import path2str, json_dumps | ||||
| from ..util import prints | ||||
| from .. import util | ||||
|  | @ -31,17 +32,17 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, | |||
|     output_path = util.ensure_path(output_dir) | ||||
|     meta_path = util.ensure_path(meta_path) | ||||
|     if not input_path or not input_path.exists(): | ||||
|         prints(input_path, title="Model directory not found", exits=1) | ||||
|         prints(input_path, title=Messages.M008, exits=1) | ||||
|     if not output_path or not output_path.exists(): | ||||
|         prints(output_path, title="Output directory not found", exits=1) | ||||
|         prints(output_path, title=Messages.M040, exits=1) | ||||
|     if meta_path and not meta_path.exists(): | ||||
|         prints(meta_path, title="meta.json not found", exits=1) | ||||
|         prints(meta_path, title=Messages.M020, exits=1) | ||||
| 
 | ||||
|     meta_path = meta_path or input_path / 'meta.json' | ||||
|     if meta_path.is_file(): | ||||
|         meta = util.read_json(meta_path) | ||||
|         if not create_meta:  # only print this if user doesn't want to overwrite | ||||
|             prints(meta_path, title="Loaded meta.json from file") | ||||
|             prints(meta_path, title=Messages.M041) | ||||
|         else: | ||||
|             meta = generate_meta(input_dir, meta) | ||||
|     meta = validate_meta(meta, ['lang', 'name', 'version']) | ||||
|  | @ -57,9 +58,8 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False, | |||
|     create_file(main_path / 'setup.py', TEMPLATE_SETUP) | ||||
|     create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST) | ||||
|     create_file(package_path / '__init__.py', TEMPLATE_INIT) | ||||
|     prints(main_path, "To build the package, run `python setup.py sdist` in " | ||||
|            "this directory.", | ||||
|            title="Successfully created package '%s'" % model_name_v) | ||||
|     prints(main_path, Messages.M043, | ||||
|            title=Messages.M042.format(name=model_name_v)) | ||||
| 
 | ||||
| 
 | ||||
| def create_dirs(package_path, force): | ||||
|  | @ -67,10 +67,7 @@ def create_dirs(package_path, force): | |||
|         if force: | ||||
|             shutil.rmtree(path2str(package_path)) | ||||
|         else: | ||||
|             prints(package_path, "Please delete the directory and try again, " | ||||
|                    "or use the --force flag to overwrite existing " | ||||
|                    "directories.", title="Package directory already exists", | ||||
|                    exits=1) | ||||
|             prints(package_path, Messages.M045, title=Messages.M044, exits=1) | ||||
|     Path.mkdir(package_path, parents=True) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -97,9 +94,7 @@ def generate_meta(model_path, existing_meta): | |||
|     meta['vectors'] = {'width': nlp.vocab.vectors_length, | ||||
|                        'vectors': len(nlp.vocab.vectors), | ||||
|                        'keys': nlp.vocab.vectors.n_keys} | ||||
|     prints("Enter the package settings for your model. The following " | ||||
|            "information will be read from your model data: pipeline, vectors.", | ||||
|            title="Generating meta.json") | ||||
|     prints(Messages.M047, title=Messages.M046) | ||||
|     for setting, desc, default in settings: | ||||
|         response = util.get_raw_input(desc, default) | ||||
|         meta[setting] = default if response == '' and default else response | ||||
|  | @ -111,8 +106,7 @@ def generate_meta(model_path, existing_meta): | |||
| def validate_meta(meta, keys): | ||||
|     for key in keys: | ||||
|         if key not in meta or meta[key] == '': | ||||
|             prints("This setting is required to build your package.", | ||||
|                    title='No "%s" setting found in meta.json' % key, exits=1) | ||||
|             prints(Messages.M049, title=Messages.M048.format(key=key), exits=1) | ||||
|     return meta | ||||
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -7,6 +7,7 @@ import tqdm | |||
| from thinc.neural._classes.model import Model | ||||
| from timeit import default_timer as timer | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..attrs import PROB, IS_OOV, CLUSTER, LANG | ||||
| from ..gold import GoldCorpus, minibatch | ||||
| from ..util import prints | ||||
|  | @ -54,15 +55,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0, | |||
|     if not output_path.exists(): | ||||
|         output_path.mkdir() | ||||
|     if not train_path.exists(): | ||||
|         prints(train_path, title="Training data not found", exits=1) | ||||
|         prints(train_path, title=Messages.M050, exits=1) | ||||
|     if dev_path and not dev_path.exists(): | ||||
|         prints(dev_path, title="Development data not found", exits=1) | ||||
|         prints(dev_path, title=Messages.M051, exits=1) | ||||
|     if meta_path is not None and not meta_path.exists(): | ||||
|         prints(meta_path, title="meta.json not found", exits=1) | ||||
|         prints(meta_path, title=Messages.M020, exits=1) | ||||
|     meta = util.read_json(meta_path) if meta_path else {} | ||||
|     if not isinstance(meta, dict): | ||||
|         prints("Expected dict but got: {}".format(type(meta)), | ||||
|                title="Not a valid meta.json format", exits=1) | ||||
|         prints(Messages.M053.format(meta_type=type(meta)), | ||||
|                title=Messages.M052, exits=1) | ||||
|     meta.setdefault('lang', lang) | ||||
|     meta.setdefault('name', 'unnamed') | ||||
| 
 | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ from pathlib import Path | |||
| import sys | ||||
| import ujson | ||||
| 
 | ||||
| from ._messages import Messages | ||||
| from ..compat import path2str, locale_escape, url_read, HTTPError | ||||
| from ..util import prints, get_data_path, read_json | ||||
| from .. import about | ||||
|  | @ -18,14 +19,13 @@ def validate(): | |||
|     try: | ||||
|         data = url_read(about.__compatibility__) | ||||
|     except HTTPError as e: | ||||
|         prints("Couldn't fetch compatibility table.", | ||||
|                title="Server error (%d: %s)" % (e.code, e.reason), exits=1) | ||||
|         title = Messages.M003.format(code=e.code, desc=e.reason) | ||||
|         prints(Messages.M021, title=title, exits=1) | ||||
|     compat = ujson.loads(data)['spacy'] | ||||
|     current_compat = compat.get(about.__version__) | ||||
|     if not current_compat: | ||||
|         prints(about.__compatibility__, exits=1, | ||||
|                title="Can't find spaCy v{} in compatibility table" | ||||
|                .format(about.__version__)) | ||||
|                title=Messages.M022.format(version=about.__version__)) | ||||
|     all_models = set() | ||||
|     for spacy_v, models in dict(compat).items(): | ||||
|         all_models.update(models.keys()) | ||||
|  | @ -42,7 +42,7 @@ def validate(): | |||
|     update_models = [m for m in incompat_models if m in current_compat] | ||||
| 
 | ||||
|     prints(path2str(Path(__file__).parent.parent), | ||||
|            title="Installed models (spaCy v{})".format(about.__version__)) | ||||
|            title=Messages.M023.format(version=about.__version__)) | ||||
|     if model_links or model_pkgs: | ||||
|         print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', '')) | ||||
|         for name, data in model_pkgs.items(): | ||||
|  | @ -50,23 +50,16 @@ def validate(): | |||
|         for name, data in model_links.items(): | ||||
|             print(get_model_row(current_compat, name, data, 'link')) | ||||
|     else: | ||||
|         prints("No models found in your current environment.", exits=0) | ||||
| 
 | ||||
|         prints(Messages.M024, exits=0) | ||||
|     if update_models: | ||||
|         cmd = '    python -m spacy download {}' | ||||
|         print("\n    Use the following commands to update the model packages:") | ||||
|         print("\n    " + Messages.M025) | ||||
|         print('\n'.join([cmd.format(pkg) for pkg in update_models])) | ||||
| 
 | ||||
|     if na_models: | ||||
|         prints("The following models are not available for spaCy v{}: {}" | ||||
|                .format(about.__version__, ', '.join(na_models))) | ||||
| 
 | ||||
|         prints(Messages.M026.format(version=about.__version__, | ||||
|                                     models=', '.join(na_models))) | ||||
|     if incompat_links: | ||||
|         prints("You may also want to overwrite the incompatible links using " | ||||
|                "the `python -m spacy link` command with `--force`, or remove " | ||||
|                "them from the data directory. Data path: {}" | ||||
|                .format(path2str(get_data_path()))) | ||||
| 
 | ||||
|         prints(Messages.M027.format(path=path2str(get_data_path()))) | ||||
|     if incompat_models or incompat_links: | ||||
|         sys.exit(1) | ||||
| 
 | ||||
|  |  | |||
|  | @ -4,6 +4,7 @@ from __future__ import unicode_literals | |||
| from .render import DependencyRenderer, EntityRenderer | ||||
| from ..tokens import Doc | ||||
| from ..compat import b_to_str | ||||
| from ..errors import Errors, Warnings, user_warning | ||||
| from ..util import prints, is_in_jupyter | ||||
| 
 | ||||
| 
 | ||||
|  | @ -27,7 +28,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER, | |||
|     factories = {'dep': (DependencyRenderer, parse_deps), | ||||
|                  'ent': (EntityRenderer, parse_ents)} | ||||
|     if style not in factories: | ||||
|         raise ValueError("Unknown style: %s" % style) | ||||
|         raise ValueError(Errors.E087.format(style=style)) | ||||
|     if isinstance(docs, Doc) or isinstance(docs, dict): | ||||
|         docs = [docs] | ||||
|     renderer, converter = factories[style] | ||||
|  | @ -57,12 +58,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False, | |||
|     render(docs, style=style, page=page, minify=minify, options=options, | ||||
|            manual=manual) | ||||
|     httpd = simple_server.make_server('0.0.0.0', port, app) | ||||
|     prints("Using the '%s' visualizer" % style, | ||||
|            title="Serving on port %d..." % port) | ||||
|     prints("Using the '{}' visualizer".format(style), | ||||
|            title="Serving on port {}...".format(port)) | ||||
|     try: | ||||
|         httpd.serve_forever() | ||||
|     except KeyboardInterrupt: | ||||
|         prints("Shutting down server on port %d." % port) | ||||
|         prints("Shutting down server on port {}.".format(port)) | ||||
|     finally: | ||||
|         httpd.server_close() | ||||
| 
 | ||||
|  | @ -83,6 +84,8 @@ def parse_deps(orig_doc, options={}): | |||
|     RETURNS (dict): Generated dependency parse keyed by words and arcs. | ||||
|     """ | ||||
|     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes()) | ||||
|     if not doc.is_parsed: | ||||
|         user_warning(Warnings.W005) | ||||
|     if options.get('collapse_punct', True): | ||||
|         spans = [] | ||||
|         for word in doc[:-1]: | ||||
|  | @ -120,6 +123,8 @@ def parse_ents(doc, options={}): | |||
|     """ | ||||
|     ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_} | ||||
|             for ent in doc.ents] | ||||
|     if not ents: | ||||
|         user_warning(Warnings.W006) | ||||
|     title = (doc.user_data.get('title', None) | ||||
|              if hasattr(doc, 'user_data') else None) | ||||
|     return {'text': doc.text, 'ents': ents, 'title': title} | ||||
|  |  | |||
							
								
								
									
spacy/errors.py (new file, +297 lines)
							|  | @ -0,0 +1,297 @@ | |||
| # coding: utf8 | ||||
| from __future__ import unicode_literals | ||||
| 
 | ||||
| import os | ||||
| import warnings | ||||
| import inspect | ||||
| 
 | ||||
| 
 | ||||
| def add_codes(err_cls): | ||||
|     """Add error codes to string messages via class attribute names.""" | ||||
|     class ErrorsWithCodes(object): | ||||
|         def __getattribute__(self, code): | ||||
|             msg = getattr(err_cls, code) | ||||
|             return '[{code}] {msg}'.format(code=code, msg=msg) | ||||
|     return ErrorsWithCodes() | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
| class Warnings(object): | ||||
|     W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. " | ||||
|             "You can now call spacy.load with the path as its first argument, " | ||||
|             "and the model's meta.json will be used to determine the language " | ||||
|             "to load. For example:\nnlp = spacy.load('{path}')") | ||||
|     W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object " | ||||
|             "instead and pass in the strings as the `words` keyword argument, " | ||||
|             "for example:\nfrom spacy.tokens import Doc\n" | ||||
|             "doc = Doc(nlp.vocab, words=[...])") | ||||
|     W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use " | ||||
|             "the keyword arguments, for example tag=, lemma= or ent_type=.") | ||||
|     W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing " | ||||
|             "using ftfy.fix_text if necessary.") | ||||
|     W005 = ("Doc object not parsed. This means displaCy won't be able to " | ||||
|             "generate a dependency visualization for it. Make sure the Doc " | ||||
|             "was processed with a model that supports dependency parsing, and " | ||||
|             "not just a language class like `English()`. For more info, see " | ||||
|             "the docs:\nhttps://spacy.io/usage/models") | ||||
|     W006 = ("No entities to visualize found in Doc object. If this is " | ||||
|             "surprising to you, make sure the Doc was processed using a model " | ||||
|             "that supports named entity recognition, and check the `doc.ents` " | ||||
|             "property manually if necessary.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
| class Errors(object): | ||||
|     E001 = ("No component '{name}' found in pipeline. Available names: {opts}") | ||||
|     E002 = ("Can't find factory for '{name}'. This usually happens when spaCy " | ||||
|             "calls `nlp.create_pipe` with a component name that's not built " | ||||
|             "in - for example, when constructing the pipeline from a model's " | ||||
|             "meta.json. If you're using a custom component, you can write to " | ||||
|             "`Language.factories['{name}']` or remove it from the model meta " | ||||
|             "and add it via `nlp.add_pipe` instead.") | ||||
|     E003 = ("Not a valid pipeline component. Expected callable, but " | ||||
|             "got {component} (name: '{name}').") | ||||
|     E004 = ("If you meant to add a built-in component, use `create_pipe`: " | ||||
|             "`nlp.add_pipe(nlp.create_pipe('{component}'))`") | ||||
|     E005 = ("Pipeline component '{name}' returned None. If you're using a " | ||||
|             "custom component, maybe you forgot to return the processed Doc?") | ||||
|     E006 = ("Invalid constraints. You can only set one of the following: " | ||||
|             "before, after, first, last.") | ||||
|     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}") | ||||
|     E008 = ("Some current components would be lost when restoring previous " | ||||
|             "pipeline state. If you added components after calling " | ||||
|             "`nlp.disable_pipes()`, you should remove them explicitly with " | ||||
|             "`nlp.remove_pipe()` before the pipeline is restored. Names of " | ||||
|             "the new components: {names}") | ||||
|     E009 = ("The `update` method expects same number of docs and golds, but " | ||||
|             "got: {n_docs} docs, {n_golds} golds.") | ||||
|     E010 = ("Word vectors set to length 0. This may be because you don't have " | ||||
|             "a model installed or loaded, or because your model doesn't " | ||||
|             "include word vectors. For more info, see the docs:\n" | ||||
|             "https://spacy.io/usage/models") | ||||
|     E011 = ("Unknown operator: '{op}'. Options: {opts}") | ||||
|     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") | ||||
|     E013 = ("Error selecting action in matcher") | ||||
|     E014 = ("Unknown tag ID: {tag}") | ||||
|     E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use " | ||||
|             "`force=True` to overwrite.") | ||||
|     E016 = ("MultitaskObjective target should be function or one of: dep, " | ||||
|             "tag, ent, dep_tag_offset, ent_tag.") | ||||
|     E017 = ("Can only add unicode or bytes. Got type: {value_type}") | ||||
|     E018 = ("Can't retrieve string for hash '{hash_value}'.") | ||||
|     E019 = ("Can't create transition with unknown action ID: {action}. Action " | ||||
|             "IDs are enumerated in spacy/syntax/{src}.pyx.") | ||||
|     E020 = ("Could not find a gold-standard action to supervise the " | ||||
|             "dependency parser. The tree is non-projective (i.e. it has " | ||||
|             "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). " | ||||
|             "The ArcEager transition system only supports projective trees. " | ||||
|             "To learn non-projective representations, transform the data " | ||||
|             "before training and after parsing. Either pass " | ||||
|             "`make_projective=True` to the GoldParse class, or use " | ||||
|             "spacy.syntax.nonproj.preprocess_training_data.") | ||||
|     E021 = ("Could not find a gold-standard action to supervise the " | ||||
|             "dependency parser. The GoldParse was projective. The transition " | ||||
|             "system has {n_actions} actions. State at failure: {state}") | ||||
|     E022 = ("Could not find a transition with the name '{name}' in the NER " | ||||
|             "model.") | ||||
|     E023 = ("Error cleaning up beam: The same state occurred twice at " | ||||
|             "memory address {addr} and position {i}.") | ||||
|     E024 = ("Could not find an optimal move to supervise the parser. Usually, " | ||||
|             "this means the GoldParse was not correct. For example, are all " | ||||
|             "labels added to the model?") | ||||
|     E025 = ("String is too long: {length} characters. Max is 2**30.") | ||||
|     E026 = ("Error accessing token at position {i}: out of bounds in Doc of " | ||||
|             "length {length}.") | ||||
|     E027 = ("Arguments 'words' and 'spaces' should be sequences of the same " | ||||
|             "length, or 'spaces' should be left default at None. spaces " | ||||
|             "should be a sequence of booleans, with True meaning that the " | ||||
|             "word owns a ' ' character following it.") | ||||
|     E028 = ("orths_and_spaces expects either a list of unicode strings or a " | ||||
|             "list of (unicode, bool) tuples. Got bytes instance: {value}") | ||||
|     E029 = ("noun_chunks requires the dependency parse, which requires a " | ||||
|             "statistical model to be installed and loaded. For more info, see " | ||||
|             "the documentation:\nhttps://spacy.io/usage/models") | ||||
|     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " | ||||
|             "component to the pipeline with: " | ||||
|             "nlp.add_pipe(nlp.create_pipe('sentencizer')) " | ||||
|             "Alternatively, add the dependency parser, or set sentence " | ||||
|             "boundaries by setting doc[i].is_sent_start.") | ||||
|     E031 = ("Invalid token: empty string ('') at position {i}.") | ||||
|     E032 = ("Conflicting attributes specified in doc.from_array(): " | ||||
|             "(HEAD, SENT_START). The HEAD attribute currently sets sentence " | ||||
|             "boundaries implicitly, based on the tree structure. This means " | ||||
|             "the HEAD attribute would potentially override the sentence " | ||||
|             "boundaries set by SENT_START.") | ||||
|     E033 = ("Cannot load into non-empty Doc of length {length}.") | ||||
|     E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected " | ||||
|             "either 3 arguments (deprecated), or 0 (use keyword arguments).\n" | ||||
|             "Arguments supplied:\n{args}\nKeyword arguments:{kwargs}") | ||||
|     E035 = ("Error creating span with start {start} and end {end} for Doc of " | ||||
|             "length {length}.") | ||||
|     E036 = ("Error calculating span: Can't find a token starting at character " | ||||
|             "offset {start}.") | ||||
|     E037 = ("Error calculating span: Can't find a token ending at character " | ||||
|             "offset {end}.") | ||||
|     E038 = ("Error finding sentence for span. Infinite loop detected.") | ||||
|     E039 = ("Array bounds exceeded while searching for root word. This likely " | ||||
|             "means the parse tree is in an invalid state. Please report this " | ||||
|             "issue here: http://github.com/explosion/spaCy/issues") | ||||
|     E040 = ("Attempt to access token at {i}, max length {max_length}.") | ||||
|     E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?") | ||||
|     E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.") | ||||
|     E043 = ("Refusing to write to token.sent_start if its document is parsed, " | ||||
|             "because this may cause inconsistent state.") | ||||
|     E044 = ("Invalid value for token.sent_start: {value}. Must be one of: " | ||||
|             "None, True, False") | ||||
|     E045 = ("Possibly infinite loop encountered while looking for {attr}.") | ||||
|     E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did " | ||||
|             "you forget to call the `set_extension` method?") | ||||
|     E047 = ("Can't assign a value to unregistered extension attribute " | ||||
|             "'{name}'. Did you forget to call the `set_extension` method?") | ||||
|     E048 = ("Can't import language {lang} from spacy.lang.") | ||||
|     E049 = ("Can't find spaCy data directory: '{path}'. Check your " | ||||
|             "installation and permissions, or use spacy.util.set_data_path " | ||||
|             "to customise the location if necessary.") | ||||
|     E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut " | ||||
|             "link, a Python package or a valid path to a data directory.") | ||||
|     E051 = ("Can't load '{name}'. If you're using a shortcut link, make sure " | ||||
|             "it points to a valid package (not just a data directory).") | ||||
|     E052 = ("Can't find model directory: {path}") | ||||
|     E053 = ("Could not read meta.json from {path}") | ||||
|     E054 = ("No valid '{setting}' setting found in model meta.json.") | ||||
|     E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}") | ||||
|     E056 = ("Invalid tokenizer exception: ORTH values combined don't match " | ||||
|             "original string.\nKey: {key}\nOrths: {orths}") | ||||
|     E057 = ("Stepped slices not supported in Span objects. Try: " | ||||
|             "list(tokens)[start:stop:step] instead.") | ||||
|     E058 = ("Could not retrieve vector for key {key}.") | ||||
|     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}") | ||||
|     E060 = ("Cannot add new key to vectors: the table is full. Current shape: " | ||||
|             "({rows}, {cols}).") | ||||
|     E061 = ("Bad file name: {filename}. Example of a valid file name: " | ||||
|             "'vectors.128.f.bin'") | ||||
|     E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 " | ||||
|             "and 63 are occupied. You can replace one by specifying the " | ||||
|             "`flag_id` explicitly, e.g. " | ||||
|             "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA)`.") | ||||
|     E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 " | ||||
|             "and 63 (inclusive).") | ||||
|     E064 = ("Error fetching a Lexeme from the Vocab. When looking up a " | ||||
|             "string, the lexeme returned had an orth ID that did not match " | ||||
|             "the query string. This means that the cached lexeme structs are " | ||||
|             "mismatched to the string encoding table. The mismatched:\n" | ||||
|             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}") | ||||
|     E065 = ("Only one of the vector table's width and shape can be specified. " | ||||
|             "Got width {width} and shape {shape}.") | ||||
|     E066 = ("Error creating model helper for extracting columns. Can only " | ||||
|             "extract columns by positive integer. Got: {value}.") | ||||
|     E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside " | ||||
|             "an entity) without a preceding 'B' (beginning of an entity). " | ||||
|             "Tag sequence:\n{tags}") | ||||
|     E068 = ("Invalid BILUO tag: '{tag}'.") | ||||
|     E069 = ("Invalid gold-standard parse tree. Found cycle between word " | ||||
|             "IDs: {cycle}") | ||||
|     E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) " | ||||
|             "does not align with number of annotations ({n_annots}).") | ||||
|     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " | ||||
|             "match the one in the vocab ({vocab_orth}).") | ||||
|     E072 = ("Error serializing lexeme: expected data length {length}, " | ||||
|             "got {bad_length}.") | ||||
|     E073 = ("Cannot assign vector of length {new_length}. Existing vectors " | ||||
|             "are of length {length}. You can use `vocab.reset_vectors` to " | ||||
|             "clear the existing vectors and resize the table.") | ||||
|     E074 = ("Error interpreting compiled match pattern: patterns are expected " | ||||
|             "to end with the attribute {attr}. Got: {bad_attr}.") | ||||
|     E075 = ("Error accepting match: length ({length}) > maximum length " | ||||
|             "({max_len}).") | ||||
|     E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc " | ||||
|             "has {words} words.") | ||||
|     E077 = ("Error computing {value}: number of Docs ({n_docs}) does not " | ||||
|             "equal number of GoldParse objects ({n_golds}) in batch.") | ||||
|     E078 = ("Error computing score: number of words in Doc ({words_doc}) does " | ||||
|             "not equal number of words in GoldParse ({words_gold}).") | ||||
|     E079 = ("Error computing states in beam: number of predicted beams " | ||||
|             "({pbeams}) does not equal number of gold beams ({gbeams}).") | ||||
|     E080 = ("Duplicate state found in beam: {key}.") | ||||
|     E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " | ||||
|             "does not equal number of losses ({losses}).") | ||||
|     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " | ||||
|             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " | ||||
|             "match.") | ||||
|     E083 = ("Error setting extension: only one of default, getter, setter and " | ||||
|             "method is allowed. {n_args} keyword arguments were specified.") | ||||
|     E084 = ("Error assigning label ID {label} to span: not in StringStore.") | ||||
|     E085 = ("Can't create lexeme for string '{string}'.") | ||||
|     E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does " | ||||
|             "not match hash {hash_id} in StringStore.") | ||||
|     E087 = ("Unknown displaCy style: {style}.") | ||||
|     E088 = ("Text of length {length} exceeds maximum of {max_length}. The " | ||||
|             "v2.x parser and NER models require roughly 1GB of temporary " | ||||
|             "memory per 100,000 characters in the input. This means long " | ||||
|             "texts may cause memory allocation errors. If you're not using " | ||||
|             "the parser or NER, it's probably safe to increase the " | ||||
|             "`nlp.max_length` limit. The limit is in number of characters, so " | ||||
|             "you can check whether your inputs are too long by checking " | ||||
|             "`len(text)`.") | ||||
| 
 | ||||
| 
 | ||||
| @add_codes | ||||
| class TempErrors(object): | ||||
|     T001 = ("Max length currently 10 for phrase matching") | ||||
|     T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length " | ||||
|             "({max_len}). Length can be set on initialization, up to 10.") | ||||
|     T003 = ("Resizing pre-trained Tagger models is not currently supported.") | ||||
|     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.") | ||||
|     T005 = ("Currently history size is hard-coded to 0. Received: {value}.") | ||||
|     T006 = ("Currently history width is hard-coded to 0. Received: {value}.") | ||||
|     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the " | ||||
|             "issue tracker: http://github.com/explosion/spaCy/issues") | ||||
| 
 | ||||
| 
 | ||||
| class ModelsWarning(UserWarning): | ||||
|     pass | ||||
| 
 | ||||
| 
 | ||||
| WARNINGS = { | ||||
|     'user': UserWarning, | ||||
|     'deprecation': DeprecationWarning, | ||||
|     'models': ModelsWarning, | ||||
| } | ||||
| 
 | ||||
| 
 | ||||
| def _get_warn_types(arg): | ||||
|     if arg == '':  # don't show any warnings | ||||
|         return [] | ||||
|     if not arg or arg == 'all':  # show all available warnings | ||||
|         return WARNINGS.keys() | ||||
|     return [w_type.strip() for w_type in arg.split(',') | ||||
|             if w_type.strip() in WARNINGS] | ||||
| 
 | ||||
| 
 | ||||
| SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always') | ||||
| SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES')) | ||||
| 
 | ||||
| 
 | ||||
| def user_warning(message): | ||||
|     _warn(message, 'user') | ||||
| 
 | ||||
| 
 | ||||
| def deprecation_warning(message): | ||||
|     _warn(message, 'deprecation') | ||||
| 
 | ||||
| 
 | ||||
| def models_warning(message): | ||||
|     _warn(message, 'models') | ||||
| 
 | ||||
| 
 | ||||
| def _warn(message, warn_type='user'): | ||||
|     """ | ||||
|     message (unicode): The message to display. | ||||
|     category (Warning): The Warning to show. | ||||
|     """ | ||||
|     if warn_type in SPACY_WARNING_TYPES: | ||||
|         category = WARNINGS[warn_type] | ||||
|         stack = inspect.stack()[-1] | ||||
|         with warnings.catch_warnings(): | ||||
|             warnings.simplefilter(SPACY_WARNING_FILTER, category) | ||||
|             warnings.warn_explicit(message, category, stack[1], stack[2]) | ||||
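Both environment variables are read when spacy.errors is imported, so they are normally set in the shell that launches the process. A minimal sketch of setting them from Python before importing spaCy; the values shown are just examples.

import os

os.environ['SPACY_WARNING_TYPES'] = 'user,deprecation'   # omit 'models' to silence those warnings
os.environ['SPACY_WARNING_FILTER'] = 'once'              # any warnings-module filter action works
import spacy  # noqa: E402 -- imported only after the environment is configured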
|  | @ -10,6 +10,7 @@ import itertools | |||
| 
 | ||||
| from .syntax import nonproj | ||||
| from .tokens import Doc | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| from .util import minibatch | ||||
| 
 | ||||
|  | @ -28,7 +29,8 @@ def tags_to_entities(tags): | |||
|         elif tag == '-': | ||||
|             continue | ||||
|         elif tag.startswith('I'): | ||||
|             assert start is not None, tags[:i] | ||||
|             if start is None: | ||||
|                 raise ValueError(Errors.E067.format(tags=tags[:i])) | ||||
|             continue | ||||
|         if tag.startswith('U'): | ||||
|             entities.append((tag[2:], i, i)) | ||||
|  | @ -38,7 +40,7 @@ def tags_to_entities(tags): | |||
|             entities.append((tag[2:], start, i)) | ||||
|             start = None | ||||
|         else: | ||||
|             raise Exception(tag) | ||||
|             raise ValueError(Errors.E068.format(tag=tag)) | ||||
|     return entities | ||||
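A hedged illustration of what tags_to_entities() returns for a well-formed BILOU sequence; the B- branch that sets `start` sits outside this hunk, so the expected output assumes it behaves as in the rest of the function.

from spacy.gold import tags_to_entities

tags = ['U-PERSON', '-', 'B-ORG', 'L-ORG']
print(tags_to_entities(tags))
# expected: [('PERSON', 0, 0), ('ORG', 2, 3)]   # (label, start, end), ends inclusive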
| 
 | ||||
| 
 | ||||
|  | @ -238,7 +240,9 @@ class GoldCorpus(object): | |||
| 
 | ||||
|     @classmethod | ||||
|     def _make_golds(cls, docs, paragraph_tuples): | ||||
|         assert len(docs) == len(paragraph_tuples) | ||||
|         if len(docs) != len(paragraph_tuples): | ||||
|             raise ValueError(Errors.E070.format(n_docs=len(docs), | ||||
|                                                 n_annots=len(paragraph_tuples))) | ||||
|         if len(docs) == 1: | ||||
|             return [GoldParse.from_annot_tuples(docs[0], | ||||
|                                                 paragraph_tuples[0][0])] | ||||
|  | @ -461,7 +465,7 @@ cdef class GoldParse: | |||
| 
 | ||||
|         cycle = nonproj.contains_cycle(self.heads) | ||||
|         if cycle is not None: | ||||
|             raise Exception("Cycle found: %s" % cycle) | ||||
|             raise ValueError(Errors.E069.format(cycle=cycle)) | ||||
| 
 | ||||
|         if make_projective: | ||||
|             proj_heads, _ = nonproj.projectivize(self.heads, self.labels) | ||||
|  |  | |||
|  | @ -28,6 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES | |||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||
| from .lang.tag_map import TAG_MAP | ||||
| from .lang.lex_attrs import LEX_ATTRS, is_stop | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| from . import about | ||||
| 
 | ||||
|  | @ -217,8 +218,7 @@ class Language(object): | |||
|         for pipe_name, component in self.pipeline: | ||||
|             if pipe_name == name: | ||||
|                 return component | ||||
|         msg = "No component '{}' found in pipeline. Available names: {}" | ||||
|         raise KeyError(msg.format(name, self.pipe_names)) | ||||
|         raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||
| 
 | ||||
|     def create_pipe(self, name, config=dict()): | ||||
|         """Create a pipeline component from a factory. | ||||
|  | @ -228,7 +228,7 @@ class Language(object): | |||
|         RETURNS (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.factories: | ||||
|             raise KeyError("Can't find factory for '{}'.".format(name)) | ||||
|             raise KeyError(Errors.E002.format(name=name)) | ||||
|         factory = self.factories[name] | ||||
|         return factory(self, **config) | ||||
| 
 | ||||
|  | @ -253,12 +253,9 @@ class Language(object): | |||
|             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||
|         """ | ||||
|         if not hasattr(component, '__call__'): | ||||
|             msg = ("Not a valid pipeline component. Expected callable, but " | ||||
|                    "got {}. ".format(repr(component))) | ||||
|             msg = Errors.E003.format(component=repr(component), name=name) | ||||
|             if isinstance(component, basestring_) and component in self.factories: | ||||
|                 msg += ("If you meant to add a built-in component, use " | ||||
|                         "create_pipe: nlp.add_pipe(nlp.create_pipe('{}'))" | ||||
|                         .format(component)) | ||||
|                 msg += Errors.E004.format(component=component) | ||||
|             raise ValueError(msg) | ||||
|         if name is None: | ||||
|             if hasattr(component, 'name'): | ||||
|  | @ -271,11 +268,9 @@ class Language(object): | |||
|             else: | ||||
|                 name = repr(component) | ||||
|         if name in self.pipe_names: | ||||
|             raise ValueError("'{}' already exists in pipeline.".format(name)) | ||||
|             raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) | ||||
|         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||
|             msg = ("Invalid constraints. You can only set one of the " | ||||
|                    "following: before, after, first, last.") | ||||
|             raise ValueError(msg) | ||||
|             raise ValueError(Errors.E006) | ||||
|         pipe = (name, component) | ||||
|         if last or not any([first, before, after]): | ||||
|             self.pipeline.append(pipe) | ||||
|  | @ -286,9 +281,8 @@ class Language(object): | |||
|         elif after and after in self.pipe_names: | ||||
|             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) | ||||
|         else: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             unfound = before or after | ||||
|             raise ValueError(msg.format(unfound, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=before or after, | ||||
|                                                 opts=self.pipe_names)) | ||||
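A hedged usage sketch of the placement constraints validated above; the model name and the component are illustrative and not defined in this commit.

import spacy

nlp = spacy.load('en_core_web_sm')            # assumed model name

def my_component(doc):                        # illustrative pipeline component
    return doc

nlp.add_pipe(my_component, name='custom', before='parser')   # one constraint: fine
# nlp.add_pipe(my_component, first=True, last=True)          # two constraints -> ValueError(E006)
# nlp.add_pipe(my_component, after='no_such_pipe')           # unknown anchor -> ValueError(E001)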
| 
 | ||||
|     def has_pipe(self, name): | ||||
|         """Check if a component name is present in the pipeline. Equivalent to | ||||
|  | @ -306,8 +300,7 @@ class Language(object): | |||
|         component (callable): Pipeline component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||
|         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||
| 
 | ||||
|     def rename_pipe(self, old_name, new_name): | ||||
|  | @ -317,11 +310,9 @@ class Language(object): | |||
|         new_name (unicode): New name of the component. | ||||
|         """ | ||||
|         if old_name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(old_name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names)) | ||||
|         if new_name in self.pipe_names: | ||||
|             msg = "'{}' already exists in pipeline. Existing names: {}" | ||||
|             raise ValueError(msg.format(new_name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names)) | ||||
|         i = self.pipe_names.index(old_name) | ||||
|         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||
| 
 | ||||
|  | @ -332,8 +323,7 @@ class Language(object): | |||
|         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||
|         """ | ||||
|         if name not in self.pipe_names: | ||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" | ||||
|             raise ValueError(msg.format(name, self.pipe_names)) | ||||
|             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||
|         return self.pipeline.pop(self.pipe_names.index(name)) | ||||
| 
 | ||||
|     def __call__(self, text, disable=[]): | ||||
|  | @ -351,21 +341,17 @@ class Language(object): | |||
|             ('An', 'NN') | ||||
|         """ | ||||
|         if len(text) >= self.max_length: | ||||
|             msg = ( | ||||
|                 "Text of length {length} exceeds maximum of {max_length}. " | ||||
|                 "The v2 parser and NER models require roughly 1GB of temporary " | ||||
|                 "memory per 100,000 characters in the input. This means long " | ||||
|                 "texts may cause memory allocation errors. If you're not using " | ||||
|                 "the parser or NER, it's probably safe to increase the " | ||||
|                 "nlp.max_length limit. The limit is in number of characters, " | ||||
|                 "so you can check whether your inputs are too long by checking " | ||||
|                 "len(text).") | ||||
|             raise ValueError(msg.format(length=len(text), max_length=self.max_length)) | ||||
|             raise ValueError(Errors.E088.format(length=len(text), | ||||
|                                                 max_length=self.max_length)) | ||||
|         doc = self.make_doc(text) | ||||
|         for name, proc in self.pipeline: | ||||
|             if name in disable: | ||||
|                 continue | ||||
|             if not hasattr(proc, '__call__'): | ||||
|                 raise ValueError(Errors.E003.format(component=type(proc), name=name)) | ||||
|             doc = proc(doc) | ||||
|             if doc is None: | ||||
|                 raise ValueError(Errors.E005.format(name=name)) | ||||
|         return doc | ||||
| 
 | ||||
|     def disable_pipes(self, *names): | ||||
|  | @ -407,8 +393,7 @@ class Language(object): | |||
|             >>>            state = nlp.update(docs, golds, sgd=optimizer) | ||||
|         """ | ||||
|         if len(docs) != len(golds): | ||||
|             raise IndexError("Update expects same number of docs and golds " | ||||
|                              "Got: %d, %d" % (len(docs), len(golds))) | ||||
|             raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds))) | ||||
|         if len(docs) == 0: | ||||
|             return | ||||
|         if sgd is None: | ||||
|  | @ -757,14 +742,7 @@ class DisabledPipes(list): | |||
|         if unexpected: | ||||
|             # Don't change the pipeline if we're raising an error. | ||||
|             self.nlp.pipeline = current | ||||
|             msg = ( | ||||
|                 "Some current components would be lost when restoring " | ||||
|                 "previous pipeline state. If you added components after " | ||||
|                 "calling nlp.disable_pipes(), you should remove them " | ||||
|                 "explicitly with nlp.remove_pipe() before the pipeline is " | ||||
|                 "restore. Names of the new components: %s" | ||||
|             ) | ||||
|             raise ValueError(msg % unexpected) | ||||
|             raise ValueError(Errors.E008.format(names=unexpected)) | ||||
|         self[:] = [] | ||||
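A hedged usage sketch of the restore path above; nlp.disable_pipes() returns a DisabledPipes instance that restores the pipeline when the block exits, and the model name is an assumption.

import spacy

nlp = spacy.load('en_core_web_sm')            # assumed model name
with nlp.disable_pipes('tagger', 'parser'):
    doc = nlp(u'Only the remaining components run here.')
# The disabled pipes are restored on exit. Adding a new component inside the
# block and leaving it in place is what triggers Errors.E008 during restore.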
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | |||
| from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV | ||||
| from .attrs cimport PROB | ||||
| from .attrs import intify_attrs | ||||
| from . import about | ||||
| from .errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||
|  | @ -37,7 +37,8 @@ cdef class Lexeme: | |||
|         self.vocab = vocab | ||||
|         self.orth = orth | ||||
|         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth) | ||||
|         assert self.c.orth == orth | ||||
|         if self.c.orth != orth: | ||||
|             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth)) | ||||
| 
 | ||||
|     def __richcmp__(self, other, int op): | ||||
|         if other is None: | ||||
|  | @ -129,20 +130,25 @@ cdef class Lexeme: | |||
|         lex_data = Lexeme.c_to_bytes(self.c) | ||||
|         start = <const char*>&self.c.flags | ||||
|         end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment) | ||||
|         assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) | ||||
|         if (end-start) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=end-start, | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         byte_string = b'\0' * sizeof(lex_data.data) | ||||
|         byte_chars = <char*>byte_string | ||||
|         for i in range(sizeof(lex_data.data)): | ||||
|             byte_chars[i] = lex_data.data[i] | ||||
|         assert len(byte_string) == sizeof(lex_data.data), (len(byte_string), | ||||
|                 sizeof(lex_data.data)) | ||||
|         if len(byte_string) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=len(byte_string), | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         return byte_string | ||||
| 
 | ||||
|     def from_bytes(self, bytes byte_string): | ||||
|         # This method doesn't really have a use-case --- wrote it for testing. | ||||
|         # Possibly delete? It puts the Lexeme out of synch with the vocab. | ||||
|         cdef SerializedLexemeC lex_data | ||||
|         assert len(byte_string) == sizeof(lex_data.data) | ||||
|         if len(byte_string) != sizeof(lex_data.data): | ||||
|             raise ValueError(Errors.E072.format(length=len(byte_string), | ||||
|                                                 bad_length=sizeof(lex_data.data))) | ||||
|         for i in range(len(byte_string)): | ||||
|             lex_data.data[i] = byte_string[i] | ||||
|         Lexeme.c_from_bytes(self.c, lex_data) | ||||
|  | @ -169,16 +175,13 @@ cdef class Lexeme: | |||
|         def __get__(self): | ||||
|             cdef int length = self.vocab.vectors_length | ||||
|             if length == 0: | ||||
|                 raise ValueError( | ||||
|                     "Word vectors set to length 0. This may be because you " | ||||
|                     "don't have a model installed or loaded, or because your " | ||||
|                     "model doesn't include word vectors. For more info, see " | ||||
|                     "the documentation: \n%s\n" % about.__docs_models__ | ||||
|                 ) | ||||
|                 raise ValueError(Errors.E010) | ||||
|             return self.vocab.get_vector(self.c.orth) | ||||
| 
 | ||||
|         def __set__(self, vector): | ||||
|             assert len(vector) == self.vocab.vectors_length | ||||
|             if len(vector) != self.vocab.vectors_length: | ||||
|                 raise ValueError(Errors.E073.format(new_length=len(vector), | ||||
|                                                     length=self.vocab.vectors_length)) | ||||
|             self.vocab.set_vector(self.c.orth, vector) | ||||
| 
 | ||||
|     property rank: | ||||
|  |  | |||
|  | @ -16,6 +16,7 @@ from .typedefs cimport hash_t | |||
| from .structs cimport TokenC | ||||
| from .tokens.doc cimport Doc, get_token_attr | ||||
| from .vocab cimport Vocab | ||||
| from .errors import Errors, TempErrors | ||||
| 
 | ||||
| from .attrs import IDS | ||||
| from .attrs cimport attr_id_t, ID, NULL_ATTR | ||||
|  | @ -109,7 +110,8 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: | |||
|     while pattern.nr_attr != 0: | ||||
|         pattern += 1 | ||||
|     id_attr = pattern[0].attrs[0] | ||||
|     assert id_attr.attr == ID | ||||
|     if id_attr.attr != ID: | ||||
|         raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr)) | ||||
|     return id_attr.value | ||||
| 
 | ||||
| 
 | ||||
|  | @ -161,8 +163,8 @@ def _convert_strings(token_specs, string_store): | |||
|                 if value in operators: | ||||
|                     ops = operators[value] | ||||
|                 else: | ||||
|                     msg = "Unknown operator '%s'. Options: %s" | ||||
|                     raise KeyError(msg % (value, ', '.join(operators.keys()))) | ||||
|                     keys = ', '.join(operators.keys()) | ||||
|                     raise KeyError(Errors.E011.format(op=value, opts=keys)) | ||||
|             if isinstance(attr, basestring): | ||||
|                 attr = IDS.get(attr.upper()) | ||||
|             if isinstance(value, basestring): | ||||
|  | @ -264,9 +266,7 @@ cdef class Matcher: | |||
|         """ | ||||
|         for pattern in patterns: | ||||
|             if len(pattern) == 0: | ||||
|                 msg = ("Cannot add pattern for zero tokens to matcher.\n" | ||||
|                        "key: {key}\n") | ||||
|                 raise ValueError(msg.format(key=key)) | ||||
|                 raise ValueError(Errors.E012.format(key=key)) | ||||
|         key = self._normalize_key(key) | ||||
|         for pattern in patterns: | ||||
|             specs = _convert_strings(pattern, self.vocab.strings) | ||||
|  | @ -348,13 +348,12 @@ cdef class Matcher: | |||
|             for state in partials: | ||||
|                 action = get_action(state.second, token) | ||||
|                 if action == PANIC: | ||||
|                     raise Exception("Error selecting action in matcher") | ||||
|                     raise ValueError(Errors.E013) | ||||
|                 while action == ADVANCE_ZERO: | ||||
|                     state.second += 1 | ||||
|                     action = get_action(state.second, token) | ||||
|                 if action == PANIC: | ||||
|                     raise Exception("Error selecting action in matcher") | ||||
| 
 | ||||
|                     raise ValueError(Errors.E013) | ||||
|                 if action == REPEAT: | ||||
|                     # Leave the state in the queue, and advance to next slot | ||||
|                     # (i.e. we don't overwrite -- we want to greedily match | ||||
|  | @ -380,7 +379,7 @@ cdef class Matcher: | |||
|             for pattern in self.patterns: | ||||
|                 action = get_action(pattern, token) | ||||
|                 if action == PANIC: | ||||
|                     raise Exception("Error selecting action in matcher") | ||||
|                     raise ValueError(Errors.E013) | ||||
|                 while action == ADVANCE_ZERO: | ||||
|                     pattern += 1 | ||||
|                     action = get_action(pattern, token) | ||||
|  | @ -447,7 +446,7 @@ def get_bilou(length): | |||
|         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, | ||||
|                 I10_ENT, I10_ENT, L10_ENT] | ||||
|     else: | ||||
|         raise ValueError("Max length currently 10 for phrase matching") | ||||
|         raise ValueError(TempErrors.T001) | ||||
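A hedged sketch of the phrase-pattern length limits enforced here and in PhraseMatcher.add() below; the pattern text and model name are illustrative.

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.load('en_core_web_sm')                            # assumed model name
matcher = PhraseMatcher(nlp.vocab)
matcher.add('PRODUCT', None, nlp(u'deep learning toolkit'))   # 3 tokens: fine
# A pattern of 10 or more tokens fails the `len(doc) >= max_length` check in
# PhraseMatcher.add() and raises the TempErrors.T002 ValueError.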
| 
 | ||||
| 
 | ||||
| cdef class PhraseMatcher: | ||||
|  | @ -506,11 +505,8 @@ cdef class PhraseMatcher: | |||
|         cdef Doc doc | ||||
|         for doc in docs: | ||||
|             if len(doc) >= self.max_length: | ||||
|                 msg = ( | ||||
|                     "Pattern length (%d) >= phrase_matcher.max_length (%d). " | ||||
|                     "Length can be set on initialization, up to 10." | ||||
|                 ) | ||||
|                 raise ValueError(msg % (len(doc), self.max_length)) | ||||
|                 raise ValueError(TempErrors.T002.format(doc_len=len(doc), | ||||
|                                                         max_len=self.max_length)) | ||||
|         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||
|         self._callbacks[ent_id] = on_match | ||||
|         cdef int length | ||||
|  | @ -562,7 +558,9 @@ cdef class PhraseMatcher: | |||
|             yield doc | ||||
| 
 | ||||
|     def accept_match(self, Doc doc, int start, int end): | ||||
|         assert (end - start) < self.max_length | ||||
|         if (end - start) >= self.max_length: | ||||
|             raise ValueError(Errors.E075.format(length=end - start, | ||||
|                                                 max_len=self.max_length)) | ||||
|         cdef int i, j | ||||
|         for i in range(self.max_length): | ||||
|             self._phrase_key[i] = 0 | ||||
|  |  | |||
|  | @ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs | |||
| from .parts_of_speech cimport SPACE | ||||
| from .parts_of_speech import IDS as POS_IDS | ||||
| from .lexeme cimport Lexeme | ||||
| from .errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| def _normalize_props(props): | ||||
|  | @ -93,7 +94,7 @@ cdef class Morphology: | |||
| 
 | ||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: | ||||
|         if tag_id > self.n_tags: | ||||
|             raise ValueError("Unknown tag ID: %s" % tag_id) | ||||
|             raise ValueError(Errors.E014.format(tag=tag_id)) | ||||
|         # TODO: It's pretty arbitrary to put this logic here. I guess the | ||||
|         # justification is that this is where the specific word and the tag | ||||
|         # interact. Still, we should have a better way to enforce this rule, or | ||||
|  | @ -147,9 +148,7 @@ cdef class Morphology: | |||
|         elif force: | ||||
|             memset(cached, 0, sizeof(cached[0])) | ||||
|         else: | ||||
|             raise ValueError( | ||||
|                 "Conflicting morphology exception for (%s, %s). Use " | ||||
|                 "force=True to overwrite." % (tag_str, orth_str)) | ||||
|             raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) | ||||
| 
 | ||||
|         cached.tag = rich_tag | ||||
|         # TODO: Refactor this to take arbitrary attributes. | ||||
|  |  | |||
|  | @ -33,6 +33,7 @@ from .parts_of_speech import X | |||
| from ._ml import Tok2Vec, build_text_classifier, build_tagger_model | ||||
| from ._ml import link_vectors_to_models, zero_init, flatten | ||||
| from ._ml import create_default_optimizer | ||||
| from .errors import Errors, TempErrors | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -336,7 +337,8 @@ class Tensorizer(Pipe): | |||
|         tensors (object): Vector representation for each token in the docs. | ||||
|         """ | ||||
|         for doc, tensor in zip(docs, tensors): | ||||
|             assert tensor.shape[0] == len(doc) | ||||
|             if tensor.shape[0] != len(doc): | ||||
|                 raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) | ||||
|             doc.tensor = tensor | ||||
| 
 | ||||
|     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||
|  | @ -550,9 +552,7 @@ class Tagger(Pipe): | |||
|             # copy_array(larger.W[:smaller.nO], smaller.W) | ||||
|             # copy_array(larger.b[:smaller.nO], smaller.b) | ||||
|             # self.model._layers[-1] = larger | ||||
|             raise ValueError( | ||||
|                 "Resizing pre-trained Tagger models is not " | ||||
|                 "currently supported.") | ||||
|             raise ValueError(TempErrors.T003) | ||||
|         tag_map = dict(self.vocab.morphology.tag_map) | ||||
|         if values is None: | ||||
|             values = {POS: "X"} | ||||
|  | @ -671,8 +671,7 @@ class MultitaskObjective(Tagger): | |||
|         elif hasattr(target, '__call__'): | ||||
|             self.make_label = target | ||||
|         else: | ||||
|             raise ValueError("MultitaskObjective target should be function or " | ||||
|                              "one of: dep, tag, ent, dep_tag_offset, ent_tag.") | ||||
|             raise ValueError(Errors.E016) | ||||
|         self.cfg = dict(cfg) | ||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||
| 
 | ||||
|  | @ -723,7 +722,9 @@ class MultitaskObjective(Tagger): | |||
|         return tokvecs, scores | ||||
| 
 | ||||
|     def get_loss(self, docs, golds, scores): | ||||
|         assert len(docs) == len(golds) | ||||
|         if len(docs) != len(golds): | ||||
|             raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs), | ||||
|                                                 n_golds=len(golds))) | ||||
|         cdef int idx = 0 | ||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||
|         guesses = scores.argmax(axis=1) | ||||
|  |  | |||
|  | @ -2,6 +2,7 @@ | |||
| from __future__ import division, print_function, unicode_literals | ||||
| 
 | ||||
| from .gold import tags_to_entities | ||||
| from .errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| class PRFScore(object): | ||||
|  | @ -84,7 +85,8 @@ class Scorer(object): | |||
|         } | ||||
| 
 | ||||
|     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): | ||||
|         assert len(tokens) == len(gold) | ||||
|         if len(tokens) != len(gold): | ||||
|             raise ValueError(Errors.E078.format(words_doc=len(tokens), words_gold=len(gold))) | ||||
|         gold_deps = set() | ||||
|         gold_tags = set() | ||||
|         gold_ents = set(tags_to_entities([annot[-1] | ||||
|  |  | |||
|  | @ -13,6 +13,7 @@ from .symbols import IDS as SYMBOLS_BY_STR | |||
| from .symbols import NAMES as SYMBOLS_BY_INT | ||||
| from .typedefs cimport hash_t | ||||
| from .compat import json_dumps | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -59,7 +60,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | |||
|         string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char)) | ||||
|         string.p[0] = length | ||||
|         memcpy(&string.p[1], chars, length) | ||||
|         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] | ||||
|         return string | ||||
|     else: | ||||
|         i = 0 | ||||
|  | @ -69,7 +69,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | |||
|             string.p[i] = 255 | ||||
|         string.p[n_length_bytes-1] = length % 255 | ||||
|         memcpy(&string.p[n_length_bytes], chars, length) | ||||
|         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] | ||||
|         return string | ||||
| 
 | ||||
| 
 | ||||
|  | @ -115,7 +114,7 @@ cdef class StringStore: | |||
|             self.hits.insert(key) | ||||
|             utf8str = <Utf8Str*>self._map.get(key) | ||||
|             if utf8str is NULL: | ||||
|                 raise KeyError(string_or_id) | ||||
|                 raise KeyError(Errors.E018.format(hash_value=string_or_id)) | ||||
|             else: | ||||
|                 return decode_Utf8Str(utf8str) | ||||
| 
 | ||||
|  | @ -136,8 +135,7 @@ cdef class StringStore: | |||
|             key = hash_utf8(string, len(string)) | ||||
|             self._intern_utf8(string, len(string)) | ||||
|         else: | ||||
|             raise TypeError( | ||||
|                 "Can only add unicode or bytes. Got type: %s" % type(string)) | ||||
|             raise TypeError(Errors.E017.format(value_type=type(string))) | ||||
|         return key | ||||
| 
 | ||||
|     def __len__(self): | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ from thinc.extra.search cimport MaxViolation | |||
| 
 | ||||
| from .transition_system cimport TransitionSystem, Transition | ||||
| from ..gold cimport GoldParse | ||||
| from ..errors import Errors | ||||
| from .stateclass cimport StateC, StateClass | ||||
| 
 | ||||
| 
 | ||||
|  | @ -220,7 +221,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update): | |||
|     p_indices = [] | ||||
|     g_indices = [] | ||||
|     cdef Beam pbeam, gbeam | ||||
|     assert len(pbeams) == len(gbeams) | ||||
|     if len(pbeams) != len(gbeams): | ||||
|         raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams))) | ||||
|     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): | ||||
|         p_indices.append([]) | ||||
|         g_indices.append([]) | ||||
|  | @ -228,7 +230,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update): | |||
|             state = StateClass.borrow(<StateC*>pbeam.at(i)) | ||||
|             if not state.is_final(): | ||||
|                 key = tuple([eg_id] + pbeam.histories[i]) | ||||
|                 assert key not in seen, (key, seen) | ||||
|                 if key in seen: | ||||
|                     raise ValueError(Errors.E080.format(key=key)) | ||||
|                 seen[key] = len(states) | ||||
|                 p_indices[-1].append(len(states)) | ||||
|                 states.append(state) | ||||
|  | @ -271,7 +274,8 @@ def get_gradient(nr_class, beam_maps, histories, losses): | |||
|     for i in range(nr_step): | ||||
|         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), | ||||
|                                  dtype='f')) | ||||
|     assert len(histories) == len(losses) | ||||
|     if len(histories) != len(losses): | ||||
|         raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses))) | ||||
|     for eg_id, hists in enumerate(histories): | ||||
|         for loss, hist in zip(losses[eg_id], hists): | ||||
|             if loss == 0.0 or numpy.isnan(loss): | ||||
|  |  | |||
|  | @ -15,6 +15,7 @@ from .nonproj import is_nonproj_tree | |||
| from .transition_system cimport move_cost_func_t, label_cost_func_t | ||||
| from ..gold cimport GoldParse, GoldParseC | ||||
| from ..structs cimport TokenC | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| DEF NON_MONOTONIC = True | ||||
|  | @ -455,7 +456,7 @@ cdef class ArcEager(TransitionSystem): | |||
|             t.do = Break.transition | ||||
|             t.get_cost = Break.cost | ||||
|         else: | ||||
|             raise Exception(move) | ||||
|             raise ValueError(Errors.E019.format(action=move, src='arc_eager')) | ||||
|         return t | ||||
| 
 | ||||
|     cdef int initialize_state(self, StateC* st) nogil: | ||||
|  | @ -529,28 +530,11 @@ cdef class ArcEager(TransitionSystem): | |||
|         if n_gold < 1: | ||||
|             # Check projectivity --- leading cause | ||||
|             if is_nonproj_tree(gold.heads): | ||||
|                 raise ValueError( | ||||
|                     "Could not find a gold-standard action to supervise the " | ||||
|                     "dependency parser. Likely cause: the tree is " | ||||
|                     "non-projective (i.e. it has crossing arcs -- see " | ||||
|                     "spacy/syntax/nonproj.pyx for definitions). The ArcEager " | ||||
|                     "transition system only supports projective trees. To " | ||||
|                     "learn non-projective representations, transform the data " | ||||
|                     "before training and after parsing. Either pass " | ||||
|                     "make_projective=True to the GoldParse class, or use " | ||||
|                     "spacy.syntax.nonproj.preprocess_training_data.") | ||||
|                 raise ValueError(Errors.E020) | ||||
|             else: | ||||
|                 print(gold.orig_annot) | ||||
|                 print(gold.words) | ||||
|                 print(gold.heads) | ||||
|                 print(gold.labels) | ||||
|                 print(gold.sent_starts) | ||||
|                 raise ValueError( | ||||
|                     "Could not find a gold-standard action to supervise the" | ||||
|                     "dependency parser. The GoldParse was projective. The " | ||||
|                     "transition system has %d actions. State at failure: %s" | ||||
|                     % (self.n_moves, stcls.print_state(gold.words))) | ||||
|         assert n_gold >= 1 | ||||
|                 failure_state = stcls.print_state(gold.words) | ||||
|                 raise ValueError(Errors.E021.format(n_actions=self.n_moves, | ||||
|                                                     state=failure_state)) | ||||
| 
 | ||||
|     def get_beam_annot(self, Beam beam): | ||||
|         length = (<StateC*>beam.at(0)).length | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ from ._state cimport StateC | |||
| from .transition_system cimport Transition | ||||
| from .transition_system cimport do_func_t | ||||
| from ..gold cimport GoldParseC, GoldParse | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| cdef enum: | ||||
|  | @ -173,7 +174,7 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             if self.c[i].move == move and self.c[i].label == label: | ||||
|                 return self.c[i] | ||||
|         else: | ||||
|             raise KeyError(name) | ||||
|             raise KeyError(Errors.E022.format(name=name)) | ||||
| 
 | ||||
|     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: | ||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||
|  | @ -208,7 +209,7 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             t.do = Out.transition | ||||
|             t.get_cost = Out.cost | ||||
|         else: | ||||
|             raise Exception(move) | ||||
|             raise ValueError(Errors.E019.format(action=move, src='ner')) | ||||
|         return t | ||||
| 
 | ||||
|     def add_action(self, int action, label_name): | ||||
|  | @ -230,7 +231,6 @@ cdef class BiluoPushDown(TransitionSystem): | |||
|             self._size *= 2 | ||||
|             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||
|         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||
|         assert self.c[self.n_moves].label == label_id | ||||
|         self.n_moves += 1 | ||||
|         return 1 | ||||
| 
 | ||||
|  |  | |||
|  | @ -34,6 +34,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer | |||
| from ..compat import json_dumps, copy_array | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..gold cimport GoldParse | ||||
| from ..errors import Errors, TempErrors | ||||
| from .. import util | ||||
| from .stateclass cimport StateClass | ||||
| from ._state cimport StateC | ||||
|  | @ -242,7 +243,7 @@ cdef class Parser: | |||
|     def Model(cls, nr_class, **cfg): | ||||
|         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) | ||||
|         if depth != 1: | ||||
|             raise ValueError("Currently parser depth is hard-coded to 1.") | ||||
|             raise ValueError(TempErrors.T004.format(value=depth)) | ||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', | ||||
|                                             cfg.get('maxout_pieces', 2)) | ||||
|         token_vector_width = util.env_opt('token_vector_width', | ||||
|  | @ -252,9 +253,9 @@ cdef class Parser: | |||
|         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) | ||||
|         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) | ||||
|         if hist_size != 0: | ||||
|             raise ValueError("Currently history size is hard-coded to 0") | ||||
|             raise ValueError(TempErrors.T005.format(value=hist_size)) | ||||
|         if hist_width != 0: | ||||
|             raise ValueError("Currently history width is hard-coded to 0") | ||||
|             raise ValueError(TempErrors.T006.format(value=hist_width)) | ||||
|         pretrained_vectors = cfg.get('pretrained_vectors', None) | ||||
|         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||
|                           pretrained_vectors=pretrained_vectors) | ||||
|  | @ -542,7 +543,9 @@ cdef class Parser: | |||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||
|         if not any(self.moves.has_gold(gold) for gold in golds): | ||||
|             return None | ||||
|         assert len(docs) == len(golds) | ||||
|         if len(docs) != len(golds): | ||||
|             raise ValueError(Errors.E077.format(value='update', n_docs=len(docs), | ||||
|                                                 n_golds=len(golds))) | ||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: | ||||
|             return self.update_beam(docs, golds, | ||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||
|  | @ -622,7 +625,6 @@ cdef class Parser: | |||
|         if losses is not None and self.name not in losses: | ||||
|             losses[self.name] = 0. | ||||
|         lengths = [len(d) for d in docs] | ||||
|         assert min(lengths) >= 1 | ||||
|         states = self.moves.init_batch(docs) | ||||
|         for gold in golds: | ||||
|             self.moves.preprocess_gold(gold) | ||||
|  | @ -1021,15 +1023,11 @@ def _cleanup(Beam beam): | |||
|             del state | ||||
|             seen.add(addr) | ||||
|         else: | ||||
|             print(i, addr) | ||||
|             print(seen) | ||||
|             raise Exception | ||||
|             raise ValueError(Errors.E023.format(addr=addr, i=i)) | ||||
|         addr = <size_t>beam._states[i].content | ||||
|         if addr not in seen: | ||||
|             state = <StateC*>addr | ||||
|             del state | ||||
|             seen.add(addr) | ||||
|         else: | ||||
|             print(i, addr) | ||||
|             print(seen) | ||||
|             raise Exception | ||||
|             raise ValueError(Errors.E023.format(addr=addr, i=i)) | ||||
|  |  | |||
|  | @ -10,6 +10,7 @@ from __future__ import unicode_literals | |||
| from copy import copy | ||||
| 
 | ||||
| from ..tokens.doc cimport Doc | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| DELIMITER = '||' | ||||
|  | @ -131,7 +132,10 @@ cpdef deprojectivize(Doc doc): | |||
| 
 | ||||
| def _decorate(heads, proj_heads, labels): | ||||
|     # uses decoration scheme HEAD from Nivre & Nilsson 2005 | ||||
|     assert(len(heads) == len(proj_heads) == len(labels)) | ||||
|     if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)): | ||||
|         raise ValueError(Errors.E082.format(n_heads=len(heads), | ||||
|                                             n_proj_heads=len(proj_heads), | ||||
|                                             n_labels=len(labels))) | ||||
|     deco_labels = [] | ||||
|     for tokenid, head in enumerate(heads): | ||||
|         if head != proj_heads[tokenid]: | ||||
|  |  | |||
|  | @ -12,6 +12,7 @@ from ..structs cimport TokenC | |||
| from .stateclass cimport StateClass | ||||
| from ..typedefs cimport attr_t | ||||
| from ..compat import json_dumps | ||||
| from ..errors import Errors | ||||
| from .. import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -80,10 +81,7 @@ cdef class TransitionSystem: | |||
|                     action.do(state.c, action.label) | ||||
|                     break | ||||
|             else: | ||||
|                 print(gold.words) | ||||
|                 print(gold.ner) | ||||
|                 print(history) | ||||
|                 raise ValueError("Could not find gold move") | ||||
|                 raise ValueError(Errors.E024) | ||||
|         return history | ||||
| 
 | ||||
|     cdef int initialize_state(self, StateC* state) nogil: | ||||
|  | @ -130,17 +128,7 @@ cdef class TransitionSystem: | |||
|             else: | ||||
|                 costs[i] = 9000 | ||||
|         if n_gold <= 0: | ||||
|             print(gold.words) | ||||
|             print(gold.ner) | ||||
|             print([gold.c.ner[i].clas for i in range(gold.length)]) | ||||
|             print([gold.c.ner[i].move for i in range(gold.length)]) | ||||
|             print([gold.c.ner[i].label for i in range(gold.length)]) | ||||
|             print("Self labels", | ||||
|                   [self.c[i].label for i in range(self.n_moves)]) | ||||
|             raise ValueError( | ||||
|                 "Could not find a gold-standard action to supervise " | ||||
|                 "the entity recognizer. The transition system has " | ||||
|                 "%d actions." % (self.n_moves)) | ||||
|             raise ValueError(Errors.E024) | ||||
| 
 | ||||
|     def get_class_name(self, int clas): | ||||
|         act = self.c[clas] | ||||
|  | @ -162,7 +150,6 @@ cdef class TransitionSystem: | |||
|             self._size *= 2 | ||||
|             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||
|         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||
|         assert self.c[self.n_moves].label == label_id | ||||
|         self.n_moves += 1 | ||||
|         return 1 | ||||
| 
 | ||||
|  |  | |||
|  | @ -13,6 +13,7 @@ cimport cython | |||
| 
 | ||||
| from .tokens.doc cimport Doc | ||||
| from .strings cimport hash_string | ||||
| from .errors import Errors, Warnings, deprecation_warning | ||||
| from . import util | ||||
| 
 | ||||
| 
 | ||||
|  | @ -63,11 +64,7 @@ cdef class Tokenizer: | |||
|         return (self.__class__, args, None, None) | ||||
| 
 | ||||
|     cpdef Doc tokens_from_list(self, list strings): | ||||
|         util.deprecated( | ||||
|             "Tokenizer.from_list is now deprecated. Create a new Doc " | ||||
|             "object instead and pass in the strings as the `words` keyword " | ||||
|             "argument, for example:\nfrom spacy.tokens import Doc\n" | ||||
|             "doc = Doc(nlp.vocab, words=[...])") | ||||
|         deprecation_warning(Warnings.W002) | ||||
|         return Doc(self.vocab, words=strings) | ||||
| 
 | ||||
|     @cython.boundscheck(False) | ||||
|  | @ -78,8 +75,7 @@ cdef class Tokenizer: | |||
|         RETURNS (Doc): A container for linguistic annotations. | ||||
|         """ | ||||
|         if len(string) >= (2 ** 30): | ||||
|             msg = "String is too long: %d characters. Max is 2**30." | ||||
|             raise ValueError(msg % len(string)) | ||||
|             raise ValueError(Errors.E025.format(length=len(string))) | ||||
|         cdef int length = len(string) | ||||
|         cdef Doc doc = Doc(self.vocab) | ||||
|         if length == 0: | ||||
|  |  | |||
|  | @ -31,7 +31,7 @@ from ..attrs cimport ENT_TYPE, SENT_START | |||
| from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | ||||
| from ..util import normalize_slice | ||||
| from ..compat import is_config, copy_reg, pickle, basestring_ | ||||
| from .. import about | ||||
| from ..errors import Errors, Warnings, deprecation_warning | ||||
| from .. import util | ||||
| from .underscore import Underscore | ||||
| from ._retokenize import Retokenizer | ||||
|  | @ -41,9 +41,9 @@ DEF PADDING = 5 | |||
| 
 | ||||
| cdef int bounds_check(int i, int length, int padding) except -1: | ||||
|     if (i + padding) < 0: | ||||
|         raise IndexError | ||||
|         raise IndexError(Errors.E026.format(i=i, length=length)) | ||||
|     if (i - padding) >= length: | ||||
|         raise IndexError | ||||
|         raise IndexError(Errors.E026.format(i=i, length=length)) | ||||
| 
 | ||||
| 
 | ||||
| cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | ||||
|  | @ -98,7 +98,8 @@ cdef class Doc: | |||
|     def set_extension(cls, name, default=None, method=None, | ||||
|                       getter=None, setter=None): | ||||
|         nr_defined = sum(t is not None for t in (default, getter, setter, method)) | ||||
|         assert nr_defined == 1 | ||||
|         if nr_defined != 1: | ||||
|             raise ValueError(Errors.E083.format(n_args=nr_defined)) | ||||
|         Underscore.doc_extensions[name] = (default, method, getter, setter) | ||||
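A hedged sketch of the "exactly one of default, method, getter, setter" rule enforced above; the extension names are illustrative.

from spacy.tokens import Doc

Doc.set_extension('is_greeting', default=False)                     # one keyword: fine
Doc.set_extension('num_tokens', getter=lambda doc: len(doc))        # also fine
# Doc.set_extension('bad', default=False, getter=lambda doc: True)  # -> ValueError(E083)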
| 
 | ||||
|     @classmethod | ||||
|  | @ -155,11 +156,7 @@ cdef class Doc: | |||
|             if spaces is None: | ||||
|                 spaces = [True] * len(words) | ||||
|             elif len(spaces) != len(words): | ||||
|                 raise ValueError( | ||||
|                     "Arguments 'words' and 'spaces' should be sequences of " | ||||
|                     "the same length, or 'spaces' should be left default at " | ||||
|                     "None. spaces should be a sequence of booleans, with True " | ||||
|                     "meaning that the word owns a ' ' character following it.") | ||||
|                 raise ValueError(Errors.E027) | ||||
|             orths_and_spaces = zip(words, spaces) | ||||
|         if orths_and_spaces is not None: | ||||
|             for orth_space in orths_and_spaces: | ||||
|  | @ -167,10 +164,7 @@ cdef class Doc: | |||
|                     orth = orth_space | ||||
|                     has_space = True | ||||
|                 elif isinstance(orth_space, bytes): | ||||
|                     raise ValueError( | ||||
|                         "orths_and_spaces expects either List(unicode) or " | ||||
|                         "List((unicode, bool)). " | ||||
|                         "Got bytes instance: %s" % (str(orth_space))) | ||||
|                     raise ValueError(Errors.E028.format(value=orth_space)) | ||||
|                 else: | ||||
|                     orth, has_space = orth_space | ||||
|                 # Note that we pass self.mem here --- we have ownership, if LexemeC | ||||
|  | @ -504,11 +498,7 @@ cdef class Doc: | |||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.is_parsed: | ||||
|                 raise ValueError( | ||||
|                     "noun_chunks requires the dependency parse, which " | ||||
|                     "requires a statistical model to be installed and loaded. " | ||||
|                     "For more info, see the " | ||||
|                     "documentation: \n%s\n" % about.__docs_models__) | ||||
|                 raise ValueError(Errors.E029) | ||||
|             # Accumulate the result before beginning to iterate over it. This | ||||
|             # prevents the tokenisation from being changed out from under us | ||||
|             # during the iteration. The tricky thing here is that Span accepts | ||||
|  | @ -533,12 +523,7 @@ cdef class Doc: | |||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.is_sentenced: | ||||
|                 raise ValueError( | ||||
|                     "Sentence boundaries unset. You can add the 'sentencizer' " | ||||
|                     "component to the pipeline with: " | ||||
|                     "nlp.add_pipe(nlp.create_pipe('sentencizer')) " | ||||
|                     "Alternatively, add the dependency parser, or set " | ||||
|                     "sentence boundaries by setting doc[i].sent_start") | ||||
|                 raise ValueError(Errors.E030) | ||||
|             if 'sents' in self.user_hooks: | ||||
|                 yield from self.user_hooks['sents'](self) | ||||
|             else: | ||||
|  | @ -568,7 +553,8 @@ cdef class Doc: | |||
|             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy | ||||
|         t.l_edge = self.length | ||||
|         t.r_edge = self.length | ||||
|         assert t.lex.orth != 0 | ||||
|         if t.lex.orth == 0: | ||||
|             raise ValueError(Errors.E031.format(i=self.length)) | ||||
|         t.spacy = has_space | ||||
|         self.length += 1 | ||||
|         return t.idx + t.lex.length + t.spacy | ||||
|  | @ -684,13 +670,7 @@ cdef class Doc: | |||
| 
 | ||||
|     def from_array(self, attrs, array): | ||||
|         if SENT_START in attrs and HEAD in attrs: | ||||
|             raise ValueError( | ||||
|                 "Conflicting attributes specified in doc.from_array(): " | ||||
|                 "(HEAD, SENT_START)\n" | ||||
|                 "The HEAD attribute currently sets sentence boundaries " | ||||
|                 "implicitly, based on the tree structure. This means the HEAD " | ||||
|                 "attribute would potentially override the sentence boundaries " | ||||
|                 "set by SENT_START.") | ||||
|             raise ValueError(Errors.E032) | ||||
|         cdef int i, col | ||||
|         cdef attr_id_t attr_id | ||||
|         cdef TokenC* tokens = self.c | ||||
|  | @ -828,7 +808,7 @@ cdef class Doc: | |||
|         RETURNS (Doc): Itself. | ||||
|         """ | ||||
|         if self.length != 0: | ||||
|             raise ValueError("Cannot load into non-empty Doc") | ||||
|             raise ValueError(Errors.E033.format(length=self.length)) | ||||
|         deserializers = { | ||||
|             'text': lambda b: None, | ||||
|             'array_head': lambda b: None, | ||||
|  | @ -916,10 +896,7 @@ cdef class Doc: | |||
|         """ | ||||
|         cdef unicode tag, lemma, ent_type | ||||
|         if len(args) == 3: | ||||
|             util.deprecated( | ||||
|                 "Positional arguments to Doc.merge are deprecated. Instead, " | ||||
|                 "use the keyword arguments, for example tag=, lemma= or " | ||||
|                 "ent_type=.") | ||||
|             deprecation_warning(Warnings.W003) | ||||
|             tag, lemma, ent_type = args | ||||
|             attributes[TAG] = tag | ||||
|             attributes[LEMMA] = lemma | ||||
|  | @ -933,13 +910,9 @@ cdef class Doc: | |||
|             if 'ent_type' in attributes: | ||||
|                 attributes[ENT_TYPE] = attributes['ent_type'] | ||||
|         elif args: | ||||
|             raise ValueError( | ||||
|                 "Doc.merge received %d non-keyword arguments. Expected either " | ||||
|                 "3 arguments (deprecated), or 0 (use keyword arguments). " | ||||
|                 "Arguments supplied:\n%s\n" | ||||
|                 "Keyword arguments: %s\n" % (len(args), repr(args), | ||||
|                                              repr(attributes))) | ||||
| 
 | ||||
|             raise ValueError(Errors.E034.format(n_args=len(args), | ||||
|                                                 args=repr(args), | ||||
|                                                 kwargs=repr(attributes))) | ||||
|         # More deprecated attribute handling =/ | ||||
|         if 'label' in attributes: | ||||
|             attributes['ent_type'] = attributes.pop('label') | ||||
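A hedged sketch contrasting the keyword form with the deprecated three-positional-argument form handled above; the example text and attribute values are illustrative, and the first two arguments are assumed to be character offsets as in the v2.x Doc.merge() signature.

import spacy

nlp = spacy.load('en_core_web_sm')                      # assumed model name
doc = nlp(u'I flew to New York City last week.')
span = doc[3:6]                                         # "New York City"
doc.merge(span.start_char, span.end_char,
          tag=u'NNP', lemma=u'New York City', ent_type=u'GPE')   # keyword form
# doc.merge(span.start_char, span.end_char,
#           u'NNP', u'New York City', u'GPE')           # positional form emits Warnings.W003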
|  |  | |||
|  | @ -16,7 +16,7 @@ from ..util import normalize_slice | |||
| from ..attrs cimport IS_PUNCT, IS_SPACE | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..compat import is_config | ||||
| from .. import about | ||||
| from ..errors import Errors, TempErrors | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
|  | @ -48,8 +48,7 @@ cdef class Span: | |||
|         RETURNS (Span): The newly constructed object. | ||||
|         """ | ||||
|         if not (0 <= start <= end <= len(doc)): | ||||
|             raise IndexError | ||||
| 
 | ||||
|             raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc))) | ||||
|         self.doc = doc | ||||
|         self.start = start | ||||
|         self.start_char = self.doc[start].idx if start < self.doc.length else 0 | ||||
|  | @ -58,7 +57,8 @@ cdef class Span: | |||
|             self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) | ||||
|         else: | ||||
|             self.end_char = 0 | ||||
|         assert label in doc.vocab.strings, label | ||||
|         if label not in doc.vocab.strings: | ||||
|             raise ValueError(Errors.E084.format(label=label)) | ||||
|         self.label = label | ||||
|         self._vector = vector | ||||
|         self._vector_norm = vector_norm | ||||
|  | @ -267,11 +267,10 @@ cdef class Span: | |||
|         or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: | ||||
|             start = token_by_start(self.doc.c, self.doc.length, self.start_char) | ||||
|             if self.start == -1: | ||||
|                 raise IndexError("Error calculating span: Can't find start") | ||||
|                 raise IndexError(Errors.E036.format(start=self.start_char)) | ||||
|             end = token_by_end(self.doc.c, self.doc.length, self.end_char) | ||||
|             if end == -1: | ||||
|                 raise IndexError("Error calculating span: Can't find end") | ||||
| 
 | ||||
|                 raise IndexError(Errors.E037.format(end=self.end_char)) | ||||
|             self.start = start | ||||
|             self.end = end + 1 | ||||
| 
 | ||||
|  | @ -293,7 +292,7 @@ cdef class Span: | |||
|                 root += root.head | ||||
|                 n += 1 | ||||
|                 if n >= self.doc.length: | ||||
|                     raise RuntimeError | ||||
|                     raise RuntimeError(Errors.E038) | ||||
|             return self.doc[root.l_edge:root.r_edge + 1] | ||||
| 
 | ||||
|     property has_vector: | ||||
|  | @ -376,11 +375,7 @@ cdef class Span: | |||
|         """ | ||||
|         def __get__(self): | ||||
|             if not self.doc.is_parsed: | ||||
|                 raise ValueError( | ||||
|                     "noun_chunks requires the dependency parse, which " | ||||
|                     "requires a statistical model to be installed and loaded. " | ||||
|                     "For more info, see the " | ||||
|                     "documentation: \n%s\n" % about.__docs_models__) | ||||
|                 raise ValueError(Errors.E029) | ||||
|             # Accumulate the result before beginning to iterate over it. This | ||||
|             # prevents the tokenisation from being changed out from under us | ||||
|             # during the iteration. The tricky thing here is that Span accepts | ||||
|  | @ -526,9 +521,7 @@ cdef class Span: | |||
|             return self.root.ent_id | ||||
| 
 | ||||
|         def __set__(self, hash_t key): | ||||
|             raise NotImplementedError( | ||||
|                 "Can't yet set ent_id from Span. Vote for this feature on " | ||||
|                 "the issue tracker: http://github.com/explosion/spaCy/issues") | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='ent_id')) | ||||
| 
 | ||||
|     property ent_id_: | ||||
|         """RETURNS (unicode): The (string) entity ID.""" | ||||
|  | @ -536,9 +529,7 @@ cdef class Span: | |||
|             return self.root.ent_id_ | ||||
| 
 | ||||
|         def __set__(self, hash_t key): | ||||
|             raise NotImplementedError( | ||||
|                 "Can't yet set ent_id_ from Span. Vote for this feature on the " | ||||
|                 "issue tracker: http://github.com/explosion/spaCy/issues") | ||||
|             raise NotImplementedError(TempErrors.T007.format(attr='ent_id_')) | ||||
| 
 | ||||
|     property orth_: | ||||
|         """Verbatim text content (identical to Span.text). Exists mostly for | ||||
|  | @ -586,9 +577,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: | |||
|         token += token.head | ||||
|         n += 1 | ||||
|         if n >= sent_length: | ||||
|             raise RuntimeError( | ||||
|                 "Array bounds exceeded while searching for root word. This " | ||||
|                 "likely means the parse tree is in an invalid state. Please " | ||||
|                 "report this issue here: " | ||||
|                 "http://github.com/explosion/spaCy/issues") | ||||
|             raise RuntimeError(Errors.E039) | ||||
|     return n | ||||
|  |  | |||
|  | @ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t | |||
| from ..parts_of_speech cimport univ_pos_t | ||||
| from .doc cimport Doc | ||||
| from ..lexeme cimport Lexeme | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| cdef class Token: | ||||
|  | @ -17,8 +18,7 @@ cdef class Token: | |||
|     @staticmethod | ||||
|     cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc): | ||||
|         if offset < 0 or offset >= doc.length: | ||||
|             msg = "Attempt to access token at %d, max length %d" | ||||
|             raise IndexError(msg % (offset, doc.length)) | ||||
|             raise IndexError(Errors.E040.format(i=offset, max_length=doc.length)) | ||||
|         cdef Token self = Token.__new__(Token, vocab, doc, offset) | ||||
|         return self | ||||
| 
 | ||||
|  |  | |||
|  | @ -19,8 +19,8 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM | |||
| from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX | ||||
| from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP | ||||
| from ..compat import is_config | ||||
| from ..errors import Errors | ||||
| from .. import util | ||||
| from .. import about | ||||
| from .underscore import Underscore | ||||
| 
 | ||||
| 
 | ||||
|  | @ -106,7 +106,7 @@ cdef class Token: | |||
|         elif op == 5: | ||||
|             return my >= their | ||||
|         else: | ||||
|             raise ValueError(op) | ||||
|             raise ValueError(Errors.E041.format(op=op)) | ||||
| 
 | ||||
|     @property | ||||
|     def _(self): | ||||
|  | @ -135,8 +135,7 @@ cdef class Token: | |||
|         RETURNS (Token): The token at position `self.doc[self.i+i]`. | ||||
|         """ | ||||
|         if self.i+i < 0 or (self.i+i >= len(self.doc)): | ||||
|             msg = "Error accessing doc[%d].nbor(%d), for doc of length %d" | ||||
|             raise IndexError(msg % (self.i, i, len(self.doc))) | ||||
|             raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc))) | ||||
|         return self.doc[self.i+i] | ||||
| 
 | ||||
|     def similarity(self, other): | ||||
|  | @ -352,14 +351,7 @@ cdef class Token: | |||
| 
 | ||||
|     property sent_start: | ||||
|         def __get__(self): | ||||
|             # Raising a deprecation warning causes errors for autocomplete | ||||
|             #util.deprecated( | ||||
|             #    "Token.sent_start is now deprecated. Use Token.is_sent_start " | ||||
|             #    "instead, which returns a boolean value or None if the answer " | ||||
|             #    "is unknown – instead of a misleading 0 for False and 1 for " | ||||
|             #    "True. It also fixes a quirk in the old logic that would " | ||||
|             #    "always set the property to 0 for the first word of the " | ||||
|             #    "document.") | ||||
|             # Raising a deprecation warning here causes errors for autocomplete | ||||
|             # Handle broken backwards compatibility case: doc[0].sent_start | ||||
|             # was False. | ||||
|             if self.i == 0: | ||||
|  | @ -384,9 +376,7 @@ cdef class Token: | |||
| 
 | ||||
|         def __set__(self, value): | ||||
|             if self.doc.is_parsed: | ||||
|                 raise ValueError( | ||||
|                     "Refusing to write to token.sent_start if its document " | ||||
|                     "is parsed, because this may cause inconsistent state.") | ||||
|                 raise ValueError(Errors.E043) | ||||
|             if value is None: | ||||
|                 self.c.sent_start = 0 | ||||
|             elif value is True: | ||||
|  | @ -394,8 +384,7 @@ cdef class Token: | |||
|             elif value is False: | ||||
|                 self.c.sent_start = -1 | ||||
|             else: | ||||
|                 raise ValueError("Invalid value for token.sent_start. Must be " | ||||
|                                  "one of: None, True, False") | ||||
|                 raise ValueError(Errors.E044.format(value=value)) | ||||
| 
 | ||||
|     property lefts: | ||||
|         """The leftward immediate children of the word, in the syntactic | ||||
|  | @ -413,8 +402,7 @@ cdef class Token: | |||
|                 nr_iter += 1 | ||||
|                 # This is ugly, but it's a way to guard out infinite loops | ||||
|                 if nr_iter >= 10000000: | ||||
|                     raise RuntimeError("Possibly infinite loop encountered " | ||||
|                                        "while looking for token.lefts") | ||||
|                     raise RuntimeError(Errors.E045.format(attr='token.lefts')) | ||||
| 
 | ||||
|     property rights: | ||||
|         """The rightward immediate children of the word, in the syntactic | ||||
|  | @ -432,8 +420,7 @@ cdef class Token: | |||
|                 ptr -= 1 | ||||
|                 nr_iter += 1 | ||||
|                 if nr_iter >= 10000000: | ||||
|                     raise RuntimeError("Possibly infinite loop encountered " | ||||
|                                        "while looking for token.rights") | ||||
|                     raise RuntimeError(Errors.E045.format(attr='token.rights')) | ||||
|             tokens.reverse() | ||||
|             for t in tokens: | ||||
|                 yield t | ||||
|  |  | |||
|  | @ -3,6 +3,8 @@ from __future__ import unicode_literals | |||
| 
 | ||||
| import functools | ||||
| 
 | ||||
| from ..errors import Errors | ||||
| 
 | ||||
| 
 | ||||
| class Underscore(object): | ||||
|     doc_extensions = {} | ||||
|  | @ -23,7 +25,7 @@ class Underscore(object): | |||
| 
 | ||||
|     def __getattr__(self, name): | ||||
|         if name not in self._extensions: | ||||
|             raise AttributeError(name) | ||||
|             raise AttributeError(Errors.E046.format(name=name)) | ||||
|         default, method, getter, setter = self._extensions[name] | ||||
|         if getter is not None: | ||||
|             return getter(self._obj) | ||||
|  | @ -34,7 +36,7 @@ class Underscore(object): | |||
| 
 | ||||
|     def __setattr__(self, name, value): | ||||
|         if name not in self._extensions: | ||||
|             raise AttributeError(name) | ||||
|             raise AttributeError(Errors.E047.format(name=name)) | ||||
|         default, method, getter, setter = self._extensions[name] | ||||
|         if setter is not None: | ||||
|             return setter(self._obj, value) | ||||
|  |  | |||
|  | @ -11,8 +11,6 @@ import sys | |||
| import textwrap | ||||
| import random | ||||
| from collections import OrderedDict | ||||
| import inspect | ||||
| import warnings | ||||
| from thinc.neural._classes.model import Model | ||||
| import functools | ||||
| import cytoolz | ||||
|  | @ -22,6 +20,7 @@ import numpy.random | |||
| from .symbols import ORTH | ||||
| from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ | ||||
| from .compat import import_file | ||||
| from .errors import Errors | ||||
| 
 | ||||
| # Import these directly from Thinc, so that we're sure we always have the | ||||
| # same version. | ||||
|  | @ -50,8 +49,7 @@ def get_lang_class(lang): | |||
|         try: | ||||
|             module = importlib.import_module('.lang.%s' % lang, 'spacy') | ||||
|         except ImportError: | ||||
|             msg = "Can't import language %s from spacy.lang." | ||||
|             raise ImportError(msg % lang) | ||||
|             raise ImportError(Errors.E048.format(lang=lang)) | ||||
|         LANGUAGES[lang] = getattr(module, module.__all__[0]) | ||||
|     return LANGUAGES[lang] | ||||
| 
 | ||||
|  | @ -108,7 +106,7 @@ def load_model(name, **overrides): | |||
|     """ | ||||
|     data_path = get_data_path() | ||||
|     if not data_path or not data_path.exists(): | ||||
|         raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) | ||||
|         raise IOError(Errors.E049.format(path=path2str(data_path))) | ||||
|     if isinstance(name, basestring_):  # in data dir / shortcut | ||||
|         if name in set([d.name for d in data_path.iterdir()]): | ||||
|             return load_model_from_link(name, **overrides) | ||||
|  | @ -118,7 +116,7 @@ def load_model(name, **overrides): | |||
|             return load_model_from_path(Path(name), **overrides) | ||||
|     elif hasattr(name, 'exists'):  # Path or Path-like to model data | ||||
|         return load_model_from_path(name, **overrides) | ||||
|     raise IOError("Can't find model '%s'" % name) | ||||
|     raise IOError(Errors.E050.format(name=name)) | ||||
| 
 | ||||
| 
 | ||||
| def load_model_from_link(name, **overrides): | ||||
|  | @ -127,9 +125,7 @@ def load_model_from_link(name, **overrides): | |||
|     try: | ||||
|         cls = import_file(name, path) | ||||
|     except AttributeError: | ||||
|         raise IOError( | ||||
|             "Cant' load '%s'. If you're using a shortcut link, make sure it " | ||||
|             "points to a valid package (not just a data directory)." % name) | ||||
|         raise IOError(Errors.E051.format(name=name)) | ||||
|     return cls.load(**overrides) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -173,8 +169,7 @@ def load_model_from_init_py(init_file, **overrides): | |||
|     data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) | ||||
|     data_path = model_path / data_dir | ||||
|     if not model_path.exists(): | ||||
|         msg = "Can't find model directory: %s" | ||||
|         raise ValueError(msg % path2str(data_path)) | ||||
|         raise IOError(Errors.E052.format(path=path2str(data_path))) | ||||
|     return load_model_from_path(data_path, meta, **overrides) | ||||
| 
 | ||||
| 
 | ||||
|  | @ -186,16 +181,14 @@ def get_model_meta(path): | |||
|     """ | ||||
|     model_path = ensure_path(path) | ||||
|     if not model_path.exists(): | ||||
|         msg = "Can't find model directory: %s" | ||||
|         raise ValueError(msg % path2str(model_path)) | ||||
|         raise IOError(Errors.E052.format(path=path2str(model_path))) | ||||
|     meta_path = model_path / 'meta.json' | ||||
|     if not meta_path.is_file(): | ||||
|         raise IOError("Could not read meta.json from %s" % meta_path) | ||||
|         raise IOError(Errors.E053.format(path=meta_path)) | ||||
|     meta = read_json(meta_path) | ||||
|     for setting in ['lang', 'name', 'version']: | ||||
|         if setting not in meta or not meta[setting]: | ||||
|             msg = "No valid '%s' setting found in model meta.json" | ||||
|             raise ValueError(msg % setting) | ||||
|             raise ValueError(Errors.E054.format(setting=setting)) | ||||
|     return meta | ||||
| 
 | ||||
| 
 | ||||
|  | @ -339,13 +332,10 @@ def update_exc(base_exceptions, *addition_dicts): | |||
|         for orth, token_attrs in additions.items(): | ||||
|             if not all(isinstance(attr[ORTH], unicode_) | ||||
|                        for attr in token_attrs): | ||||
|                 msg = "Invalid ORTH value in exception: key='%s', orths='%s'" | ||||
|                 raise ValueError(msg % (orth, token_attrs)) | ||||
|                 raise ValueError(Errors.E055.format(key=orth, orths=token_attrs)) | ||||
|             described_orth = ''.join(attr[ORTH] for attr in token_attrs) | ||||
|             if orth != described_orth: | ||||
|                 msg = ("Invalid tokenizer exception: ORTH values combined " | ||||
|                        "don't match original string. key='%s', orths='%s'") | ||||
|                 raise ValueError(msg % (orth, described_orth)) | ||||
|                 raise ValueError(Errors.E056.format(key=orth, orths=described_orth)) | ||||
|         exc.update(additions) | ||||
|     exc = expand_exc(exc, "'", "’") | ||||
|     return exc | ||||
|  | @ -375,8 +365,7 @@ def expand_exc(excs, search, replace): | |||
| 
 | ||||
| def normalize_slice(length, start, stop, step=None): | ||||
|     if not (step is None or step == 1): | ||||
|         raise ValueError("Stepped slices not supported in Span objects." | ||||
|                          "Try: list(tokens)[start:stop:step] instead.") | ||||
|         raise ValueError(Errors.E057) | ||||
|     if start is None: | ||||
|         start = 0 | ||||
|     elif start < 0: | ||||
|  | @ -387,7 +376,6 @@ def normalize_slice(length, start, stop, step=None): | |||
|     elif stop < 0: | ||||
|         stop += length | ||||
|     stop = min(length, max(start, stop)) | ||||
|     assert 0 <= start <= stop <= length | ||||
|     return start, stop | ||||
| 
 | ||||
| 
 | ||||
|  | @ -524,18 +512,6 @@ def from_disk(path, readers, exclude): | |||
|     return path | ||||
| 
 | ||||
| 
 | ||||
| def deprecated(message, filter='always'): | ||||
|     """Show a deprecation warning. | ||||
| 
 | ||||
|     message (unicode): The message to display. | ||||
|     filter (unicode): Filter value. | ||||
|     """ | ||||
|     stack = inspect.stack()[-1] | ||||
|     with warnings.catch_warnings(): | ||||
|         warnings.simplefilter(filter, DeprecationWarning) | ||||
|         warnings.warn_explicit(message, DeprecationWarning, stack[1], stack[2]) | ||||
| 
 | ||||
| 
 | ||||
| def print_table(data, title=None): | ||||
|     """Print data in table format. | ||||
| 
 | ||||
|  |  | |||
|  | @ -14,6 +14,7 @@ from thinc.neural._classes.model import Model | |||
| 
 | ||||
| from .strings cimport StringStore, hash_string | ||||
| from .compat import basestring_, path2str | ||||
| from .errors import Errors | ||||
| from . import util | ||||
| 
 | ||||
| from cython.operator cimport dereference as deref | ||||
|  | @ -114,7 +115,7 @@ cdef class Vectors: | |||
|         """ | ||||
|         i = self.key2row[key] | ||||
|         if i is None: | ||||
|             raise KeyError(key) | ||||
|             raise KeyError(Errors.E058.format(key=key)) | ||||
|         else: | ||||
|             return self.data[i] | ||||
| 
 | ||||
|  | @ -215,7 +216,8 @@ cdef class Vectors: | |||
|         RETURNS: The requested key, keys, row or rows. | ||||
|         """ | ||||
|         if sum(arg is None for arg in (key, keys, row, rows)) != 3: | ||||
|             raise ValueError("One (and only one) keyword arg must be set.") | ||||
|             bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows} | ||||
|             raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) | ||||
|         xp = get_array_module(self.data) | ||||
|         if key is not None: | ||||
|             if isinstance(key, basestring_): | ||||
|  | @ -254,9 +256,9 @@ cdef class Vectors: | |||
|             row = self.key2row[key] | ||||
|         elif row is None: | ||||
|             if self.is_full: | ||||
|                 raise ValueError("Cannot add new key to vectors -- full") | ||||
|                 raise ValueError(Errors.E060.format(rows=self.data.shape[0], | ||||
|                                                     cols=self.data.shape[1])) | ||||
|             row = deref(self._unset.begin()) | ||||
| 
 | ||||
|         self.key2row[key] = row | ||||
|         if vector is not None: | ||||
|             self.data[row] = vector | ||||
|  | @ -318,7 +320,7 @@ cdef class Vectors: | |||
|                 width = int(dims) | ||||
|                 break | ||||
|         else: | ||||
|             raise IOError("Expected file named e.g. vectors.128.f.bin") | ||||
|             raise IOError(Errors.E061.format(filename=path)) | ||||
|         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, | ||||
|                                                              dtype=dtype) | ||||
|         xp = get_array_module(self.data) | ||||
|  |  | |||
|  | @ -16,6 +16,7 @@ from .attrs cimport PROB, LANG, ORTH, TAG | |||
| from .structs cimport SerializedLexemeC | ||||
| 
 | ||||
| from .compat import copy_reg, basestring_ | ||||
| from .errors import Errors | ||||
| from .lemmatizer import Lemmatizer | ||||
| from .attrs import intify_attrs | ||||
| from .vectors import Vectors | ||||
|  | @ -100,15 +101,9 @@ cdef class Vocab: | |||
|                     flag_id = bit | ||||
|                     break | ||||
|             else: | ||||
|                 raise ValueError( | ||||
|                     "Cannot find empty bit for new lexical flag. All bits " | ||||
|                     "between 0 and 63 are occupied. You can replace one by " | ||||
|                     "specifying the flag_id explicitly, e.g. " | ||||
|                     "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") | ||||
|                 raise ValueError(Errors.E062) | ||||
|         elif flag_id >= 64 or flag_id < 1: | ||||
|             raise ValueError( | ||||
|                 "Invalid value for flag_id: %d. Flag IDs must be between " | ||||
|                 "1 and 63 (inclusive)" % flag_id) | ||||
|             raise ValueError(Errors.E063.format(value=flag_id)) | ||||
|         for lex in self: | ||||
|             lex.set_flag(flag_id, flag_getter(lex.orth_)) | ||||
|         self.lex_attr_getters[flag_id] = flag_getter | ||||
|  | @ -127,8 +122,9 @@ cdef class Vocab: | |||
|         cdef size_t addr | ||||
|         if lex != NULL: | ||||
|             if lex.orth != self.strings[string]: | ||||
|                 raise LookupError.mismatched_strings( | ||||
|                     lex.orth, self.strings[string], string) | ||||
|                 raise KeyError(Errors.E064.format(string=lex.orth, | ||||
|                                                   orth=self.strings[string], | ||||
|                                                   orth_id=string)) | ||||
|             return lex | ||||
|         else: | ||||
|             return self._new_lexeme(mem, string) | ||||
|  | @ -171,7 +167,8 @@ cdef class Vocab: | |||
|         if not is_oov: | ||||
|             key = hash_string(string) | ||||
|             self._add_lex_to_vocab(key, lex) | ||||
|         assert lex != NULL, string | ||||
|         if lex == NULL: | ||||
|             raise ValueError(Errors.E085.format(string=string)) | ||||
|         return lex | ||||
| 
 | ||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: | ||||
|  | @ -254,7 +251,7 @@ cdef class Vocab: | |||
|         width, you have to call this to change the size of the vectors. | ||||
|         """ | ||||
|         if width is not None and shape is not None: | ||||
|             raise ValueError("Only one of width and shape can be specified") | ||||
|             raise ValueError(Errors.E065.format(width=width, shape=shape)) | ||||
|         elif shape is not None: | ||||
|             self.vectors = Vectors(shape=shape) | ||||
|         else: | ||||
|  | @ -471,7 +468,10 @@ cdef class Vocab: | |||
|             if ptr == NULL: | ||||
|                 continue | ||||
|             py_str = self.strings[lexeme.orth] | ||||
|             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth) | ||||
|             if self.strings[py_str] != lexeme.orth: | ||||
|                 raise ValueError(Errors.E086.format(string=py_str, | ||||
|                                                     orth_id=lexeme.orth, | ||||
|                                                     hash_id=self.strings[py_str])) | ||||
|             key = hash_string(py_str) | ||||
|             self._by_hash.set(key, lexeme) | ||||
|             self._by_orth.set(lexeme.orth, lexeme) | ||||
|  | @ -512,16 +512,3 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, | |||
| 
 | ||||
| 
 | ||||
| copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) | ||||
| 
 | ||||
| 
 | ||||
| class LookupError(Exception): | ||||
|     @classmethod | ||||
|     def mismatched_strings(cls, id_, id_string, original_string): | ||||
|         return cls( | ||||
|             "Error fetching a Lexeme from the Vocab. When looking up a " | ||||
|             "string, the lexeme returned had an orth ID that did not match " | ||||
|             "the query string. This means that the cached lexeme structs are " | ||||
|             "mismatched to the string encoding table. The mismatched:\n" | ||||
|             "Query string: {}\n" | ||||
|             "Orth cached: {}\n" | ||||
|             "Orth ID: {}".format(repr(original_string), repr(id_string), id_)) | ||||
|  |  | |||