💫 New system for error messages and warnings (#2163)

* Add spacy.errors module
* Update deprecation and user warnings
* Replace errors and asserts with new error message system
* Remove redundant asserts
* Fix whitespace
* Add messages for print/util.prints statements
* Fix typo
* Fix typos
* Move CLI messages to spacy.cli._messages
* Add decorator to display error code with message. An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.
* Remove unused link in spacy.about
* Update errors for invalid pipeline components
* Improve error for unknown factories
* Add displaCy warnings
* Update formatting consistency
* Move error message to spacy.errors
* Update errors and check if doc returned by component is None
This commit is contained in:
parent abf8b16d71
commit 3141e04822
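
The error-code decorator mentioned in the commit message is the add_codes helper added in spacy/errors.py (new file, shown further down in this diff). As a minimal, self-contained sketch of the pattern, using the E001 template from the diff; the call-site values ('ner' and the options list) are made up for illustration:

def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            # The prefixed string is only built when the attribute is looked
            # up, so the raw templates on the class stay untouched.
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")


# Hypothetical call site: fill the template with str.format, the code prefix
# comes for free at lookup time.
raise ValueError(Errors.E001.format(name='ner', opts=['tagger', 'parser']))
# ValueError: [E001] No component 'ner' found in pipeline. Available names: ['tagger', 'parser']

Because the prefixing happens in __getattribute__, call sites never write the [E...] code by hand, and no traceback manipulation is needed to inject it.
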
@@ -4,18 +4,14 @@ from __future__ import unicode_literals
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
+from .errors import Warnings, deprecation_warning
 from . import util


 def load(name, **overrides):
     depr_path = overrides.get('path')
     if depr_path not in (True, False, None):
-        util.deprecated(
-            "As of spaCy v2.0, the keyword argument `path=` is deprecated. "
-            "You can now call spacy.load with the path as its first argument, "
-            "and the model's meta.json will be used to determine the language "
-            "to load. For example:\nnlp = spacy.load('{}')".format(depr_path),
-            'error')
+        deprecation_warning(Warnings.W001.format(path=depr_path))
     return util.load_model(name, **overrides)

@@ -23,6 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
 import thinc.extra.load_nlp

 from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
+from .errors import Errors
 from . import util

@@ -340,10 +341,10 @@ def _divide_array(X, size):


 def get_col(idx):
-    assert idx >= 0, idx
+    if idx < 0:
+        raise IndexError(Errors.E066.format(value=idx))

     def forward(X, drop=0.):
-        assert idx >= 0, idx
         if isinstance(X, numpy.ndarray):
             ops = NumpyOps()
         else:
@@ -351,7 +352,6 @@ def get_col(idx):
         output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)

         def backward(y, sgd=None):
-            assert idx >= 0, idx
             dX = ops.allocate(X.shape)
             dX[:, idx] += y
             return dX

@@ -11,7 +11,6 @@ __email__ = 'contact@explosion.ai'
 __license__ = 'MIT'
 __release__ = True

-__docs_models__ = 'https://spacy.io/usage/models'
 __download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
 __compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
 __shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'

spacy/cli/_messages.py (new file, 73 lines)
# coding: utf8
from __future__ import unicode_literals


class Messages(object):
    M001 = ("Download successful but linking failed")
    M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
            "don't have admin permissions?), but you can still load the "
            "model via its full package name: nlp = spacy.load('{name}')")
    M003 = ("Server error ({code}: {desc})")
    M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy "
            "installation (v{version}), and download it manually. For more "
            "details, see the documentation: https://spacy.io/usage/models")
    M005 = ("Compatibility error")
    M006 = ("No compatible models found for v{version} of spaCy.")
    M007 = ("No compatible model found for '{name}' (spaCy v{version}).")
    M008 = ("Can't locate model data")
    M009 = ("The data should be located in {path}")
    M010 = ("Can't find the spaCy data path to create model symlink")
    M011 = ("Make sure a directory `/data` exists within your spaCy "
            "installation and try again. The data directory should be "
            "located here:")
    M012 = ("Link '{name}' already exists")
    M013 = ("To overwrite an existing link, use the --force flag.")
    M014 = ("Can't overwrite symlink '{name}'")
    M015 = ("This can happen if your data directory contains a directory or "
            "file of the same name.")
    M016 = ("Error: Couldn't link model to '{name}'")
    M017 = ("Creating a symlink in spacy/data failed. Make sure you have the "
            "required permissions and try re-running the command as admin, or "
            "use a virtualenv. You can still import the model as a module and "
            "call its load() method, or create the symlink manually.")
    M018 = ("Linking successful")
    M019 = ("You can now load the model via spacy.load('{name}')")
    M020 = ("Can't find model meta.json")
    M021 = ("Couldn't fetch compatibility table.")
    M022 = ("Can't find spaCy v{version} in compatibility table")
    M023 = ("Installed models (spaCy v{version})")
    M024 = ("No models found in your current environment.")
    M025 = ("Use the following commands to update the model packages:")
    M026 = ("The following models are not available for spaCy "
            "v{version}: {models}")
    M027 = ("You may also want to overwrite the incompatible links using the "
            "`python -m spacy link` command with `--force`, or remove them "
            "from the data directory. Data path: {path}")
    M028 = ("Input file not found")
    M029 = ("Output directory not found")
    M030 = ("Unknown format")
    M031 = ("Can't find converter for {converter}")
    M032 = ("Generated output file {name}")
    M033 = ("Created {n_docs} documents")
    M034 = ("Evaluation data not found")
    M035 = ("Visualization output directory not found")
    M036 = ("Generated {n} parses as HTML")
    M037 = ("Can't find words frequencies file")
    M038 = ("Sucessfully compiled vocab")
    M039 = ("{entries} entries, {vectors} vectors")
    M040 = ("Output directory not found")
    M041 = ("Loaded meta.json from file")
    M042 = ("Successfully created package '{name}'")
    M043 = ("To build the package, run `python setup.py sdist` in this "
            "directory.")
    M044 = ("Package directory already exists")
    M045 = ("Please delete the directory and try again, or use the `--force` "
            "flag to overwrite existing directories.")
    M046 = ("Generating meta.json")
    M047 = ("Enter the package settings for your model. The following "
            "information will be read from your model data: pipeline, vectors.")
    M048 = ("No '{key}' setting found in meta.json")
    M049 = ("This setting is required to build your package.")
    M050 = ("Training data not found")
    M051 = ("Development data not found")
    M052 = ("Not a valid meta.json format")
    M053 = ("Expected dict but got: {meta_type}")

@@ -5,6 +5,7 @@ import plac
 from pathlib import Path

 from .converters import conllu2json, iob2json, conll_ner2json
+from ._messages import Messages
 from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new
@@ -32,14 +33,14 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto
     input_path = Path(input_file)
     output_path = Path(output_dir)
     if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=1)
+        prints(input_path, title=Messages.M028, exits=1)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=1)
+        prints(output_path, title=Messages.M029, exits=1)
     if converter == 'auto':
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-            prints("Can't find converter for %s" % converter,
-                title="Unknown format", exits=1)
+            prints(Messages.M031.format(converter=converter),
+                   title=Messages.M030, exits=1)
     func = CONVERTERS[converter]
     func(input_path, output_path,
          n_sents=n_sents, use_morphology=morphology)

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .._messages import Messages
 from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo
@@ -18,8 +19,8 @@ def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    prints(Messages.M033.format(n_docs=len(docs)),
+           title=Messages.M032.format(name=path2str(output_file)))


 def read_conll_ner(input_path):

@@ -1,6 +1,7 @@
 # coding: utf8
 from __future__ import unicode_literals

+from .._messages import Messages
 from ...compat import json_dumps, path2str
 from ...util import prints

@@ -32,8 +33,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    prints(Messages.M033.format(n_docs=len(docs)),
+           title=Messages.M032.format(name=path2str(output_file)))


 def read_conllx(input_path, use_morphology=False, n=0):

@@ -2,6 +2,7 @@
 from __future__ import unicode_literals
 from cytoolz import partition_all, concat

+from .._messages import Messages
 from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo
@@ -18,8 +19,8 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
-    prints("Created %d documents" % len(docs),
-           title="Generated output file %s" % path2str(output_file))
+    prints(Messages.M033.format(n_docs=len(docs)),
+           title=Messages.M032.format(name=path2str(output_file)))


 def read_iob(raw_sents):

@@ -8,6 +8,7 @@ import sys
 import ujson

 from .link import link
+from ._messages import Messages
 from ..util import prints, get_package_path
 from ..compat import url_read, HTTPError
 from .. import about
@@ -32,9 +33,7 @@ def download(model, direct=False):
         version = get_version(model_name, compatibility)
         dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
                                                             v=version))
-        if dl != 0:
-            # if download subprocess doesn't return 0, exit with the respective
-            # exit code before doing anything else
+        if dl != 0:  # if download subprocess doesn't return 0, exit
             sys.exit(dl)
         try:
             # Get package path here because link uses
@@ -48,22 +47,15 @@
             # Dirty, but since spacy.download and the auto-linking is
             # mostly a convenience wrapper, it's best to show a success
             # message and loading instructions, even if linking fails.
-            prints(
-                "Creating a shortcut link for 'en' didn't work (maybe "
-                "you don't have admin permissions?), but you can still "
-                "load the model via its full package name:",
-                "nlp = spacy.load('%s')" % model_name,
-                title="Download successful but linking failed")
+            prints(Messages.M001.format(name=model_name), title=Messages.M002)


 def get_json(url, desc):
     try:
         data = url_read(url)
     except HTTPError as e:
-        msg = ("Couldn't fetch %s. Please find a model for your spaCy "
-               "installation (v%s), and download it manually.")
-        prints(msg % (desc, about.__version__), about.__docs_models__,
-               title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
+        prints(Messages.M004.format(desc, about.__version__),
+               title=Messages.M003.format(e.code, e.reason), exits=1)
     return ujson.loads(data)
@@ -73,17 +65,16 @@ def get_compatibility():
     comp_table = get_json(about.__compatibility__, "compatibility table")
     comp = comp_table['spacy']
     if version not in comp:
-        prints("No compatible models found for v%s of spaCy." % version,
-               title="Compatibility error", exits=1)
+        prints(Messages.M006.format(version=version), title=Messages.M005,
+               exits=1)
     return comp[version]


 def get_version(model, comp):
     model = model.rsplit('.dev', 1)[0]
     if model not in comp:
-        version = about.__version__
-        msg = "No compatible model found for '%s' (spaCy v%s)."
-        prints(msg % (model, version), title="Compatibility error", exits=1)
+        prints(Messages.M007.format(name=model, version=about.__version__),
+               title=Messages.M005, exits=1)
     return comp[model][0]

@@ -4,6 +4,7 @@ from __future__ import unicode_literals, division, print_function
 import plac
 from timeit import default_timer as timer

+from ._messages import Messages
 from ..gold import GoldCorpus
 from ..util import prints
 from .. import util
@@ -33,10 +34,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
     data_path = util.ensure_path(data_path)
     displacy_path = util.ensure_path(displacy_path)
     if not data_path.exists():
-        prints(data_path, title="Evaluation data not found", exits=1)
+        prints(data_path, title=Messages.M034, exits=1)
     if displacy_path and not displacy_path.exists():
-        prints(displacy_path, title="Visualization output directory not found",
-               exits=1)
+        prints(displacy_path, title=Messages.M035, exits=1)
     corpus = GoldCorpus(data_path, data_path)
     nlp = util.load_model(model)
     dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
@@ -52,8 +52,7 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
         render_ents = 'ner' in nlp.meta.get('pipeline', [])
         render_parses(docs, displacy_path, model_name=model,
                       limit=displacy_limit, deps=render_deps, ents=render_ents)
-        msg = "Generated %s parses as HTML" % displacy_limit
-        prints(displacy_path, title=msg)
+        prints(displacy_path, title=Messages.M036.format(n=displacy_limit))


 def render_parses(docs, output_path, model_name='', limit=250, deps=True,

@@ -5,9 +5,10 @@ import plac
 import platform
 from pathlib import Path

+from ._messages import Messages
 from ..compat import path2str
-from .. import about
 from .. import util
+from .. import about


 @plac.annotations(
@@ -25,7 +26,7 @@ def info(model=None, markdown=False):
             model_path = util.get_data_path() / model
         meta_path = model_path / 'meta.json'
         if not meta_path.is_file():
-            util.prints(meta_path, title="Can't find model meta.json", exits=1)
+            util.prints(meta_path, title=Messages.M020, exits=1)
         meta = util.read_json(meta_path)
         if model_path.resolve() != model_path:
             meta['link'] = path2str(model_path)

@@ -11,7 +11,9 @@ from preshed.counter import PreshCounter
 import tarfile
 import gzip

+from ._messages import Messages
 from ..vectors import Vectors
+from ..errors import Warnings, user_warning
 from ..util import prints, ensure_path, get_lang_class

 try:
@@ -37,16 +39,13 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=
     and word vectors.
     """
     if freqs_loc is not None and not freqs_loc.exists():
-        prints(freqs_loc, title="Can't find words frequencies file", exits=1)
+        prints(freqs_loc, title=Messages.M037, exits=1)
     clusters_loc = ensure_path(clusters_loc)
     vectors_loc = ensure_path(vectors_loc)
-
     probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
     vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
     clusters = read_clusters(clusters_loc) if clusters_loc else {}
-
     nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
-
     if not output_dir.exists():
         output_dir.mkdir()
     nlp.to_disk(output_dir)
@@ -69,7 +68,6 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
     nlp = lang_class()
     for lexeme in nlp.vocab:
         lexeme.rank = 0
-
     lex_added = 0
     for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
         lexeme = nlp.vocab[word]
@@ -89,15 +87,13 @@
             lexeme = nlp.vocab[word]
             lexeme.is_oov = False
             lex_added += 1
-
     if len(vectors_data):
         nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
     if prune_vectors >= 1:
         nlp.vocab.prune_vectors(prune_vectors)
     vec_added = len(nlp.vocab.vectors)
-
-    prints("{} entries, {} vectors".format(lex_added, vec_added),
-           title="Sucessfully compiled vocab")
+    prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
+           title=Messages.M038)
     return nlp
@@ -145,7 +141,7 @@ def read_clusters(clusters_loc):
     print("Reading clusters...")
     clusters = {}
     if ftfy is None:
-        print("Warning: No text fixing. Run pip install ftfy if necessary")
+        user_warning(Warnings.W004)
     with clusters_loc.open() as f:
         for line in tqdm(f):
             try:

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 import plac
 from pathlib import Path

+from ._messages import Messages
 from ..compat import symlink_to, path2str
 from ..util import prints
 from .. import util
@@ -24,40 +25,29 @@ def link(origin, link_name, force=False, model_path=None):
     else:
         model_path = Path(origin) if model_path is None else Path(model_path)
     if not model_path.exists():
-        prints("The data should be located in %s" % path2str(model_path),
-               title="Can't locate model data", exits=1)
+        prints(Messages.M009.format(path=path2str(model_path)),
+               title=Messages.M008, exits=1)
     data_path = util.get_data_path()
     if not data_path or not data_path.exists():
         spacy_loc = Path(__file__).parent.parent
-        prints("Make sure a directory `/data` exists within your spaCy "
-               "installation and try again. The data directory should be "
-               "located here:", path2str(spacy_loc), exits=1,
-               title="Can't find the spaCy data path to create model symlink")
+        prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
     link_path = util.get_data_path() / link_name
     if link_path.is_symlink() and not force:
-        prints("To overwrite an existing link, use the --force flag.",
-               title="Link %s already exists" % link_name, exits=1)
+        prints(Messages.M013, title=Messages.M012.format(name=link_name),
+               exits=1)
     elif link_path.is_symlink():  # does a symlink exist?
         # NB: It's important to check for is_symlink here and not for exists,
         # because invalid/outdated symlinks would return False otherwise.
         link_path.unlink()
     elif link_path.exists(): # does it exist otherwise?
         # NB: Check this last because valid symlinks also "exist".
-        prints("This can happen if your data directory contains a directory "
-               "or file of the same name.", link_path,
-               title="Can't overwrite symlink %s" % link_name, exits=1)
+        prints(Messages.M015, link_path,
+               title=Messages.M014.format(name=link_name), exits=1)
+    msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
     try:
         symlink_to(link_path, model_path)
     except:
         # This is quite dirty, but just making sure other errors are caught.
-        prints("Creating a symlink in spacy/data failed. Make sure you have "
-               "the required permissions and try re-running the command as "
-               "admin, or use a virtualenv. You can still import the model as "
-               "a module and call its load() method, or create the symlink "
-               "manually.",
-               "%s --> %s" % (path2str(model_path), path2str(link_path)),
-               title="Error: Couldn't link model to '%s'" % link_name)
+        prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
         raise
-    prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
-           "You can now load the model via spacy.load('%s')" % link_name,
-           title="Linking successful")
+    prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)

@@ -5,6 +5,7 @@ import plac
 import shutil
 from pathlib import Path

+from ._messages import Messages
 from ..compat import path2str, json_dumps
 from ..util import prints
 from .. import util
@@ -31,17 +32,17 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
     output_path = util.ensure_path(output_dir)
     meta_path = util.ensure_path(meta_path)
     if not input_path or not input_path.exists():
-        prints(input_path, title="Model directory not found", exits=1)
+        prints(input_path, title=Messages.M008, exits=1)
     if not output_path or not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=1)
+        prints(output_path, title=Messages.M040, exits=1)
     if meta_path and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=1)
+        prints(meta_path, title=Messages.M020, exits=1)

     meta_path = meta_path or input_path / 'meta.json'
     if meta_path.is_file():
         meta = util.read_json(meta_path)
         if not create_meta:  # only print this if user doesn't want to overwrite
-            prints(meta_path, title="Loaded meta.json from file")
+            prints(meta_path, title=Messages.M041)
         else:
             meta = generate_meta(input_dir, meta)
     meta = validate_meta(meta, ['lang', 'name', 'version'])
@@ -57,9 +58,8 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
     create_file(main_path / 'setup.py', TEMPLATE_SETUP)
     create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
     create_file(package_path / '__init__.py', TEMPLATE_INIT)
-    prints(main_path, "To build the package, run `python setup.py sdist` in "
-           "this directory.",
-           title="Successfully created package '%s'" % model_name_v)
+    prints(main_path, Messages.M043,
+           title=Messages.M042.format(name=model_name_v))


 def create_dirs(package_path, force):
@@ -67,10 +67,7 @@ def create_dirs(package_path, force):
         if force:
             shutil.rmtree(path2str(package_path))
         else:
-            prints(package_path, "Please delete the directory and try again, "
-                   "or use the --force flag to overwrite existing "
-                   "directories.", title="Package directory already exists",
-                   exits=1)
+            prints(package_path, Messages.M045, title=Messages.M044, exits=1)
     Path.mkdir(package_path, parents=True)
@@ -97,9 +94,7 @@ def generate_meta(model_path, existing_meta):
     meta['vectors'] = {'width': nlp.vocab.vectors_length,
                        'vectors': len(nlp.vocab.vectors),
                        'keys': nlp.vocab.vectors.n_keys}
-    prints("Enter the package settings for your model. The following "
-           "information will be read from your model data: pipeline, vectors.",
-           title="Generating meta.json")
+    prints(Messages.M047, title=Messages.M046)
     for setting, desc, default in settings:
         response = util.get_raw_input(desc, default)
         meta[setting] = default if response == '' and default else response
@@ -111,8 +106,7 @@
 def validate_meta(meta, keys):
     for key in keys:
         if key not in meta or meta[key] == '':
-            prints("This setting is required to build your package.",
-                   title='No "%s" setting found in meta.json' % key, exits=1)
+            prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
     return meta

@@ -7,6 +7,7 @@ import tqdm
 from thinc.neural._classes.model import Model
 from timeit import default_timer as timer

+from ._messages import Messages
 from ..attrs import PROB, IS_OOV, CLUSTER, LANG
 from ..gold import GoldCorpus, minibatch
 from ..util import prints
@@ -54,15 +55,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
     if not output_path.exists():
         output_path.mkdir()
     if not train_path.exists():
-        prints(train_path, title="Training data not found", exits=1)
+        prints(train_path, title=Messages.M050, exits=1)
     if dev_path and not dev_path.exists():
-        prints(dev_path, title="Development data not found", exits=1)
+        prints(dev_path, title=Messages.M051, exits=1)
     if meta_path is not None and not meta_path.exists():
-        prints(meta_path, title="meta.json not found", exits=1)
+        prints(meta_path, title=Messages.M020, exits=1)
     meta = util.read_json(meta_path) if meta_path else {}
     if not isinstance(meta, dict):
-        prints("Expected dict but got: {}".format(type(meta)),
-               title="Not a valid meta.json format", exits=1)
+        prints(Messages.M053.format(meta_type=type(meta)),
+               title=Messages.M052, exits=1)
     meta.setdefault('lang', lang)
     meta.setdefault('name', 'unnamed')

@@ -6,6 +6,7 @@ from pathlib import Path
 import sys
 import ujson

+from ._messages import Messages
 from ..compat import path2str, locale_escape, url_read, HTTPError
 from ..util import prints, get_data_path, read_json
 from .. import about
@@ -18,14 +19,13 @@ def validate():
     try:
         data = url_read(about.__compatibility__)
     except HTTPError as e:
-        prints("Couldn't fetch compatibility table.",
-               title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
+        title = Messages.M003.format(code=e.code, desc=e.reason)
+        prints(Messages.M021, title=title, exits=1)
     compat = ujson.loads(data)['spacy']
     current_compat = compat.get(about.__version__)
     if not current_compat:
         prints(about.__compatibility__, exits=1,
-               title="Can't find spaCy v{} in compatibility table"
-               .format(about.__version__))
+               title=Messages.M022.format(version=about.__version__))
     all_models = set()
     for spacy_v, models in dict(compat).items():
         all_models.update(models.keys())
@@ -42,7 +42,7 @@ def validate():
     update_models = [m for m in incompat_models if m in current_compat]

     prints(path2str(Path(__file__).parent.parent),
-           title="Installed models (spaCy v{})".format(about.__version__))
+           title=Messages.M023.format(version=about.__version__))
     if model_links or model_pkgs:
         print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
         for name, data in model_pkgs.items():
         for name, data in model_links.items():
             print(get_model_row(current_compat, name, data, 'link'))
     else:
-        prints("No models found in your current environment.", exits=0)
-
+        prints(Messages.M024, exits=0)
     if update_models:
         cmd = '    python -m spacy download {}'
-        print("\n    Use the following commands to update the model packages:")
+        print("\n    " + Messages.M025)
         print('\n'.join([cmd.format(pkg) for pkg in update_models]))
-
     if na_models:
-        prints("The following models are not available for spaCy v{}: {}"
-               .format(about.__version__, ', '.join(na_models)))
-
+        prints(Messages.M026.format(version=about.__version__,
+                                    models=', '.join(na_models)))
     if incompat_links:
-        prints("You may also want to overwrite the incompatible links using "
-               "the `python -m spacy link` command with `--force`, or remove "
-               "them from the data directory. Data path: {}"
-               .format(path2str(get_data_path())))
-
+        prints(Messages.M027.format(path=path2str(get_data_path())))
     if incompat_models or incompat_links:
         sys.exit(1)

@@ -4,6 +4,7 @@ from __future__ import unicode_literals
 from .render import DependencyRenderer, EntityRenderer
 from ..tokens import Doc
 from ..compat import b_to_str
+from ..errors import Errors, Warnings, user_warning
 from ..util import prints, is_in_jupyter
@@ -27,7 +28,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
     factories = {'dep': (DependencyRenderer, parse_deps),
                  'ent': (EntityRenderer, parse_ents)}
     if style not in factories:
-        raise ValueError("Unknown style: %s" % style)
+        raise ValueError(Errors.E087.format(style=style))
     if isinstance(docs, Doc) or isinstance(docs, dict):
         docs = [docs]
     renderer, converter = factories[style]
@@ -57,12 +58,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
     render(docs, style=style, page=page, minify=minify, options=options,
            manual=manual)
     httpd = simple_server.make_server('0.0.0.0', port, app)
-    prints("Using the '%s' visualizer" % style,
-           title="Serving on port %d..." % port)
+    prints("Using the '{}' visualizer".format(style),
+           title="Serving on port {}...".format(port))
     try:
         httpd.serve_forever()
     except KeyboardInterrupt:
-        prints("Shutting down server on port %d." % port)
+        prints("Shutting down server on port {}.".format(port))
     finally:
         httpd.server_close()
@@ -83,6 +84,8 @@
     RETURNS (dict): Generated dependency parse keyed by words and arcs.
     """
     doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
+    if not doc.is_parsed:
+        user_warning(Warnings.W005)
     if options.get('collapse_punct', True):
         spans = []
         for word in doc[:-1]:
@@ -120,6 +123,8 @@
     """
     ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
            for ent in doc.ents]
+    if not ents:
+        user_warning(Warnings.W006)
     title = (doc.user_data.get('title', None)
             if hasattr(doc, 'user_data') else None)
     return {'text': doc.text, 'ents': ents, 'title': title}

spacy/errors.py (new file, 297 lines)
|  | # coding: utf8 | ||||||
|  | from __future__ import unicode_literals | ||||||
|  | 
 | ||||||
|  | import os | ||||||
|  | import warnings | ||||||
|  | import inspect | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def add_codes(err_cls): | ||||||
|  |     """Add error codes to string messages via class attribute names.""" | ||||||
|  |     class ErrorsWithCodes(object): | ||||||
|  |         def __getattribute__(self, code): | ||||||
|  |             msg = getattr(err_cls, code) | ||||||
|  |             return '[{code}] {msg}'.format(code=code, msg=msg) | ||||||
|  |     return ErrorsWithCodes() | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @add_codes | ||||||
|  | class Warnings(object): | ||||||
|  |     W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. " | ||||||
|  |             "You can now call spacy.load with the path as its first argument, " | ||||||
|  |             "and the model's meta.json will be used to determine the language " | ||||||
|  |             "to load. For example:\nnlp = spacy.load('{path}')") | ||||||
|  |     W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object " | ||||||
|  |             "instead and pass in the strings as the `words` keyword argument, " | ||||||
|  |             "for example:\nfrom spacy.tokens import Doc\n" | ||||||
|  |             "doc = Doc(nlp.vocab, words=[...])") | ||||||
|  |     W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use " | ||||||
|  |             "the keyword arguments, for example tag=, lemma= or ent_type=.") | ||||||
|  |     W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing " | ||||||
|  |             "using ftfy.fix_text if necessary.") | ||||||
|  |     W005 = ("Doc object not parsed. This means displaCy won't be able to " | ||||||
|  |             "generate a dependency visualization for it. Make sure the Doc " | ||||||
|  |             "was processed with a model that supports dependency parsing, and " | ||||||
|  |             "not just a language class like `English()`. For more info, see " | ||||||
|  |             "the docs:\nhttps://spacy.io/usage/models") | ||||||
|  |     W006 = ("No entities to visualize found in Doc object. If this is " | ||||||
|  |             "surprising to you, make sure the Doc was processed using a model " | ||||||
|  |             "that supports named entity recognition, and check the `doc.ents` " | ||||||
|  |             "property manually if necessary.") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @add_codes | ||||||
|  | class Errors(object): | ||||||
|  |     E001 = ("No component '{name}' found in pipeline. Available names: {opts}") | ||||||
|  |     E002 = ("Can't find factory for '{name}'. This usually happens when spaCy " | ||||||
|  |             "calls `nlp.create_pipe` with a component name that's not built " | ||||||
|  |             "in - for example, when constructing the pipeline from a model's " | ||||||
|  |             "meta.json. If you're using a custom component, you can write to " | ||||||
|  |             "`Language.factories['{name}']` or remove it from the model meta " | ||||||
|  |             "and add it via `nlp.add_pipe` instead.") | ||||||
|  |     E003 = ("Not a valid pipeline component. Expected callable, but " | ||||||
|  |             "got {component} (name: '{name}').") | ||||||
|  |     E004 = ("If you meant to add a built-in component, use `create_pipe`: " | ||||||
|  |             "`nlp.add_pipe(nlp.create_pipe('{component}'))`") | ||||||
|  |     E005 = ("Pipeline component '{name}' returned None. If you're using a " | ||||||
|  |             "custom component, maybe you forgot to return the processed Doc?") | ||||||
|  |     E006 = ("Invalid constraints. You can only set one of the following: " | ||||||
|  |             "before, after, first, last.") | ||||||
|  |     E007 = ("'{name}' already exists in pipeline. Existing names: {opts}") | ||||||
|  |     E008 = ("Some current components would be lost when restoring previous " | ||||||
|  |             "pipeline state. If you added components after calling " | ||||||
|  |             "`nlp.disable_pipes()`, you should remove them explicitly with " | ||||||
|  |             "`nlp.remove_pipe()` before the pipeline is restored. Names of " | ||||||
|  |             "the new components: {names}") | ||||||
|  |     E009 = ("The `update` method expects same number of docs and golds, but " | ||||||
|  |             "got: {n_docs} docs, {n_golds} golds.") | ||||||
|  |     E010 = ("Word vectors set to length 0. This may be because you don't have " | ||||||
|  |             "a model installed or loaded, or because your model doesn't " | ||||||
|  |             "include word vectors. For more info, see the docs:\n" | ||||||
|  |             "https://spacy.io/usage/models") | ||||||
|  |     E011 = ("Unknown operator: '{op}'. Options: {opts}") | ||||||
|  |     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}") | ||||||
|  |     E013 = ("Error selecting action in matcher") | ||||||
|  |     E014 = ("Uknown tag ID: {tag}") | ||||||
|  |     E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use " | ||||||
|  |             "`force=True` to overwrite.") | ||||||
|  |     E016 = ("MultitaskObjective target should be function or one of: dep, " | ||||||
|  |             "tag, ent, dep_tag_offset, ent_tag.") | ||||||
|  |     E017 = ("Can only add unicode or bytes. Got type: {value_type}") | ||||||
|  |     E018 = ("Can't retrieve string for hash '{hash_value}'.") | ||||||
|  |     E019 = ("Can't create transition with unknown action ID: {action}. Action " | ||||||
|  |             "IDs are enumerated in spacy/syntax/{src}.pyx.") | ||||||
|  |     E020 = ("Could not find a gold-standard action to supervise the " | ||||||
|  |             "dependency parser. The tree is non-projective (i.e. it has " | ||||||
|  |             "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). " | ||||||
|  |             "The ArcEager transition system only supports projective trees. " | ||||||
|  |             "To learn non-projective representations, transform the data " | ||||||
|  |             "before training and after parsing. Either pass " | ||||||
|  |             "`make_projective=True` to the GoldParse class, or use " | ||||||
|  |             "spacy.syntax.nonproj.preprocess_training_data.") | ||||||
|  |     E021 = ("Could not find a gold-standard action to supervise the " | ||||||
|  |             "dependency parser. The GoldParse was projective. The transition " | ||||||
|  |             "system has {n_actions} actions. State at failure: {state}") | ||||||
|  |     E022 = ("Could not find a transition with the name '{name}' in the NER " | ||||||
|  |             "model.") | ||||||
|  |     E023 = ("Error cleaning up beam: The same state occurred twice at " | ||||||
|  |             "memory address {addr} and position {i}.") | ||||||
|  |     E024 = ("Could not find an optimal move to supervise the parser. Usually, " | ||||||
|  |             "this means the GoldParse was not correct. For example, are all " | ||||||
|  |             "labels added to the model?") | ||||||
|  |     E025 = ("String is too long: {length} characters. Max is 2**30.") | ||||||
|  |     E026 = ("Error accessing token at position {i}: out of bounds in Doc of " | ||||||
|  |             "length {length}.") | ||||||
|  |     E027 = ("Arguments 'words' and 'spaces' should be sequences of the same " | ||||||
|  |             "length, or 'spaces' should be left default at None. spaces " | ||||||
|  |             "should be a sequence of booleans, with True meaning that the " | ||||||
|  |             "word owns a ' ' character following it.") | ||||||
|  |     E028 = ("orths_and_spaces expects either a list of unicode strings or a " | ||||||
|  |             "list of (unicode, bool) tuples. Got bytes instance: {value}") | ||||||
|  |     E029 = ("noun_chunks requires the dependency parse, which requires a " | ||||||
|  |             "statistical model to be installed and loaded. For more info, see " | ||||||
|  |             "the documentation:\nhttps://spacy.io/usage/models") | ||||||
|  |     E030 = ("Sentence boundaries unset. You can add the 'sentencizer' " | ||||||
|  |             "component to the pipeline with: " | ||||||
|  |             "nlp.add_pipe(nlp.create_pipe('sentencizer')) " | ||||||
|  |             "Alternatively, add the dependency parser, or set sentence " | ||||||
|  |             "boundaries by setting doc[i].is_sent_start.") | ||||||
|  |     E031 = ("Invalid token: empty string ('') at position {i}.") | ||||||
|  |     E032 = ("Conflicting attributes specified in doc.from_array(): " | ||||||
|  |             "(HEAD, SENT_START). The HEAD attribute currently sets sentence " | ||||||
|  |             "boundaries implicitly, based on the tree structure. This means " | ||||||
|  |             "the HEAD attribute would potentially override the sentence " | ||||||
|  |             "boundaries set by SENT_START.") | ||||||
|  |     E033 = ("Cannot load into non-empty Doc of length {length}.") | ||||||
|  |     E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected " | ||||||
|  |             "either 3 arguments (deprecated), or 0 (use keyword arguments).\n" | ||||||
|  |             "Arguments supplied:\n{args}\nKeyword arguments:{kwargs}") | ||||||
|  |     E035 = ("Error creating span with start {start} and end {end} for Doc of " | ||||||
|  |             "length {length}.") | ||||||
|  |     E036 = ("Error calculating span: Can't find a token starting at character " | ||||||
|  |             "offset {start}.") | ||||||
|  |     E037 = ("Error calculating span: Can't find a token ending at character " | ||||||
|  |             "offset {end}.") | ||||||
|  |     E038 = ("Error finding sentence for span. Infinite loop detected.") | ||||||
|  |     E039 = ("Array bounds exceeded while searching for root word. This likely " | ||||||
|  |             "means the parse tree is in an invalid state. Please report this " | ||||||
|  |             "issue here: http://github.com/explosion/spaCy/issues") | ||||||
|  |     E040 = ("Attempt to access token at {i}, max length {max_length}.") | ||||||
|  |     E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?") | ||||||
|  |     E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.") | ||||||
|  |     E043 = ("Refusing to write to token.sent_start if its document is parsed, " | ||||||
|  |             "because this may cause inconsistent state.") | ||||||
|  |     E044 = ("Invalid value for token.sent_start: {value}. Must be one of: " | ||||||
|  |             "None, True, False") | ||||||
|  |     E045 = ("Possibly infinite loop encountered while looking for {attr}.") | ||||||
|  |     E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did " | ||||||
|  |             "you forget to call the `set_extension` method?") | ||||||
|  |     E047 = ("Can't assign a value to unregistered extension attribute " | ||||||
|  |             "'{name}'. Did you forget to call the `set_extension` method?") | ||||||
|  |     E048 = ("Can't import language {lang} from spacy.lang.") | ||||||
|  |     E049 = ("Can't find spaCy data directory: '{path}'. Check your " | ||||||
|  |             "installation and permissions, or use spacy.util.set_data_path " | ||||||
|  |             "to customise the location if necessary.") | ||||||
|  |     E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut " | ||||||
|  |             "link, a Python package or a valid path to a data directory.") | ||||||
|  |     E051 = ("Can't load '{name}'. If you're using a shortcut link, make sure " | ||||||
|  |             "it points to a valid package (not just a data directory).") | ||||||
|  |     E052 = ("Can't find model directory: {path}") | ||||||
|  |     E053 = ("Could not read meta.json from {path}") | ||||||
|  |     E054 = ("No valid '{setting}' setting found in model meta.json.") | ||||||
|  |     E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}") | ||||||
|  |     E056 = ("Invalid tokenizer exception: ORTH values combined don't match " | ||||||
|  |             "original string.\nKey: {key}\nOrths: {orths}") | ||||||
|  |     E057 = ("Stepped slices not supported in Span objects. Try: " | ||||||
|  |             "list(tokens)[start:stop:step] instead.") | ||||||
|  |     E058 = ("Could not retrieve vector for key {key}.") | ||||||
|  |     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}") | ||||||
|  |     E060 = ("Cannot add new key to vectors: the table is full. Current shape: " | ||||||
|  |             "({rows}, {cols}).") | ||||||
|  |     E061 = ("Bad file name: {filename}. Example of a valid file name: " | ||||||
|  |             "'vectors.128.f.bin'") | ||||||
|  |     E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 " | ||||||
|  |             "and 63 are occupied. You can replace one by specifying the " | ||||||
|  |             "`flag_id` explicitly, e.g. " | ||||||
|  |             "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") | ||||||
|  |     E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 " | ||||||
|  |             "and 63 (inclusive).") | ||||||
|  |     E064 = ("Error fetching a Lexeme from the Vocab. When looking up a " | ||||||
|  |             "string, the lexeme returned had an orth ID that did not match " | ||||||
|  |             "the query string. This means that the cached lexeme structs are " | ||||||
|  |             "mismatched to the string encoding table. The mismatched:\n" | ||||||
|  |             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}") | ||||||
|  |     E065 = ("Only one of the vector table's width and shape can be specified. " | ||||||
|  |             "Got width {width} and shape {shape}.") | ||||||
|  |     E066 = ("Error creating model helper for extracting columns. Can only " | ||||||
|  |             "extract columns by positive integer. Got: {value}.") | ||||||
|  |     E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside " | ||||||
|  |             "an entity) without a preceding 'B' (beginning of an entity). " | ||||||
|  |             "Tag sequence:\n{tags}") | ||||||
|  |     E068 = ("Invalid BILUO tag: '{tag}'.") | ||||||
|  |     E069 = ("Invalid gold-standard parse tree. Found cycle between word " | ||||||
|  |             "IDs: {cycle}") | ||||||
|  |     E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) " | ||||||
|  |             "does not align with number of annotations ({n_annots}).") | ||||||
|  |     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not " | ||||||
|  |             "match the one in the vocab ({vocab_orth}).") | ||||||
|  |     E072 = ("Error serializing lexeme: expected data length {length}, " | ||||||
|  |             "got {bad_length}.") | ||||||
|  |     E073 = ("Cannot assign vector of length {new_length}. Existing vectors " | ||||||
|  |             "are of length {length}. You can use `vocab.reset_vectors` to " | ||||||
|  |             "clear the existing vectors and resize the table.") | ||||||
|  |     E074 = ("Error interpreting compiled match pattern: patterns are expected " | ||||||
|  |             "to end with the attribute {attr}. Got: {bad_attr}.") | ||||||
|  |     E075 = ("Error accepting match: length ({length}) > maximum length " | ||||||
|  |             "({max_len}).") | ||||||
|  |     E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc " | ||||||
|  |             "has {words} words.") | ||||||
|  |     E077 = ("Error computing {value}: number of Docs ({n_docs}) does not " | ||||||
|  |             "equal number of GoldParse objects ({n_golds}) in batch.") | ||||||
|  |     E078 = ("Error computing score: number of words in Doc ({words_doc}) does " | ||||||
|  |             "not equal number of words in GoldParse ({words_gold}).") | ||||||
|  |     E079 = ("Error computing states in beam: number of predicted beams " | ||||||
|  |             "({pbeams}) does not equal number of gold beams ({gbeams}).") | ||||||
|  |     E080 = ("Duplicate state found in beam: {key}.") | ||||||
|  |     E081 = ("Error getting gradient in beam: number of histories ({n_hist}) " | ||||||
|  |             "does not equal number of losses ({losses}).") | ||||||
|  |     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), " | ||||||
|  |             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not " | ||||||
|  |             "match.") | ||||||
|  |     E083 = ("Error setting extension: only one of default, getter, setter and " | ||||||
|  |             "method is allowed. {n_args} keyword arguments were specified.") | ||||||
|  |     E084 = ("Error assigning label ID {label} to span: not in StringStore.") | ||||||
|  |     E085 = ("Can't create lexeme for string '{string}'.") | ||||||
|  |     E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does " | ||||||
|  |             "not match hash {hash_id} in StringStore.") | ||||||
|  |     E087 = ("Unknown displaCy style: {style}.") | ||||||
|  |     E088 = ("Text of length {length} exceeds maximum of {max_length}. The " | ||||||
|  |             "v2.x parser and NER models require roughly 1GB of temporary " | ||||||
|  |             "memory per 100,000 characters in the input. This means long " | ||||||
|  |             "texts may cause memory allocation errors. If you're not using " | ||||||
|  |             "the parser or NER, it's probably safe to increase the " | ||||||
|  |             "`nlp.max_length` limit. The limit is in number of characters, so " | ||||||
|  |             "you can check whether your inputs are too long by checking " | ||||||
|  |             "`len(text)`.") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | @add_codes | ||||||
|  | class TempErrors(object): | ||||||
|  |     T001 = ("Max length currently 10 for phrase matching") | ||||||
|  |     T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length " | ||||||
|  |             "({max_len}). Length can be set on initialization, up to 10.") | ||||||
|  |     T003 = ("Resizing pre-trained Tagger models is not currently supported.") | ||||||
|  |     T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.") | ||||||
|  |     T005 = ("Currently history size is hard-coded to 0. Received: {value}.") | ||||||
|  |     T006 = ("Currently history width is hard-coded to 0. Received: {value}.") | ||||||
|  |     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the " | ||||||
|  |             "issue tracker: http://github.com/explosion/spaCy/issues") | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | class ModelsWarning(UserWarning): | ||||||
|  |     pass | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | WARNINGS = { | ||||||
|  |     'user': UserWarning, | ||||||
|  |     'deprecation': DeprecationWarning, | ||||||
|  |     'models': ModelsWarning, | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _get_warn_types(arg): | ||||||
|  |     if arg == '':  # don't show any warnings | ||||||
|  |         return [] | ||||||
|  |     if not arg or arg == 'all':  # show all available warnings | ||||||
|  |         return WARNINGS.keys() | ||||||
|  |     return [w_type.strip() for w_type in arg.split(',') | ||||||
|  |             if w_type.strip() in WARNINGS] | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always') | ||||||
|  | SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES')) | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def user_warning(message): | ||||||
|  |     _warn(message, 'user') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def deprecation_warning(message): | ||||||
|  |     _warn(message, 'deprecation') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def models_warning(message): | ||||||
|  |     _warn(message, 'models') | ||||||
|  | 
 | ||||||
|  | 
 | ||||||
|  | def _warn(message, warn_type='user'): | ||||||
|  |     """ | ||||||
|  |     message (unicode): The message to display. | ||||||
|  |     warn_type (unicode): Type of warning to show ('user', 'deprecation' or 'models'). | ||||||
|  |     """ | ||||||
|  |     if warn_type in SPACY_WARNING_TYPES: | ||||||
|  |         category = WARNINGS[warn_type] | ||||||
|  |         stack = inspect.stack()[-1] | ||||||
|  |         with warnings.catch_warnings(): | ||||||
|  |             warnings.simplefilter(SPACY_WARNING_FILTER, category) | ||||||
|  |             warnings.warn_explicit(message, category, stack[1], stack[2]) | ||||||
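Which of these warnings actually surface, and how often, is controlled entirely by the two environment variables read at module import time above. A minimal usage sketch, assuming an installed spaCy checkout that includes this commit (the message strings are placeholders):

    import os

    # Only let deprecation and model warnings through, and show each one once.
    # SPACY_WARNING_TYPES is a comma-separated subset of 'user', 'deprecation'
    # and 'models' ('' silences everything; unset or 'all' enables everything).
    # SPACY_WARNING_FILTER takes the standard warnings-filter actions such as
    # 'always', 'once', 'ignore' or 'error'.
    os.environ['SPACY_WARNING_TYPES'] = 'deprecation,models'
    os.environ['SPACY_WARNING_FILTER'] = 'once'

    # The variables are read when spacy.errors is imported, so set them first.
    from spacy.errors import user_warning, models_warning

    user_warning("suppressed: 'user' is not in SPACY_WARNING_TYPES")
    models_warning("shown once as a ModelsWarning")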
|  | @ -10,6 +10,7 @@ import itertools | ||||||
| 
 | 
 | ||||||
| from .syntax import nonproj | from .syntax import nonproj | ||||||
| from .tokens import Doc | from .tokens import Doc | ||||||
|  | from .errors import Errors | ||||||
| from . import util | from . import util | ||||||
| from .util import minibatch | from .util import minibatch | ||||||
| 
 | 
 | ||||||
|  | @ -28,7 +29,8 @@ def tags_to_entities(tags): | ||||||
|         elif tag == '-': |         elif tag == '-': | ||||||
|             continue |             continue | ||||||
|         elif tag.startswith('I'): |         elif tag.startswith('I'): | ||||||
|             assert start is not None, tags[:i] |             if start is None: | ||||||
|  |                 raise ValueError(Errors.E067.format(tags=tags[:i])) | ||||||
|             continue |             continue | ||||||
|         if tag.startswith('U'): |         if tag.startswith('U'): | ||||||
|             entities.append((tag[2:], i, i)) |             entities.append((tag[2:], i, i)) | ||||||
|  | @ -38,7 +40,7 @@ def tags_to_entities(tags): | ||||||
|             entities.append((tag[2:], start, i)) |             entities.append((tag[2:], start, i)) | ||||||
|             start = None |             start = None | ||||||
|         else: |         else: | ||||||
|             raise Exception(tag) |             raise ValueError(Errors.E068.format(tag=tag)) | ||||||
|     return entities |     return entities | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -238,7 +240,9 @@ class GoldCorpus(object): | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|     def _make_golds(cls, docs, paragraph_tuples): |     def _make_golds(cls, docs, paragraph_tuples): | ||||||
|         assert len(docs) == len(paragraph_tuples) |         if len(docs) != len(paragraph_tuples): | ||||||
|  |             raise ValueError(Errors.E070.format(n_docs=len(docs), | ||||||
|  |                                                 n_annots=len(paragraph_tuples))) | ||||||
|         if len(docs) == 1: |         if len(docs) == 1: | ||||||
|             return [GoldParse.from_annot_tuples(docs[0], |             return [GoldParse.from_annot_tuples(docs[0], | ||||||
|                                                 paragraph_tuples[0][0])] |                                                 paragraph_tuples[0][0])] | ||||||
|  | @ -461,7 +465,7 @@ cdef class GoldParse: | ||||||
| 
 | 
 | ||||||
|         cycle = nonproj.contains_cycle(self.heads) |         cycle = nonproj.contains_cycle(self.heads) | ||||||
|         if cycle is not None: |         if cycle is not None: | ||||||
|             raise Exception("Cycle found: %s" % cycle) |             raise ValueError(Errors.E069.format(cycle=cycle)) | ||||||
| 
 | 
 | ||||||
|         if make_projective: |         if make_projective: | ||||||
|             proj_heads, _ = nonproj.projectivize(self.heads, self.labels) |             proj_heads, _ = nonproj.projectivize(self.heads, self.labels) | ||||||
|  |  | ||||||
|  | @ -28,6 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES | ||||||
| from .lang.tokenizer_exceptions import TOKEN_MATCH | from .lang.tokenizer_exceptions import TOKEN_MATCH | ||||||
| from .lang.tag_map import TAG_MAP | from .lang.tag_map import TAG_MAP | ||||||
| from .lang.lex_attrs import LEX_ATTRS, is_stop | from .lang.lex_attrs import LEX_ATTRS, is_stop | ||||||
|  | from .errors import Errors | ||||||
| from . import util | from . import util | ||||||
| from . import about | from . import about | ||||||
| 
 | 
 | ||||||
|  | @ -217,8 +218,7 @@ class Language(object): | ||||||
|         for pipe_name, component in self.pipeline: |         for pipe_name, component in self.pipeline: | ||||||
|             if pipe_name == name: |             if pipe_name == name: | ||||||
|                 return component |                 return component | ||||||
|         msg = "No component '{}' found in pipeline. Available names: {}" |         raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||||
|         raise KeyError(msg.format(name, self.pipe_names)) |  | ||||||
| 
 | 
 | ||||||
|     def create_pipe(self, name, config=dict()): |     def create_pipe(self, name, config=dict()): | ||||||
|         """Create a pipeline component from a factory. |         """Create a pipeline component from a factory. | ||||||
|  | @ -228,7 +228,7 @@ class Language(object): | ||||||
|         RETURNS (callable): Pipeline component. |         RETURNS (callable): Pipeline component. | ||||||
|         """ |         """ | ||||||
|         if name not in self.factories: |         if name not in self.factories: | ||||||
|             raise KeyError("Can't find factory for '{}'.".format(name)) |             raise KeyError(Errors.E002.format(name=name)) | ||||||
|         factory = self.factories[name] |         factory = self.factories[name] | ||||||
|         return factory(self, **config) |         return factory(self, **config) | ||||||
| 
 | 
 | ||||||
|  | @ -253,12 +253,9 @@ class Language(object): | ||||||
|             >>> nlp.add_pipe(component, name='custom_name', last=True) |             >>> nlp.add_pipe(component, name='custom_name', last=True) | ||||||
|         """ |         """ | ||||||
|         if not hasattr(component, '__call__'): |         if not hasattr(component, '__call__'): | ||||||
|             msg = ("Not a valid pipeline component. Expected callable, but " |             msg = Errors.E003.format(component=repr(component), name=name) | ||||||
|                    "got {}. ".format(repr(component))) |  | ||||||
|             if isinstance(component, basestring_) and component in self.factories: |             if isinstance(component, basestring_) and component in self.factories: | ||||||
|                 msg += ("If you meant to add a built-in component, use " |                 msg += Errors.E004.format(component=component) | ||||||
|                         "create_pipe: nlp.add_pipe(nlp.create_pipe('{}'))" |  | ||||||
|                         .format(component)) |  | ||||||
|             raise ValueError(msg) |             raise ValueError(msg) | ||||||
|         if name is None: |         if name is None: | ||||||
|             if hasattr(component, 'name'): |             if hasattr(component, 'name'): | ||||||
|  | @ -271,11 +268,9 @@ class Language(object): | ||||||
|             else: |             else: | ||||||
|                 name = repr(component) |                 name = repr(component) | ||||||
|         if name in self.pipe_names: |         if name in self.pipe_names: | ||||||
|             raise ValueError("'{}' already exists in pipeline.".format(name)) |             raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names)) | ||||||
|         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: |         if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2: | ||||||
|             msg = ("Invalid constraints. You can only set one of the " |             raise ValueError(Errors.E006) | ||||||
|                    "following: before, after, first, last.") |  | ||||||
|             raise ValueError(msg) |  | ||||||
|         pipe = (name, component) |         pipe = (name, component) | ||||||
|         if last or not any([first, before, after]): |         if last or not any([first, before, after]): | ||||||
|             self.pipeline.append(pipe) |             self.pipeline.append(pipe) | ||||||
|  | @ -286,9 +281,8 @@ class Language(object): | ||||||
|         elif after and after in self.pipe_names: |         elif after and after in self.pipe_names: | ||||||
|             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) |             self.pipeline.insert(self.pipe_names.index(after) + 1, pipe) | ||||||
|         else: |         else: | ||||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" |             raise ValueError(Errors.E001.format(name=before or after, | ||||||
|             unfound = before or after |                                                 opts=self.pipe_names)) | ||||||
|             raise ValueError(msg.format(unfound, self.pipe_names)) |  | ||||||
| 
 | 
 | ||||||
|     def has_pipe(self, name): |     def has_pipe(self, name): | ||||||
|         """Check if a component name is present in the pipeline. Equivalent to |         """Check if a component name is present in the pipeline. Equivalent to | ||||||
|  | @ -306,8 +300,7 @@ class Language(object): | ||||||
|         component (callable): Pipeline component. |         component (callable): Pipeline component. | ||||||
|         """ |         """ | ||||||
|         if name not in self.pipe_names: |         if name not in self.pipe_names: | ||||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" |             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||||
|             raise ValueError(msg.format(name, self.pipe_names)) |  | ||||||
|         self.pipeline[self.pipe_names.index(name)] = (name, component) |         self.pipeline[self.pipe_names.index(name)] = (name, component) | ||||||
| 
 | 
 | ||||||
|     def rename_pipe(self, old_name, new_name): |     def rename_pipe(self, old_name, new_name): | ||||||
|  | @ -317,11 +310,9 @@ class Language(object): | ||||||
|         new_name (unicode): New name of the component. |         new_name (unicode): New name of the component. | ||||||
|         """ |         """ | ||||||
|         if old_name not in self.pipe_names: |         if old_name not in self.pipe_names: | ||||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" |             raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names)) | ||||||
|             raise ValueError(msg.format(old_name, self.pipe_names)) |  | ||||||
|         if new_name in self.pipe_names: |         if new_name in self.pipe_names: | ||||||
|             msg = "'{}' already exists in pipeline. Existing names: {}" |             raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names)) | ||||||
|             raise ValueError(msg.format(new_name, self.pipe_names)) |  | ||||||
|         i = self.pipe_names.index(old_name) |         i = self.pipe_names.index(old_name) | ||||||
|         self.pipeline[i] = (new_name, self.pipeline[i][1]) |         self.pipeline[i] = (new_name, self.pipeline[i][1]) | ||||||
| 
 | 
 | ||||||
|  | @ -332,8 +323,7 @@ class Language(object): | ||||||
|         RETURNS (tuple): A `(name, component)` tuple of the removed component. |         RETURNS (tuple): A `(name, component)` tuple of the removed component. | ||||||
|         """ |         """ | ||||||
|         if name not in self.pipe_names: |         if name not in self.pipe_names: | ||||||
|             msg = "Can't find '{}' in pipeline. Available names: {}" |             raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names)) | ||||||
|             raise ValueError(msg.format(name, self.pipe_names)) |  | ||||||
|         return self.pipeline.pop(self.pipe_names.index(name)) |         return self.pipeline.pop(self.pipe_names.index(name)) | ||||||
| 
 | 
 | ||||||
|     def __call__(self, text, disable=[]): |     def __call__(self, text, disable=[]): | ||||||
|  | @ -351,21 +341,17 @@ class Language(object): | ||||||
|             ('An', 'NN') |             ('An', 'NN') | ||||||
|         """ |         """ | ||||||
|         if len(text) >= self.max_length: |         if len(text) >= self.max_length: | ||||||
|             msg = ( |             raise ValueError(Errors.E088.format(length=len(text), | ||||||
|                 "Text of length {length} exceeds maximum of {max_length}. " |                                                 max_length=self.max_length)) | ||||||
|                 "The v2 parser and NER models require roughly 1GB of temporary " |  | ||||||
|                 "memory per 100,000 characters in the input. This means long " |  | ||||||
|                 "texts may cause memory allocation errors. If you're not using " |  | ||||||
|                 "the parser or NER, it's probably safe to increase the " |  | ||||||
|                 "nlp.max_length limit. The limit is in number of characters, " |  | ||||||
|                 "so you can check whether your inputs are too long by checking " |  | ||||||
|                 "len(text).") |  | ||||||
|             raise ValueError(msg.format(length=len(text), max_length=self.max_length)) |  | ||||||
|         doc = self.make_doc(text) |         doc = self.make_doc(text) | ||||||
|         for name, proc in self.pipeline: |         for name, proc in self.pipeline: | ||||||
|             if name in disable: |             if name in disable: | ||||||
|                 continue |                 continue | ||||||
|  |             if not hasattr(proc, '__call__'): | ||||||
|  |                 raise ValueError(Errors.E003.format(component=type(proc), name=name)) | ||||||
|             doc = proc(doc) |             doc = proc(doc) | ||||||
|  |             if doc is None: | ||||||
|  |                 raise ValueError(Errors.E005.format(name=name)) | ||||||
|         return doc |         return doc | ||||||
| 
 | 
 | ||||||
|     def disable_pipes(self, *names): |     def disable_pipes(self, *names): | ||||||
|  | @ -407,8 +393,7 @@ class Language(object): | ||||||
|             >>>            state = nlp.update(docs, golds, sgd=optimizer) |             >>>            state = nlp.update(docs, golds, sgd=optimizer) | ||||||
|         """ |         """ | ||||||
|         if len(docs) != len(golds): |         if len(docs) != len(golds): | ||||||
|             raise IndexError("Update expects same number of docs and golds " |             raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds))) | ||||||
|                              "Got: %d, %d" % (len(docs), len(golds))) |  | ||||||
|         if len(docs) == 0: |         if len(docs) == 0: | ||||||
|             return |             return | ||||||
|         if sgd is None: |         if sgd is None: | ||||||
|  | @ -757,14 +742,7 @@ class DisabledPipes(list): | ||||||
|         if unexpected: |         if unexpected: | ||||||
|             # Don't change the pipeline if we're raising an error. |             # Don't change the pipeline if we're raising an error. | ||||||
|             self.nlp.pipeline = current |             self.nlp.pipeline = current | ||||||
|             msg = ( |             raise ValueError(Errors.E008.format(names=unexpected)) | ||||||
|                 "Some current components would be lost when restoring " |  | ||||||
|                 "previous pipeline state. If you added components after " |  | ||||||
|                 "calling nlp.disable_pipes(), you should remove them " |  | ||||||
|                 "explicitly with nlp.remove_pipe() before the pipeline is " |  | ||||||
|                 "restore. Names of the new components: %s" |  | ||||||
|             ) |  | ||||||
|             raise ValueError(msg % unexpected) |  | ||||||
|         self[:] = [] |         self[:] = [] | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP | ||||||
| from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV | from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV | ||||||
| from .attrs cimport PROB | from .attrs cimport PROB | ||||||
| from .attrs import intify_attrs | from .attrs import intify_attrs | ||||||
| from . import about | from .errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | memset(&EMPTY_LEXEME, 0, sizeof(LexemeC)) | ||||||
|  | @ -37,7 +37,8 @@ cdef class Lexeme: | ||||||
|         self.vocab = vocab |         self.vocab = vocab | ||||||
|         self.orth = orth |         self.orth = orth | ||||||
|         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth) |         self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth) | ||||||
|         assert self.c.orth == orth |         if self.c.orth != orth: | ||||||
|  |             raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth)) | ||||||
| 
 | 
 | ||||||
|     def __richcmp__(self, other, int op): |     def __richcmp__(self, other, int op): | ||||||
|         if other is None: |         if other is None: | ||||||
|  | @ -129,20 +130,25 @@ cdef class Lexeme: | ||||||
|         lex_data = Lexeme.c_to_bytes(self.c) |         lex_data = Lexeme.c_to_bytes(self.c) | ||||||
|         start = <const char*>&self.c.flags |         start = <const char*>&self.c.flags | ||||||
|         end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment) |         end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment) | ||||||
|         assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data)) |         if (end-start) != sizeof(lex_data.data): | ||||||
|  |             raise ValueError(Errors.E072.format(length=end-start, | ||||||
|  |                                                 bad_length=sizeof(lex_data.data))) | ||||||
|         byte_string = b'\0' * sizeof(lex_data.data) |         byte_string = b'\0' * sizeof(lex_data.data) | ||||||
|         byte_chars = <char*>byte_string |         byte_chars = <char*>byte_string | ||||||
|         for i in range(sizeof(lex_data.data)): |         for i in range(sizeof(lex_data.data)): | ||||||
|             byte_chars[i] = lex_data.data[i] |             byte_chars[i] = lex_data.data[i] | ||||||
|         assert len(byte_string) == sizeof(lex_data.data), (len(byte_string), |         if len(byte_string) != sizeof(lex_data.data): | ||||||
|                 sizeof(lex_data.data)) |             raise ValueError(Errors.E072.format(length=len(byte_string), | ||||||
|  |                                                 bad_length=sizeof(lex_data.data))) | ||||||
|         return byte_string |         return byte_string | ||||||
| 
 | 
 | ||||||
|     def from_bytes(self, bytes byte_string): |     def from_bytes(self, bytes byte_string): | ||||||
|         # This method doesn't really have a use-case --- wrote it for testing. |         # This method doesn't really have a use-case --- wrote it for testing. | ||||||
|         # Possibly delete? It puts the Lexeme out of synch with the vocab. |         # Possibly delete? It puts the Lexeme out of synch with the vocab. | ||||||
|         cdef SerializedLexemeC lex_data |         cdef SerializedLexemeC lex_data | ||||||
|         assert len(byte_string) == sizeof(lex_data.data) |         if len(byte_string) != sizeof(lex_data.data): | ||||||
|  |             raise ValueError(Errors.E072.format(length=len(byte_string), | ||||||
|  |                                                 bad_length=sizeof(lex_data.data))) | ||||||
|         for i in range(len(byte_string)): |         for i in range(len(byte_string)): | ||||||
|             lex_data.data[i] = byte_string[i] |             lex_data.data[i] = byte_string[i] | ||||||
|         Lexeme.c_from_bytes(self.c, lex_data) |         Lexeme.c_from_bytes(self.c, lex_data) | ||||||
|  | @ -169,16 +175,13 @@ cdef class Lexeme: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             cdef int length = self.vocab.vectors_length |             cdef int length = self.vocab.vectors_length | ||||||
|             if length == 0: |             if length == 0: | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E010) | ||||||
|                     "Word vectors set to length 0. This may be because you " |  | ||||||
|                     "don't have a model installed or loaded, or because your " |  | ||||||
|                     "model doesn't include word vectors. For more info, see " |  | ||||||
|                     "the documentation: \n%s\n" % about.__docs_models__ |  | ||||||
|                 ) |  | ||||||
|             return self.vocab.get_vector(self.c.orth) |             return self.vocab.get_vector(self.c.orth) | ||||||
| 
 | 
 | ||||||
|         def __set__(self, vector): |         def __set__(self, vector): | ||||||
|             assert len(vector) == self.vocab.vectors_length |             if len(vector) != self.vocab.vectors_length: | ||||||
|  |                 raise ValueError(Errors.E073.format(new_length=len(vector), | ||||||
|  |                                                     length=self.vocab.vectors_length)) | ||||||
|             self.vocab.set_vector(self.c.orth, vector) |             self.vocab.set_vector(self.c.orth, vector) | ||||||
| 
 | 
 | ||||||
|     property rank: |     property rank: | ||||||
|  |  | ||||||
|  | @ -16,6 +16,7 @@ from .typedefs cimport hash_t | ||||||
| from .structs cimport TokenC | from .structs cimport TokenC | ||||||
| from .tokens.doc cimport Doc, get_token_attr | from .tokens.doc cimport Doc, get_token_attr | ||||||
| from .vocab cimport Vocab | from .vocab cimport Vocab | ||||||
|  | from .errors import Errors, TempErrors | ||||||
| 
 | 
 | ||||||
| from .attrs import IDS | from .attrs import IDS | ||||||
| from .attrs cimport attr_id_t, ID, NULL_ATTR | from .attrs cimport attr_id_t, ID, NULL_ATTR | ||||||
|  | @ -109,7 +110,8 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0: | ||||||
|     while pattern.nr_attr != 0: |     while pattern.nr_attr != 0: | ||||||
|         pattern += 1 |         pattern += 1 | ||||||
|     id_attr = pattern[0].attrs[0] |     id_attr = pattern[0].attrs[0] | ||||||
|     assert id_attr.attr == ID |     if id_attr.attr != ID: | ||||||
|  |         raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr)) | ||||||
|     return id_attr.value |     return id_attr.value | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -161,8 +163,8 @@ def _convert_strings(token_specs, string_store): | ||||||
|                 if value in operators: |                 if value in operators: | ||||||
|                     ops = operators[value] |                     ops = operators[value] | ||||||
|                 else: |                 else: | ||||||
|                     msg = "Unknown operator '%s'. Options: %s" |                     keys = ', '.join(operators.keys()) | ||||||
|                     raise KeyError(msg % (value, ', '.join(operators.keys()))) |                     raise KeyError(Errors.E011.format(op=value, opts=keys)) | ||||||
|             if isinstance(attr, basestring): |             if isinstance(attr, basestring): | ||||||
|                 attr = IDS.get(attr.upper()) |                 attr = IDS.get(attr.upper()) | ||||||
|             if isinstance(value, basestring): |             if isinstance(value, basestring): | ||||||
|  | @ -264,9 +266,7 @@ cdef class Matcher: | ||||||
|         """ |         """ | ||||||
|         for pattern in patterns: |         for pattern in patterns: | ||||||
|             if len(pattern) == 0: |             if len(pattern) == 0: | ||||||
|                 msg = ("Cannot add pattern for zero tokens to matcher.\n" |                 raise ValueError(Errors.E012.format(key=key)) | ||||||
|                        "key: {key}\n") |  | ||||||
|                 raise ValueError(msg.format(key=key)) |  | ||||||
|         key = self._normalize_key(key) |         key = self._normalize_key(key) | ||||||
|         for pattern in patterns: |         for pattern in patterns: | ||||||
|             specs = _convert_strings(pattern, self.vocab.strings) |             specs = _convert_strings(pattern, self.vocab.strings) | ||||||
|  | @ -348,13 +348,12 @@ cdef class Matcher: | ||||||
|             for state in partials: |             for state in partials: | ||||||
|                 action = get_action(state.second, token) |                 action = get_action(state.second, token) | ||||||
|                 if action == PANIC: |                 if action == PANIC: | ||||||
|                     raise Exception("Error selecting action in matcher") |                     raise ValueError(Errors.E013) | ||||||
|                 while action == ADVANCE_ZERO: |                 while action == ADVANCE_ZERO: | ||||||
|                     state.second += 1 |                     state.second += 1 | ||||||
|                     action = get_action(state.second, token) |                     action = get_action(state.second, token) | ||||||
|                 if action == PANIC: |                 if action == PANIC: | ||||||
|                     raise Exception("Error selecting action in matcher") |                     raise ValueError(Errors.E013) | ||||||
| 
 |  | ||||||
|                 if action == REPEAT: |                 if action == REPEAT: | ||||||
|                     # Leave the state in the queue, and advance to next slot |                     # Leave the state in the queue, and advance to next slot | ||||||
|                     # (i.e. we don't overwrite -- we want to greedily match |                     # (i.e. we don't overwrite -- we want to greedily match | ||||||
|  | @ -380,7 +379,7 @@ cdef class Matcher: | ||||||
|             for pattern in self.patterns: |             for pattern in self.patterns: | ||||||
|                 action = get_action(pattern, token) |                 action = get_action(pattern, token) | ||||||
|                 if action == PANIC: |                 if action == PANIC: | ||||||
|                     raise Exception("Error selecting action in matcher") |                     raise ValueError(Errors.E013) | ||||||
|                 while action == ADVANCE_ZERO: |                 while action == ADVANCE_ZERO: | ||||||
|                     pattern += 1 |                     pattern += 1 | ||||||
|                     action = get_action(pattern, token) |                     action = get_action(pattern, token) | ||||||
|  | @ -447,7 +446,7 @@ def get_bilou(length): | ||||||
|         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, |         return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, | ||||||
|                 I10_ENT, I10_ENT, L10_ENT] |                 I10_ENT, I10_ENT, L10_ENT] | ||||||
|     else: |     else: | ||||||
|         raise ValueError("Max length currently 10 for phrase matching") |         raise ValueError(TempErrors.T001) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class PhraseMatcher: | cdef class PhraseMatcher: | ||||||
|  | @ -506,11 +505,8 @@ cdef class PhraseMatcher: | ||||||
|         cdef Doc doc |         cdef Doc doc | ||||||
|         for doc in docs: |         for doc in docs: | ||||||
|             if len(doc) >= self.max_length: |             if len(doc) >= self.max_length: | ||||||
|                 msg = ( |                 raise ValueError(TempErrors.T002.format(doc_len=len(doc), | ||||||
|                     "Pattern length (%d) >= phrase_matcher.max_length (%d). " |                                                         max_len=self.max_length)) | ||||||
|                     "Length can be set on initialization, up to 10." |  | ||||||
|                 ) |  | ||||||
|                 raise ValueError(msg % (len(doc), self.max_length)) |  | ||||||
|         cdef hash_t ent_id = self.matcher._normalize_key(key) |         cdef hash_t ent_id = self.matcher._normalize_key(key) | ||||||
|         self._callbacks[ent_id] = on_match |         self._callbacks[ent_id] = on_match | ||||||
|         cdef int length |         cdef int length | ||||||
|  | @ -562,7 +558,9 @@ cdef class PhraseMatcher: | ||||||
|             yield doc |             yield doc | ||||||
| 
 | 
 | ||||||
|     def accept_match(self, Doc doc, int start, int end): |     def accept_match(self, Doc doc, int start, int end): | ||||||
|         assert (end - start) < self.max_length |         if (end - start) >= self.max_length: | ||||||
|  |             raise ValueError(Errors.E075.format(length=end - start, | ||||||
|  |                                                 max_len=self.max_length)) | ||||||
|         cdef int i, j |         cdef int i, j | ||||||
|         for i in range(self.max_length): |         for i in range(self.max_length): | ||||||
|             self._phrase_key[i] = 0 |             self._phrase_key[i] = 0 | ||||||
|  |  | ||||||
|  | @ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs | ||||||
| from .parts_of_speech cimport SPACE | from .parts_of_speech cimport SPACE | ||||||
| from .parts_of_speech import IDS as POS_IDS | from .parts_of_speech import IDS as POS_IDS | ||||||
| from .lexeme cimport Lexeme | from .lexeme cimport Lexeme | ||||||
|  | from .errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def _normalize_props(props): | def _normalize_props(props): | ||||||
|  | @ -93,7 +94,7 @@ cdef class Morphology: | ||||||
| 
 | 
 | ||||||
|     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: |     cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1: | ||||||
|         if tag_id > self.n_tags: |         if tag_id > self.n_tags: | ||||||
|             raise ValueError("Unknown tag ID: %s" % tag_id) |             raise ValueError(Errors.E014.format(tag=tag_id)) | ||||||
|         # TODO: It's pretty arbitrary to put this logic here. I guess the |         # TODO: It's pretty arbitrary to put this logic here. I guess the | ||||||
|         # justification is that this is where the specific word and the tag |         # justification is that this is where the specific word and the tag | ||||||
|         # interact. Still, we should have a better way to enforce this rule, or |         # interact. Still, we should have a better way to enforce this rule, or | ||||||
|  | @ -147,9 +148,7 @@ cdef class Morphology: | ||||||
|         elif force: |         elif force: | ||||||
|             memset(cached, 0, sizeof(cached[0])) |             memset(cached, 0, sizeof(cached[0])) | ||||||
|         else: |         else: | ||||||
|             raise ValueError( |             raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str)) | ||||||
|                 "Conflicting morphology exception for (%s, %s). Use " |  | ||||||
|                 "force=True to overwrite." % (tag_str, orth_str)) |  | ||||||
| 
 | 
 | ||||||
|         cached.tag = rich_tag |         cached.tag = rich_tag | ||||||
|         # TODO: Refactor this to take arbitrary attributes. |         # TODO: Refactor this to take arbitrary attributes. | ||||||
|  |  | ||||||
|  | @ -33,6 +33,7 @@ from .parts_of_speech import X | ||||||
| from ._ml import Tok2Vec, build_text_classifier, build_tagger_model | from ._ml import Tok2Vec, build_text_classifier, build_tagger_model | ||||||
| from ._ml import link_vectors_to_models, zero_init, flatten | from ._ml import link_vectors_to_models, zero_init, flatten | ||||||
| from ._ml import create_default_optimizer | from ._ml import create_default_optimizer | ||||||
|  | from .errors import Errors, TempErrors | ||||||
| from . import util | from . import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -336,7 +337,8 @@ class Tensorizer(Pipe): | ||||||
|         tensors (object): Vector representation for each token in the docs. |         tensors (object): Vector representation for each token in the docs. | ||||||
|         """ |         """ | ||||||
|         for doc, tensor in zip(docs, tensors): |         for doc, tensor in zip(docs, tensors): | ||||||
|             assert tensor.shape[0] == len(doc) |             if tensor.shape[0] != len(doc): | ||||||
|  |                 raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc))) | ||||||
|             doc.tensor = tensor |             doc.tensor = tensor | ||||||
| 
 | 
 | ||||||
|     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None): | ||||||
|  | @ -550,9 +552,7 @@ class Tagger(Pipe): | ||||||
|             # copy_array(larger.W[:smaller.nO], smaller.W) |             # copy_array(larger.W[:smaller.nO], smaller.W) | ||||||
|             # copy_array(larger.b[:smaller.nO], smaller.b) |             # copy_array(larger.b[:smaller.nO], smaller.b) | ||||||
|             # self.model._layers[-1] = larger |             # self.model._layers[-1] = larger | ||||||
|             raise ValueError( |             raise ValueError(TempErrors.T003) | ||||||
|                 "Resizing pre-trained Tagger models is not " |  | ||||||
|                 "currently supported.") |  | ||||||
|         tag_map = dict(self.vocab.morphology.tag_map) |         tag_map = dict(self.vocab.morphology.tag_map) | ||||||
|         if values is None: |         if values is None: | ||||||
|             values = {POS: "X"} |             values = {POS: "X"} | ||||||
|  | @ -671,8 +671,7 @@ class MultitaskObjective(Tagger): | ||||||
|         elif hasattr(target, '__call__'): |         elif hasattr(target, '__call__'): | ||||||
|             self.make_label = target |             self.make_label = target | ||||||
|         else: |         else: | ||||||
|             raise ValueError("MultitaskObjective target should be function or " |             raise ValueError(Errors.E016) | ||||||
|                              "one of: dep, tag, ent, dep_tag_offset, ent_tag.") |  | ||||||
|         self.cfg = dict(cfg) |         self.cfg = dict(cfg) | ||||||
|         self.cfg.setdefault('cnn_maxout_pieces', 2) |         self.cfg.setdefault('cnn_maxout_pieces', 2) | ||||||
| 
 | 
 | ||||||
|  | @ -723,7 +722,9 @@ class MultitaskObjective(Tagger): | ||||||
|         return tokvecs, scores |         return tokvecs, scores | ||||||
| 
 | 
 | ||||||
|     def get_loss(self, docs, golds, scores): |     def get_loss(self, docs, golds, scores): | ||||||
|         assert len(docs) == len(golds) |         if len(docs) != len(golds): | ||||||
|  |             raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs), | ||||||
|  |                                                 n_golds=len(golds))) | ||||||
|         cdef int idx = 0 |         cdef int idx = 0 | ||||||
|         correct = numpy.zeros((scores.shape[0],), dtype='i') |         correct = numpy.zeros((scores.shape[0],), dtype='i') | ||||||
|         guesses = scores.argmax(axis=1) |         guesses = scores.argmax(axis=1) | ||||||
|  |  | ||||||
|  | @ -2,6 +2,7 @@ | ||||||
| from __future__ import division, print_function, unicode_literals | from __future__ import division, print_function, unicode_literals | ||||||
| 
 | 
 | ||||||
| from .gold import tags_to_entities | from .gold import tags_to_entities | ||||||
|  | from .errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| class PRFScore(object): | class PRFScore(object): | ||||||
|  | @ -84,7 +85,8 @@ class Scorer(object): | ||||||
|         } |         } | ||||||
| 
 | 
 | ||||||
|     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): |     def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')): | ||||||
|         assert len(tokens) == len(gold) |         if len(tokens) != len(gold): | ||||||
|  |             raise ValueError(Errors.E078.format(words_doc=len(tokens), words_gold=len(gold))) | ||||||
|         gold_deps = set() |         gold_deps = set() | ||||||
|         gold_tags = set() |         gold_tags = set() | ||||||
|         gold_ents = set(tags_to_entities([annot[-1] |         gold_ents = set(tags_to_entities([annot[-1] | ||||||
|  |  | ||||||
|  | @ -13,6 +13,7 @@ from .symbols import IDS as SYMBOLS_BY_STR | ||||||
| from .symbols import NAMES as SYMBOLS_BY_INT | from .symbols import NAMES as SYMBOLS_BY_INT | ||||||
| from .typedefs cimport hash_t | from .typedefs cimport hash_t | ||||||
| from .compat import json_dumps | from .compat import json_dumps | ||||||
|  | from .errors import Errors | ||||||
| from . import util | from . import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -59,7 +60,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | ||||||
|         string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char)) |         string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char)) | ||||||
|         string.p[0] = length |         string.p[0] = length | ||||||
|         memcpy(&string.p[1], chars, length) |         memcpy(&string.p[1], chars, length) | ||||||
|         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] |  | ||||||
|         return string |         return string | ||||||
|     else: |     else: | ||||||
|         i = 0 |         i = 0 | ||||||
|  | @ -69,7 +69,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e | ||||||
|             string.p[i] = 255 |             string.p[i] = 255 | ||||||
|         string.p[n_length_bytes-1] = length % 255 |         string.p[n_length_bytes-1] = length % 255 | ||||||
|         memcpy(&string.p[n_length_bytes], chars, length) |         memcpy(&string.p[n_length_bytes], chars, length) | ||||||
|         assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0] |  | ||||||
|         return string |         return string | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -115,7 +114,7 @@ cdef class StringStore: | ||||||
|             self.hits.insert(key) |             self.hits.insert(key) | ||||||
|             utf8str = <Utf8Str*>self._map.get(key) |             utf8str = <Utf8Str*>self._map.get(key) | ||||||
|             if utf8str is NULL: |             if utf8str is NULL: | ||||||
|                 raise KeyError(string_or_id) |                 raise KeyError(Errors.E018.format(hash_value=string_or_id)) | ||||||
|             else: |             else: | ||||||
|                 return decode_Utf8Str(utf8str) |                 return decode_Utf8Str(utf8str) | ||||||
| 
 | 
 | ||||||
|  | @ -136,8 +135,7 @@ cdef class StringStore: | ||||||
|             key = hash_utf8(string, len(string)) |             key = hash_utf8(string, len(string)) | ||||||
|             self._intern_utf8(string, len(string)) |             self._intern_utf8(string, len(string)) | ||||||
|         else: |         else: | ||||||
|             raise TypeError( |             raise TypeError(Errors.E017.format(value_type=type(string))) | ||||||
|                 "Can only add unicode or bytes. Got type: %s" % type(string)) |  | ||||||
|         return key |         return key | ||||||
| 
 | 
 | ||||||
|     def __len__(self): |     def __len__(self): | ||||||
|  |  | ||||||
|  | @ -10,6 +10,7 @@ from thinc.extra.search cimport MaxViolation | ||||||
| 
 | 
 | ||||||
| from .transition_system cimport TransitionSystem, Transition | from .transition_system cimport TransitionSystem, Transition | ||||||
| from ..gold cimport GoldParse | from ..gold cimport GoldParse | ||||||
|  | from ..errors import Errors | ||||||
| from .stateclass cimport StateC, StateClass | from .stateclass cimport StateC, StateClass | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -220,7 +221,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update): | ||||||
|     p_indices = [] |     p_indices = [] | ||||||
|     g_indices = [] |     g_indices = [] | ||||||
|     cdef Beam pbeam, gbeam |     cdef Beam pbeam, gbeam | ||||||
|     assert len(pbeams) == len(gbeams) |     if len(pbeams) != len(gbeams): | ||||||
|  |         raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams))) | ||||||
|     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): |     for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)): | ||||||
|         p_indices.append([]) |         p_indices.append([]) | ||||||
|         g_indices.append([]) |         g_indices.append([]) | ||||||
|  | @ -228,7 +230,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update): | ||||||
|             state = StateClass.borrow(<StateC*>pbeam.at(i)) |             state = StateClass.borrow(<StateC*>pbeam.at(i)) | ||||||
|             if not state.is_final(): |             if not state.is_final(): | ||||||
|                 key = tuple([eg_id] + pbeam.histories[i]) |                 key = tuple([eg_id] + pbeam.histories[i]) | ||||||
|                 assert key not in seen, (key, seen) |                 if key in seen: | ||||||
|  |                     raise ValueError(Errors.E080.format(key=key)) | ||||||
|                 seen[key] = len(states) |                 seen[key] = len(states) | ||||||
|                 p_indices[-1].append(len(states)) |                 p_indices[-1].append(len(states)) | ||||||
|                 states.append(state) |                 states.append(state) | ||||||
|  | @ -271,7 +274,8 @@ def get_gradient(nr_class, beam_maps, histories, losses): | ||||||
|     for i in range(nr_step): |     for i in range(nr_step): | ||||||
|         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), |         grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class), | ||||||
|                                  dtype='f')) |                                  dtype='f')) | ||||||
|     assert len(histories) == len(losses) |     if len(histories) != len(losses): | ||||||
|  |         raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses))) | ||||||
|     for eg_id, hists in enumerate(histories): |     for eg_id, hists in enumerate(histories): | ||||||
|         for loss, hist in zip(losses[eg_id], hists): |         for loss, hist in zip(losses[eg_id], hists): | ||||||
|             if loss == 0.0 or numpy.isnan(loss): |             if loss == 0.0 or numpy.isnan(loss): | ||||||
|  |  | ||||||
|  | @ -15,6 +15,7 @@ from .nonproj import is_nonproj_tree | ||||||
| from .transition_system cimport move_cost_func_t, label_cost_func_t | from .transition_system cimport move_cost_func_t, label_cost_func_t | ||||||
| from ..gold cimport GoldParse, GoldParseC | from ..gold cimport GoldParse, GoldParseC | ||||||
| from ..structs cimport TokenC | from ..structs cimport TokenC | ||||||
|  | from ..errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| DEF NON_MONOTONIC = True | DEF NON_MONOTONIC = True | ||||||
|  | @ -455,7 +456,7 @@ cdef class ArcEager(TransitionSystem): | ||||||
|             t.do = Break.transition |             t.do = Break.transition | ||||||
|             t.get_cost = Break.cost |             t.get_cost = Break.cost | ||||||
|         else: |         else: | ||||||
|             raise Exception(move) |             raise ValueError(Errors.E019.format(action=move, src='arc_eager')) | ||||||
|         return t |         return t | ||||||
| 
 | 
 | ||||||
|     cdef int initialize_state(self, StateC* st) nogil: |     cdef int initialize_state(self, StateC* st) nogil: | ||||||
|  | @ -529,28 +530,11 @@ cdef class ArcEager(TransitionSystem): | ||||||
|         if n_gold < 1: |         if n_gold < 1: | ||||||
|             # Check projectivity --- leading cause |             # Check projectivity --- leading cause | ||||||
|             if is_nonproj_tree(gold.heads): |             if is_nonproj_tree(gold.heads): | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E020) | ||||||
|                     "Could not find a gold-standard action to supervise the " |  | ||||||
|                     "dependency parser. Likely cause: the tree is " |  | ||||||
|                     "non-projective (i.e. it has crossing arcs -- see " |  | ||||||
|                     "spacy/syntax/nonproj.pyx for definitions). The ArcEager " |  | ||||||
|                     "transition system only supports projective trees. To " |  | ||||||
|                     "learn non-projective representations, transform the data " |  | ||||||
|                     "before training and after parsing. Either pass " |  | ||||||
|                     "make_projective=True to the GoldParse class, or use " |  | ||||||
|                     "spacy.syntax.nonproj.preprocess_training_data.") |  | ||||||
|             else: |             else: | ||||||
|                 print(gold.orig_annot) |                 failure_state = stcls.print_state(gold.words) | ||||||
|                 print(gold.words) |                 raise ValueError(Errors.E021.format(n_actions=self.n_moves, | ||||||
|                 print(gold.heads) |                                                     state=failure_state)) | ||||||
|                 print(gold.labels) |  | ||||||
|                 print(gold.sent_starts) |  | ||||||
|                 raise ValueError( |  | ||||||
|                     "Could not find a gold-standard action to supervise the" |  | ||||||
|                     "dependency parser. The GoldParse was projective. The " |  | ||||||
|                     "transition system has %d actions. State at failure: %s" |  | ||||||
|                     % (self.n_moves, stcls.print_state(gold.words))) |  | ||||||
|         assert n_gold >= 1 |  | ||||||
| 
 | 
 | ||||||
|     def get_beam_annot(self, Beam beam): |     def get_beam_annot(self, Beam beam): | ||||||
|         length = (<StateC*>beam.at(0)).length |         length = (<StateC*>beam.at(0)).length | ||||||
|  |  | ||||||
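The long E020 message removed above pointed to two remedies when a gold parse tree is non-projective: pass make_projective=True to GoldParse, or preprocess the training data with spacy.syntax.nonproj. A minimal sketch of the first remedy, assuming a Doc and its gold heads and dependency labels are already available:

    from spacy.gold import GoldParse

    # heads/deps stand for the gold annotations of `doc` (assumed to exist);
    # make_projective=True lifts crossing arcs so ArcEager can be trained on them
    gold = GoldParse(doc, heads=heads, deps=deps, make_projective=True)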
|  | @ -10,6 +10,7 @@ from ._state cimport StateC | ||||||
| from .transition_system cimport Transition | from .transition_system cimport Transition | ||||||
| from .transition_system cimport do_func_t | from .transition_system cimport do_func_t | ||||||
| from ..gold cimport GoldParseC, GoldParse | from ..gold cimport GoldParseC, GoldParse | ||||||
|  | from ..errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef enum: | cdef enum: | ||||||
|  | @ -173,7 +174,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|             if self.c[i].move == move and self.c[i].label == label: |             if self.c[i].move == move and self.c[i].label == label: | ||||||
|                 return self.c[i] |                 return self.c[i] | ||||||
|         else: |         else: | ||||||
|             raise KeyError(name) |             raise KeyError(Errors.E022.format(name=name)) | ||||||
| 
 | 
 | ||||||
|     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: |     cdef Transition init_transition(self, int clas, int move, attr_t label) except *: | ||||||
|         # TODO: Apparent Cython bug here when we try to use the Transition() |         # TODO: Apparent Cython bug here when we try to use the Transition() | ||||||
|  | @ -208,7 +209,7 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|             t.do = Out.transition |             t.do = Out.transition | ||||||
|             t.get_cost = Out.cost |             t.get_cost = Out.cost | ||||||
|         else: |         else: | ||||||
|             raise Exception(move) |             raise ValueError(Errors.E019.format(action=move, src='ner')) | ||||||
|         return t |         return t | ||||||
| 
 | 
 | ||||||
|     def add_action(self, int action, label_name): |     def add_action(self, int action, label_name): | ||||||
|  | @ -230,7 +231,6 @@ cdef class BiluoPushDown(TransitionSystem): | ||||||
|             self._size *= 2 |             self._size *= 2 | ||||||
|             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) |             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||||
|         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) |         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||||
|         assert self.c[self.n_moves].label == label_id |  | ||||||
|         self.n_moves += 1 |         self.n_moves += 1 | ||||||
|         return 1 |         return 1 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -34,6 +34,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer | ||||||
| from ..compat import json_dumps, copy_array | from ..compat import json_dumps, copy_array | ||||||
| from ..tokens.doc cimport Doc | from ..tokens.doc cimport Doc | ||||||
| from ..gold cimport GoldParse | from ..gold cimport GoldParse | ||||||
|  | from ..errors import Errors, TempErrors | ||||||
| from .. import util | from .. import util | ||||||
| from .stateclass cimport StateClass | from .stateclass cimport StateClass | ||||||
| from ._state cimport StateC | from ._state cimport StateC | ||||||
|  | @ -242,7 +243,7 @@ cdef class Parser: | ||||||
|     def Model(cls, nr_class, **cfg): |     def Model(cls, nr_class, **cfg): | ||||||
|         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) |         depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1)) | ||||||
|         if depth != 1: |         if depth != 1: | ||||||
|             raise ValueError("Currently parser depth is hard-coded to 1.") |             raise ValueError(TempErrors.T004.format(value=depth)) | ||||||
|         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', |         parser_maxout_pieces = util.env_opt('parser_maxout_pieces', | ||||||
|                                             cfg.get('maxout_pieces', 2)) |                                             cfg.get('maxout_pieces', 2)) | ||||||
|         token_vector_width = util.env_opt('token_vector_width', |         token_vector_width = util.env_opt('token_vector_width', | ||||||
|  | @ -252,9 +253,9 @@ cdef class Parser: | ||||||
|         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) |         hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0)) | ||||||
|         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) |         hist_width = util.env_opt('history_width', cfg.get('hist_width', 0)) | ||||||
|         if hist_size != 0: |         if hist_size != 0: | ||||||
|             raise ValueError("Currently history size is hard-coded to 0") |             raise ValueError(TempErrors.T005.format(value=hist_size)) | ||||||
|         if hist_width != 0: |         if hist_width != 0: | ||||||
|             raise ValueError("Currently history width is hard-coded to 0") |             raise ValueError(TempErrors.T006.format(value=hist_width)) | ||||||
|         pretrained_vectors = cfg.get('pretrained_vectors', None) |         pretrained_vectors = cfg.get('pretrained_vectors', None) | ||||||
|         tok2vec = Tok2Vec(token_vector_width, embed_size, |         tok2vec = Tok2Vec(token_vector_width, embed_size, | ||||||
|                           pretrained_vectors=pretrained_vectors) |                           pretrained_vectors=pretrained_vectors) | ||||||
|  | @ -542,7 +543,9 @@ cdef class Parser: | ||||||
|     def update(self, docs, golds, drop=0., sgd=None, losses=None): |     def update(self, docs, golds, drop=0., sgd=None, losses=None): | ||||||
|         if not any(self.moves.has_gold(gold) for gold in golds): |         if not any(self.moves.has_gold(gold) for gold in golds): | ||||||
|             return None |             return None | ||||||
|         assert len(docs) == len(golds) |         if len(docs) != len(golds): | ||||||
|  |             raise ValueError(Errors.E077.format(value='update', n_docs=len(docs), | ||||||
|  |                                                 n_golds=len(golds))) | ||||||
|         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: |         if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0: | ||||||
|             return self.update_beam(docs, golds, |             return self.update_beam(docs, golds, | ||||||
|                     self.cfg['beam_width'], self.cfg['beam_density'], |                     self.cfg['beam_width'], self.cfg['beam_density'], | ||||||
|  | @ -622,7 +625,6 @@ cdef class Parser: | ||||||
|         if losses is not None and self.name not in losses: |         if losses is not None and self.name not in losses: | ||||||
|             losses[self.name] = 0. |             losses[self.name] = 0. | ||||||
|         lengths = [len(d) for d in docs] |         lengths = [len(d) for d in docs] | ||||||
|         assert min(lengths) >= 1 |  | ||||||
|         states = self.moves.init_batch(docs) |         states = self.moves.init_batch(docs) | ||||||
|         for gold in golds: |         for gold in golds: | ||||||
|             self.moves.preprocess_gold(gold) |             self.moves.preprocess_gold(gold) | ||||||
|  | @ -1021,15 +1023,11 @@ def _cleanup(Beam beam): | ||||||
|             del state |             del state | ||||||
|             seen.add(addr) |             seen.add(addr) | ||||||
|         else: |         else: | ||||||
|             print(i, addr) |             raise ValueError(Errors.E023.format(addr=addr, i=i)) | ||||||
|             print(seen) |  | ||||||
|             raise Exception |  | ||||||
|         addr = <size_t>beam._states[i].content |         addr = <size_t>beam._states[i].content | ||||||
|         if addr not in seen: |         if addr not in seen: | ||||||
|             state = <StateC*>addr |             state = <StateC*>addr | ||||||
|             del state |             del state | ||||||
|             seen.add(addr) |             seen.add(addr) | ||||||
|         else: |         else: | ||||||
|             print(i, addr) |             raise ValueError(Errors.E023.format(addr=addr, i=i)) | ||||||
|             print(seen) |  | ||||||
|             raise Exception |  | ||||||
|  |  | ||||||
|  | @ -10,6 +10,7 @@ from __future__ import unicode_literals | ||||||
| from copy import copy | from copy import copy | ||||||
| 
 | 
 | ||||||
| from ..tokens.doc cimport Doc | from ..tokens.doc cimport Doc | ||||||
|  | from ..errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| DELIMITER = '||' | DELIMITER = '||' | ||||||
|  | @ -131,7 +132,10 @@ cpdef deprojectivize(Doc doc): | ||||||
| 
 | 
 | ||||||
| def _decorate(heads, proj_heads, labels): | def _decorate(heads, proj_heads, labels): | ||||||
|     # uses decoration scheme HEAD from Nivre & Nilsson 2005 |     # uses decoration scheme HEAD from Nivre & Nilsson 2005 | ||||||
|     assert(len(heads) == len(proj_heads) == len(labels)) |     if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)): | ||||||
|  |         raise ValueError(Errors.E082.format(n_heads=len(heads), | ||||||
|  |                                             n_proj_heads=len(proj_heads), | ||||||
|  |                                             n_labels=len(labels))) | ||||||
|     deco_labels = [] |     deco_labels = [] | ||||||
|     for tokenid, head in enumerate(heads): |     for tokenid, head in enumerate(heads): | ||||||
|         if head != proj_heads[tokenid]: |         if head != proj_heads[tokenid]: | ||||||
|  |  | ||||||
|  | @ -12,6 +12,7 @@ from ..structs cimport TokenC | ||||||
| from .stateclass cimport StateClass | from .stateclass cimport StateClass | ||||||
| from ..typedefs cimport attr_t | from ..typedefs cimport attr_t | ||||||
| from ..compat import json_dumps | from ..compat import json_dumps | ||||||
|  | from ..errors import Errors | ||||||
| from .. import util | from .. import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -80,10 +81,7 @@ cdef class TransitionSystem: | ||||||
|                     action.do(state.c, action.label) |                     action.do(state.c, action.label) | ||||||
|                     break |                     break | ||||||
|             else: |             else: | ||||||
|                 print(gold.words) |                 raise ValueError(Errors.E024) | ||||||
|                 print(gold.ner) |  | ||||||
|                 print(history) |  | ||||||
|                 raise ValueError("Could not find gold move") |  | ||||||
|         return history |         return history | ||||||
| 
 | 
 | ||||||
|     cdef int initialize_state(self, StateC* state) nogil: |     cdef int initialize_state(self, StateC* state) nogil: | ||||||
|  | @ -130,17 +128,7 @@ cdef class TransitionSystem: | ||||||
|             else: |             else: | ||||||
|                 costs[i] = 9000 |                 costs[i] = 9000 | ||||||
|         if n_gold <= 0: |         if n_gold <= 0: | ||||||
|             print(gold.words) |             raise ValueError(Errors.E024) | ||||||
|             print(gold.ner) |  | ||||||
|             print([gold.c.ner[i].clas for i in range(gold.length)]) |  | ||||||
|             print([gold.c.ner[i].move for i in range(gold.length)]) |  | ||||||
|             print([gold.c.ner[i].label for i in range(gold.length)]) |  | ||||||
|             print("Self labels", |  | ||||||
|                   [self.c[i].label for i in range(self.n_moves)]) |  | ||||||
|             raise ValueError( |  | ||||||
|                 "Could not find a gold-standard action to supervise " |  | ||||||
|                 "the entity recognizer. The transition system has " |  | ||||||
|                 "%d actions." % (self.n_moves)) |  | ||||||
| 
 | 
 | ||||||
|     def get_class_name(self, int clas): |     def get_class_name(self, int clas): | ||||||
|         act = self.c[clas] |         act = self.c[clas] | ||||||
|  | @ -162,7 +150,6 @@ cdef class TransitionSystem: | ||||||
|             self._size *= 2 |             self._size *= 2 | ||||||
|             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) |             self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0])) | ||||||
|         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) |         self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id) | ||||||
|         assert self.c[self.n_moves].label == label_id |  | ||||||
|         self.n_moves += 1 |         self.n_moves += 1 | ||||||
|         return 1 |         return 1 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -13,6 +13,7 @@ cimport cython | ||||||
| 
 | 
 | ||||||
| from .tokens.doc cimport Doc | from .tokens.doc cimport Doc | ||||||
| from .strings cimport hash_string | from .strings cimport hash_string | ||||||
|  | from .errors import Errors, Warnings, deprecation_warning | ||||||
| from . import util | from . import util | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -63,11 +64,7 @@ cdef class Tokenizer: | ||||||
|         return (self.__class__, args, None, None) |         return (self.__class__, args, None, None) | ||||||
| 
 | 
 | ||||||
|     cpdef Doc tokens_from_list(self, list strings): |     cpdef Doc tokens_from_list(self, list strings): | ||||||
|         util.deprecated( |         deprecation_warning(Warnings.W002) | ||||||
|             "Tokenizer.from_list is now deprecated. Create a new Doc " |  | ||||||
|             "object instead and pass in the strings as the `words` keyword " |  | ||||||
|             "argument, for example:\nfrom spacy.tokens import Doc\n" |  | ||||||
|             "doc = Doc(nlp.vocab, words=[...])") |  | ||||||
|         return Doc(self.vocab, words=strings) |         return Doc(self.vocab, words=strings) | ||||||
| 
 | 
 | ||||||
|     @cython.boundscheck(False) |     @cython.boundscheck(False) | ||||||
|  | @ -78,8 +75,7 @@ cdef class Tokenizer: | ||||||
|         RETURNS (Doc): A container for linguistic annotations. |         RETURNS (Doc): A container for linguistic annotations. | ||||||
|         """ |         """ | ||||||
|         if len(string) >= (2 ** 30): |         if len(string) >= (2 ** 30): | ||||||
|             msg = "String is too long: %d characters. Max is 2**30." |             raise ValueError(Errors.E025.format(length=len(string))) | ||||||
|             raise ValueError(msg % len(string)) |  | ||||||
|         cdef int length = len(string) |         cdef int length = len(string) | ||||||
|         cdef Doc doc = Doc(self.vocab) |         cdef Doc doc = Doc(self.vocab) | ||||||
|         if length == 0: |         if length == 0: | ||||||
|  |  | ||||||
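Warnings.W002 above replaces the removed deprecation text, which recommended constructing a Doc directly instead of calling Tokenizer.tokens_from_list. A short sketch of that replacement, assuming an nlp pipeline has already been loaded:

    from spacy.tokens import Doc

    # equivalent to the deprecated nlp.tokenizer.tokens_from_list([...])
    doc = Doc(nlp.vocab, words=['hello', 'world'])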
|  | @ -31,7 +31,7 @@ from ..attrs cimport ENT_TYPE, SENT_START | ||||||
| from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t | ||||||
| from ..util import normalize_slice | from ..util import normalize_slice | ||||||
| from ..compat import is_config, copy_reg, pickle, basestring_ | from ..compat import is_config, copy_reg, pickle, basestring_ | ||||||
| from .. import about | from ..errors import Errors, Warnings, deprecation_warning | ||||||
| from .. import util | from .. import util | ||||||
| from .underscore import Underscore | from .underscore import Underscore | ||||||
| from ._retokenize import Retokenizer | from ._retokenize import Retokenizer | ||||||
|  | @ -41,9 +41,9 @@ DEF PADDING = 5 | ||||||
| 
 | 
 | ||||||
| cdef int bounds_check(int i, int length, int padding) except -1: | cdef int bounds_check(int i, int length, int padding) except -1: | ||||||
|     if (i + padding) < 0: |     if (i + padding) < 0: | ||||||
|         raise IndexError |         raise IndexError(Errors.E026.format(i=i, length=length)) | ||||||
|     if (i - padding) >= length: |     if (i - padding) >= length: | ||||||
|         raise IndexError |         raise IndexError(Errors.E026.format(i=i, length=length)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil: | ||||||
|  | @ -98,7 +98,8 @@ cdef class Doc: | ||||||
|     def set_extension(cls, name, default=None, method=None, |     def set_extension(cls, name, default=None, method=None, | ||||||
|                       getter=None, setter=None): |                       getter=None, setter=None): | ||||||
|         nr_defined = sum(t is not None for t in (default, getter, setter, method)) |         nr_defined = sum(t is not None for t in (default, getter, setter, method)) | ||||||
|         assert nr_defined == 1 |         if nr_defined != 1: | ||||||
|  |             raise ValueError(Errors.E083.format(n_args=nr_defined)) | ||||||
|         Underscore.doc_extensions[name] = (default, method, getter, setter) |         Underscore.doc_extensions[name] = (default, method, getter, setter) | ||||||
| 
 | 
 | ||||||
|     @classmethod |     @classmethod | ||||||
|  | @ -155,11 +156,7 @@ cdef class Doc: | ||||||
|             if spaces is None: |             if spaces is None: | ||||||
|                 spaces = [True] * len(words) |                 spaces = [True] * len(words) | ||||||
|             elif len(spaces) != len(words): |             elif len(spaces) != len(words): | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E027) | ||||||
|                     "Arguments 'words' and 'spaces' should be sequences of " |  | ||||||
|                     "the same length, or 'spaces' should be left default at " |  | ||||||
|                     "None. spaces should be a sequence of booleans, with True " |  | ||||||
|                     "meaning that the word owns a ' ' character following it.") |  | ||||||
|             orths_and_spaces = zip(words, spaces) |             orths_and_spaces = zip(words, spaces) | ||||||
|         if orths_and_spaces is not None: |         if orths_and_spaces is not None: | ||||||
|             for orth_space in orths_and_spaces: |             for orth_space in orths_and_spaces: | ||||||
|  | @ -167,10 +164,7 @@ cdef class Doc: | ||||||
|                     orth = orth_space |                     orth = orth_space | ||||||
|                     has_space = True |                     has_space = True | ||||||
|                 elif isinstance(orth_space, bytes): |                 elif isinstance(orth_space, bytes): | ||||||
|                     raise ValueError( |                     raise ValueError(Errors.E028.format(value=orth_space)) | ||||||
|                         "orths_and_spaces expects either List(unicode) or " |  | ||||||
|                         "List((unicode, bool)). " |  | ||||||
|                         "Got bytes instance: %s" % (str(orth_space))) |  | ||||||
|                 else: |                 else: | ||||||
|                     orth, has_space = orth_space |                     orth, has_space = orth_space | ||||||
|                 # Note that we pass self.mem here --- we have ownership, if LexemeC |                 # Note that we pass self.mem here --- we have ownership, if LexemeC | ||||||
|  | @ -504,11 +498,7 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if not self.is_parsed: |             if not self.is_parsed: | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E029) | ||||||
|                     "noun_chunks requires the dependency parse, which " |  | ||||||
|                     "requires a statistical model to be installed and loaded. " |  | ||||||
|                     "For more info, see the " |  | ||||||
|                     "documentation: \n%s\n" % about.__docs_models__) |  | ||||||
|             # Accumulate the result before beginning to iterate over it. This |             # Accumulate the result before beginning to iterate over it. This | ||||||
|             # prevents the tokenisation from being changed out from under us |             # prevents the tokenisation from being changed out from under us | ||||||
|             # during the iteration. The tricky thing here is that Span accepts |             # during the iteration. The tricky thing here is that Span accepts | ||||||
|  | @ -533,12 +523,7 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if not self.is_sentenced: |             if not self.is_sentenced: | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E030) | ||||||
|                     "Sentence boundaries unset. You can add the 'sentencizer' " |  | ||||||
|                     "component to the pipeline with: " |  | ||||||
|                     "nlp.add_pipe(nlp.create_pipe('sentencizer')) " |  | ||||||
|                     "Alternatively, add the dependency parser, or set " |  | ||||||
|                     "sentence boundaries by setting doc[i].sent_start") |  | ||||||
|             if 'sents' in self.user_hooks: |             if 'sents' in self.user_hooks: | ||||||
|                 yield from self.user_hooks['sents'](self) |                 yield from self.user_hooks['sents'](self) | ||||||
|             else: |             else: | ||||||
|  | @ -568,7 +553,8 @@ cdef class Doc: | ||||||
|             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy |             t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy | ||||||
|         t.l_edge = self.length |         t.l_edge = self.length | ||||||
|         t.r_edge = self.length |         t.r_edge = self.length | ||||||
|         assert t.lex.orth != 0 |         if t.lex.orth == 0: | ||||||
|  |             raise ValueError(Errors.E031.format(i=self.length)) | ||||||
|         t.spacy = has_space |         t.spacy = has_space | ||||||
|         self.length += 1 |         self.length += 1 | ||||||
|         return t.idx + t.lex.length + t.spacy |         return t.idx + t.lex.length + t.spacy | ||||||
|  | @ -684,13 +670,7 @@ cdef class Doc: | ||||||
| 
 | 
 | ||||||
|     def from_array(self, attrs, array): |     def from_array(self, attrs, array): | ||||||
|         if SENT_START in attrs and HEAD in attrs: |         if SENT_START in attrs and HEAD in attrs: | ||||||
|             raise ValueError( |             raise ValueError(Errors.E032) | ||||||
|                 "Conflicting attributes specified in doc.from_array(): " |  | ||||||
|                 "(HEAD, SENT_START)\n" |  | ||||||
|                 "The HEAD attribute currently sets sentence boundaries " |  | ||||||
|                 "implicitly, based on the tree structure. This means the HEAD " |  | ||||||
|                 "attribute would potentially override the sentence boundaries " |  | ||||||
|                 "set by SENT_START.") |  | ||||||
|         cdef int i, col |         cdef int i, col | ||||||
|         cdef attr_id_t attr_id |         cdef attr_id_t attr_id | ||||||
|         cdef TokenC* tokens = self.c |         cdef TokenC* tokens = self.c | ||||||
|  | @ -828,7 +808,7 @@ cdef class Doc: | ||||||
|         RETURNS (Doc): Itself. |         RETURNS (Doc): Itself. | ||||||
|         """ |         """ | ||||||
|         if self.length != 0: |         if self.length != 0: | ||||||
|             raise ValueError("Cannot load into non-empty Doc") |             raise ValueError(Errors.E033.format(length=self.length)) | ||||||
|         deserializers = { |         deserializers = { | ||||||
|             'text': lambda b: None, |             'text': lambda b: None, | ||||||
|             'array_head': lambda b: None, |             'array_head': lambda b: None, | ||||||
|  | @ -916,10 +896,7 @@ cdef class Doc: | ||||||
|         """ |         """ | ||||||
|         cdef unicode tag, lemma, ent_type |         cdef unicode tag, lemma, ent_type | ||||||
|         if len(args) == 3: |         if len(args) == 3: | ||||||
|             util.deprecated( |             deprecation_warning(Warnings.W003) | ||||||
|                 "Positional arguments to Doc.merge are deprecated. Instead, " |  | ||||||
|                 "use the keyword arguments, for example tag=, lemma= or " |  | ||||||
|                 "ent_type=.") |  | ||||||
|             tag, lemma, ent_type = args |             tag, lemma, ent_type = args | ||||||
|             attributes[TAG] = tag |             attributes[TAG] = tag | ||||||
|             attributes[LEMMA] = lemma |             attributes[LEMMA] = lemma | ||||||
|  | @ -933,13 +910,9 @@ cdef class Doc: | ||||||
|             if 'ent_type' in attributes: |             if 'ent_type' in attributes: | ||||||
|                 attributes[ENT_TYPE] = attributes['ent_type'] |                 attributes[ENT_TYPE] = attributes['ent_type'] | ||||||
|         elif args: |         elif args: | ||||||
|             raise ValueError( |             raise ValueError(Errors.E034.format(n_args=len(args), | ||||||
|                 "Doc.merge received %d non-keyword arguments. Expected either " |                                                 args=repr(args), | ||||||
|                 "3 arguments (deprecated), or 0 (use keyword arguments). " |                                                 kwargs=repr(attributes))) | ||||||
|                 "Arguments supplied:\n%s\n" |  | ||||||
|                 "Keyword arguments: %s\n" % (len(args), repr(args), |  | ||||||
|                                              repr(attributes))) |  | ||||||
| 
 |  | ||||||
|         # More deprecated attribute handling =/ |         # More deprecated attribute handling =/ | ||||||
|         if 'label' in attributes: |         if 'label' in attributes: | ||||||
|             attributes['ent_type'] = attributes.pop('label') |             attributes['ent_type'] = attributes.pop('label') | ||||||
|  |  | ||||||
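The new check in Doc.set_extension above (E083) enforces that exactly one of default, method, getter and setter is supplied when a custom attribute is registered. A short usage sketch; the extension name is illustrative:

    from spacy.tokens import Doc

    # valid: exactly one of default/method/getter/setter is given
    Doc.set_extension('is_greeting', default=False)

    # invalid: zero (or more than one) of the four arguments now raises E083
    # Doc.set_extension('is_greeting')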
|  | @ -16,7 +16,7 @@ from ..util import normalize_slice | ||||||
| from ..attrs cimport IS_PUNCT, IS_SPACE | from ..attrs cimport IS_PUNCT, IS_SPACE | ||||||
| from ..lexeme cimport Lexeme | from ..lexeme cimport Lexeme | ||||||
| from ..compat import is_config | from ..compat import is_config | ||||||
| from .. import about | from ..errors import Errors, TempErrors | ||||||
| from .underscore import Underscore | from .underscore import Underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -48,8 +48,7 @@ cdef class Span: | ||||||
|         RETURNS (Span): The newly constructed object. |         RETURNS (Span): The newly constructed object. | ||||||
|         """ |         """ | ||||||
|         if not (0 <= start <= end <= len(doc)): |         if not (0 <= start <= end <= len(doc)): | ||||||
|             raise IndexError |             raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc))) | ||||||
| 
 |  | ||||||
|         self.doc = doc |         self.doc = doc | ||||||
|         self.start = start |         self.start = start | ||||||
|         self.start_char = self.doc[start].idx if start < self.doc.length else 0 |         self.start_char = self.doc[start].idx if start < self.doc.length else 0 | ||||||
|  | @ -58,7 +57,8 @@ cdef class Span: | ||||||
|             self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) |             self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1]) | ||||||
|         else: |         else: | ||||||
|             self.end_char = 0 |             self.end_char = 0 | ||||||
|         assert label in doc.vocab.strings, label |         if label not in doc.vocab.strings: | ||||||
|  |             raise ValueError(Errors.E084.format(label=label)) | ||||||
|         self.label = label |         self.label = label | ||||||
|         self._vector = vector |         self._vector = vector | ||||||
|         self._vector_norm = vector_norm |         self._vector_norm = vector_norm | ||||||
|  | @ -267,11 +267,10 @@ cdef class Span: | ||||||
|         or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: |         or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char: | ||||||
|             start = token_by_start(self.doc.c, self.doc.length, self.start_char) |             start = token_by_start(self.doc.c, self.doc.length, self.start_char) | ||||||
|             if self.start == -1: |             if self.start == -1: | ||||||
|                 raise IndexError("Error calculating span: Can't find start") |                 raise IndexError(Errors.E036.format(start=self.start_char)) | ||||||
|             end = token_by_end(self.doc.c, self.doc.length, self.end_char) |             end = token_by_end(self.doc.c, self.doc.length, self.end_char) | ||||||
|             if end == -1: |             if end == -1: | ||||||
|                 raise IndexError("Error calculating span: Can't find end") |                 raise IndexError(Errors.E037.format(end=self.end_char)) | ||||||
| 
 |  | ||||||
|             self.start = start |             self.start = start | ||||||
|             self.end = end + 1 |             self.end = end + 1 | ||||||
| 
 | 
 | ||||||
|  | @ -293,7 +292,7 @@ cdef class Span: | ||||||
|                 root += root.head |                 root += root.head | ||||||
|                 n += 1 |                 n += 1 | ||||||
|                 if n >= self.doc.length: |                 if n >= self.doc.length: | ||||||
|                     raise RuntimeError |                     raise RuntimeError(Errors.E038) | ||||||
|             return self.doc[root.l_edge:root.r_edge + 1] |             return self.doc[root.l_edge:root.r_edge + 1] | ||||||
| 
 | 
 | ||||||
|     property has_vector: |     property has_vector: | ||||||
|  | @ -376,11 +375,7 @@ cdef class Span: | ||||||
|         """ |         """ | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             if not self.doc.is_parsed: |             if not self.doc.is_parsed: | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E029) | ||||||
|                     "noun_chunks requires the dependency parse, which " |  | ||||||
|                     "requires a statistical model to be installed and loaded. " |  | ||||||
|                     "For more info, see the " |  | ||||||
|                     "documentation: \n%s\n" % about.__docs_models__) |  | ||||||
|             # Accumulate the result before beginning to iterate over it. This |             # Accumulate the result before beginning to iterate over it. This | ||||||
|             # prevents the tokenisation from being changed out from under us |             # prevents the tokenisation from being changed out from under us | ||||||
|             # during the iteration. The tricky thing here is that Span accepts |             # during the iteration. The tricky thing here is that Span accepts | ||||||
|  | @ -526,9 +521,7 @@ cdef class Span: | ||||||
|             return self.root.ent_id |             return self.root.ent_id | ||||||
| 
 | 
 | ||||||
|         def __set__(self, hash_t key): |         def __set__(self, hash_t key): | ||||||
|             raise NotImplementedError( |             raise NotImplementedError(TempErrors.T007.format(attr='ent_id')) | ||||||
|                 "Can't yet set ent_id from Span. Vote for this feature on " |  | ||||||
|                 "the issue tracker: http://github.com/explosion/spaCy/issues") |  | ||||||
| 
 | 
 | ||||||
|     property ent_id_: |     property ent_id_: | ||||||
|         """RETURNS (unicode): The (string) entity ID.""" |         """RETURNS (unicode): The (string) entity ID.""" | ||||||
|  | @ -536,9 +529,7 @@ cdef class Span: | ||||||
|             return self.root.ent_id_ |             return self.root.ent_id_ | ||||||
| 
 | 
 | ||||||
|         def __set__(self, hash_t key): |         def __set__(self, hash_t key): | ||||||
|             raise NotImplementedError( |             raise NotImplementedError(TempErrors.T007.format(attr='ent_id_')) | ||||||
|                 "Can't yet set ent_id_ from Span. Vote for this feature on the " |  | ||||||
|                 "issue tracker: http://github.com/explosion/spaCy/issues") |  | ||||||
| 
 | 
 | ||||||
|     property orth_: |     property orth_: | ||||||
|         """Verbatim text content (identical to Span.text). Exists mostly for |         """Verbatim text content (identical to Span.text). Exists mostly for | ||||||
|  | @ -586,9 +577,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1: | ||||||
|         token += token.head |         token += token.head | ||||||
|         n += 1 |         n += 1 | ||||||
|         if n >= sent_length: |         if n >= sent_length: | ||||||
|             raise RuntimeError( |             raise RuntimeError(Errors.E039) | ||||||
|                 "Array bounds exceeded while searching for root word. This " |  | ||||||
|                 "likely means the parse tree is in an invalid state. Please " |  | ||||||
|                 "report this issue here: " |  | ||||||
|                 "http://github.com/explosion/spaCy/issues") |  | ||||||
|     return n |     return n | ||||||
|  |  | ||||||
|  | @ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t | ||||||
| from ..parts_of_speech cimport univ_pos_t | from ..parts_of_speech cimport univ_pos_t | ||||||
| from .doc cimport Doc | from .doc cimport Doc | ||||||
| from ..lexeme cimport Lexeme | from ..lexeme cimport Lexeme | ||||||
|  | from ..errors import Errors | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| cdef class Token: | cdef class Token: | ||||||
|  | @ -17,8 +18,7 @@ cdef class Token: | ||||||
|     @staticmethod |     @staticmethod | ||||||
|     cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc): |     cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc): | ||||||
|         if offset < 0 or offset >= doc.length: |         if offset < 0 or offset >= doc.length: | ||||||
|             msg = "Attempt to access token at %d, max length %d" |             raise IndexError(Errors.E040.format(i=offset, max_length=doc.length)) | ||||||
|             raise IndexError(msg % (offset, doc.length)) |  | ||||||
|         cdef Token self = Token.__new__(Token, vocab, doc, offset) |         cdef Token self = Token.__new__(Token, vocab, doc, offset) | ||||||
|         return self |         return self | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -19,8 +19,8 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM | ||||||
| from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX | from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX | ||||||
| from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP | from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP | ||||||
| from ..compat import is_config | from ..compat import is_config | ||||||
|  | from ..errors import Errors | ||||||
| from .. import util | from .. import util | ||||||
| from .. import about |  | ||||||
| from .underscore import Underscore | from .underscore import Underscore | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -106,7 +106,7 @@ cdef class Token: | ||||||
|         elif op == 5: |         elif op == 5: | ||||||
|             return my >= their |             return my >= their | ||||||
|         else: |         else: | ||||||
|             raise ValueError(op) |             raise ValueError(Errors.E041.format(op=op)) | ||||||
| 
 | 
 | ||||||
|     @property |     @property | ||||||
|     def _(self): |     def _(self): | ||||||
|  | @ -135,8 +135,7 @@ cdef class Token: | ||||||
|         RETURNS (Token): The token at position `self.doc[self.i+i]`. |         RETURNS (Token): The token at position `self.doc[self.i+i]`. | ||||||
|         """ |         """ | ||||||
|         if self.i+i < 0 or (self.i+i >= len(self.doc)): |         if self.i+i < 0 or (self.i+i >= len(self.doc)): | ||||||
|             msg = "Error accessing doc[%d].nbor(%d), for doc of length %d" |             raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc))) | ||||||
|             raise IndexError(msg % (self.i, i, len(self.doc))) |  | ||||||
|         return self.doc[self.i+i] |         return self.doc[self.i+i] | ||||||
| 
 | 
 | ||||||
|     def similarity(self, other): |     def similarity(self, other): | ||||||
|  | @ -352,14 +351,7 @@ cdef class Token: | ||||||
| 
 | 
 | ||||||
|     property sent_start: |     property sent_start: | ||||||
|         def __get__(self): |         def __get__(self): | ||||||
|             # Raising a deprecation warning causes errors for autocomplete |             # Raising a deprecation warning here causes errors for autocomplete | ||||||
|             #util.deprecated( |  | ||||||
|             #    "Token.sent_start is now deprecated. Use Token.is_sent_start " |  | ||||||
|             #    "instead, which returns a boolean value or None if the answer " |  | ||||||
|             #    "is unknown – instead of a misleading 0 for False and 1 for " |  | ||||||
|             #    "True. It also fixes a quirk in the old logic that would " |  | ||||||
|             #    "always set the property to 0 for the first word of the " |  | ||||||
|             #    "document.") |  | ||||||
|             # Handle broken backwards compatibility case: doc[0].sent_start |             # Handle broken backwards compatibility case: doc[0].sent_start | ||||||
|             # was False. |             # was False. | ||||||
|             if self.i == 0: |             if self.i == 0: | ||||||
|  | @ -384,9 +376,7 @@ cdef class Token: | ||||||
| 
 | 
 | ||||||
|         def __set__(self, value): |         def __set__(self, value): | ||||||
|             if self.doc.is_parsed: |             if self.doc.is_parsed: | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E043) | ||||||
|                     "Refusing to write to token.sent_start if its document " |  | ||||||
|                     "is parsed, because this may cause inconsistent state.") |  | ||||||
|             if value is None: |             if value is None: | ||||||
|                 self.c.sent_start = 0 |                 self.c.sent_start = 0 | ||||||
|             elif value is True: |             elif value is True: | ||||||
|  | @ -394,8 +384,7 @@ cdef class Token: | ||||||
|             elif value is False: |             elif value is False: | ||||||
|                 self.c.sent_start = -1 |                 self.c.sent_start = -1 | ||||||
|             else: |             else: | ||||||
|                 raise ValueError("Invalid value for token.sent_start. Must be " |                 raise ValueError(Errors.E044.format(value=value)) | ||||||
|                                  "one of: None, True, False") |  | ||||||
| 
 | 
 | ||||||
|     property lefts: |     property lefts: | ||||||
|         """The leftward immediate children of the word, in the syntactic |         """The leftward immediate children of the word, in the syntactic | ||||||
|  | @ -413,8 +402,7 @@ cdef class Token: | ||||||
|                 nr_iter += 1 |                 nr_iter += 1 | ||||||
|                 # This is ugly, but it's a way to guard out infinite loops |                 # This is ugly, but it's a way to guard out infinite loops | ||||||
|                 if nr_iter >= 10000000: |                 if nr_iter >= 10000000: | ||||||
|                     raise RuntimeError("Possibly infinite loop encountered " |                     raise RuntimeError(Errors.E045.format(attr='token.lefts')) | ||||||
|                                        "while looking for token.lefts") |  | ||||||
| 
 | 
 | ||||||
|     property rights: |     property rights: | ||||||
|         """The rightward immediate children of the word, in the syntactic |         """The rightward immediate children of the word, in the syntactic | ||||||
|  | @ -432,8 +420,7 @@ cdef class Token: | ||||||
|                 ptr -= 1 |                 ptr -= 1 | ||||||
|                 nr_iter += 1 |                 nr_iter += 1 | ||||||
|                 if nr_iter >= 10000000: |                 if nr_iter >= 10000000: | ||||||
|                     raise RuntimeError("Possibly infinite loop encountered " |                     raise RuntimeError(Errors.E045.format(attr='token.rights')) | ||||||
|                                        "while looking for token.rights") |  | ||||||
|             tokens.reverse() |             tokens.reverse() | ||||||
|             for t in tokens: |             for t in tokens: | ||||||
|                 yield t |                 yield t | ||||||
|  |  | ||||||
|  | @ -3,6 +3,8 @@ from __future__ import unicode_literals | ||||||
| 
 | 
 | ||||||
| import functools | import functools | ||||||
| 
 | 
 | ||||||
|  | from ..errors import Errors | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| class Underscore(object): | class Underscore(object): | ||||||
|     doc_extensions = {} |     doc_extensions = {} | ||||||
|  | @ -23,7 +25,7 @@ class Underscore(object): | ||||||
| 
 | 
 | ||||||
|     def __getattr__(self, name): |     def __getattr__(self, name): | ||||||
|         if name not in self._extensions: |         if name not in self._extensions: | ||||||
|             raise AttributeError(name) |             raise AttributeError(Errors.E046.format(name=name)) | ||||||
|         default, method, getter, setter = self._extensions[name] |         default, method, getter, setter = self._extensions[name] | ||||||
|         if getter is not None: |         if getter is not None: | ||||||
|             return getter(self._obj) |             return getter(self._obj) | ||||||
|  | @ -34,7 +36,7 @@ class Underscore(object): | ||||||
| 
 | 
 | ||||||
|     def __setattr__(self, name, value): |     def __setattr__(self, name, value): | ||||||
|         if name not in self._extensions: |         if name not in self._extensions: | ||||||
|             raise AttributeError(name) |             raise AttributeError(Errors.E047.format(name=name)) | ||||||
|         default, method, getter, setter = self._extensions[name] |         default, method, getter, setter = self._extensions[name] | ||||||
|         if setter is not None: |         if setter is not None: | ||||||
|             return setter(self._obj, value) |             return setter(self._obj, value) | ||||||
|  |  | ||||||
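Underscore.__getattr__ and __setattr__ now raise E046 and E047 for extension attributes that were never registered. A small sketch of how this surfaces through the `._` attribute space; the extension names are illustrative:

    from spacy.tokens import Doc
    from spacy.vocab import Vocab

    Doc.set_extension('colour', default=None)
    doc = Doc(Vocab(), words=['a', 'b'])

    doc._.colour = 'blue'     # fine: 'colour' is registered
    doc._.flavour = 'sweet'   # AttributeError (E047): never registered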
|  | @ -11,8 +11,6 @@ import sys | ||||||
| import textwrap | import textwrap | ||||||
| import random | import random | ||||||
| from collections import OrderedDict | from collections import OrderedDict | ||||||
| import inspect |  | ||||||
| import warnings |  | ||||||
| from thinc.neural._classes.model import Model | from thinc.neural._classes.model import Model | ||||||
| import functools | import functools | ||||||
| import cytoolz | import cytoolz | ||||||
|  | @ -22,6 +20,7 @@ import numpy.random | ||||||
| from .symbols import ORTH | from .symbols import ORTH | ||||||
| from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ | from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_ | ||||||
| from .compat import import_file | from .compat import import_file | ||||||
|  | from .errors import Errors | ||||||
| 
 | 
 | ||||||
| # Import these directly from Thinc, so that we're sure we always have the | # Import these directly from Thinc, so that we're sure we always have the | ||||||
| # same version. | # same version. | ||||||
|  | @ -50,8 +49,7 @@ def get_lang_class(lang): | ||||||
|         try: |         try: | ||||||
|             module = importlib.import_module('.lang.%s' % lang, 'spacy') |             module = importlib.import_module('.lang.%s' % lang, 'spacy') | ||||||
|         except ImportError: |         except ImportError: | ||||||
|             msg = "Can't import language %s from spacy.lang." |             raise ImportError(Errors.E048.format(lang=lang)) | ||||||
|             raise ImportError(msg % lang) |  | ||||||
|         LANGUAGES[lang] = getattr(module, module.__all__[0]) |         LANGUAGES[lang] = getattr(module, module.__all__[0]) | ||||||
|     return LANGUAGES[lang] |     return LANGUAGES[lang] | ||||||
| 
 | 
 | ||||||
|  | @ -108,7 +106,7 @@ def load_model(name, **overrides): | ||||||
|     """ |     """ | ||||||
|     data_path = get_data_path() |     data_path = get_data_path() | ||||||
|     if not data_path or not data_path.exists(): |     if not data_path or not data_path.exists(): | ||||||
|         raise IOError("Can't find spaCy data path: %s" % path2str(data_path)) |         raise IOError(Errors.E049.format(path=path2str(data_path))) | ||||||
|     if isinstance(name, basestring_):  # in data dir / shortcut |     if isinstance(name, basestring_):  # in data dir / shortcut | ||||||
|         if name in set([d.name for d in data_path.iterdir()]): |         if name in set([d.name for d in data_path.iterdir()]): | ||||||
|             return load_model_from_link(name, **overrides) |             return load_model_from_link(name, **overrides) | ||||||
|  | @ -118,7 +116,7 @@ def load_model(name, **overrides): | ||||||
|             return load_model_from_path(Path(name), **overrides) |             return load_model_from_path(Path(name), **overrides) | ||||||
|     elif hasattr(name, 'exists'):  # Path or Path-like to model data |     elif hasattr(name, 'exists'):  # Path or Path-like to model data | ||||||
|         return load_model_from_path(name, **overrides) |         return load_model_from_path(name, **overrides) | ||||||
|     raise IOError("Can't find model '%s'" % name) |     raise IOError(Errors.E050.format(name=name)) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def load_model_from_link(name, **overrides): | def load_model_from_link(name, **overrides): | ||||||
|  | @ -127,9 +125,7 @@ def load_model_from_link(name, **overrides): | ||||||
|     try: |     try: | ||||||
|         cls = import_file(name, path) |         cls = import_file(name, path) | ||||||
|     except AttributeError: |     except AttributeError: | ||||||
|         raise IOError( |         raise IOError(Errors.E051.format(name=name)) | ||||||
|             "Cant' load '%s'. If you're using a shortcut link, make sure it " |  | ||||||
|             "points to a valid package (not just a data directory)." % name) |  | ||||||
|     return cls.load(**overrides) |     return cls.load(**overrides) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -173,8 +169,7 @@ def load_model_from_init_py(init_file, **overrides): | ||||||
|     data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) |     data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version']) | ||||||
|     data_path = model_path / data_dir |     data_path = model_path / data_dir | ||||||
|     if not model_path.exists(): |     if not model_path.exists(): | ||||||
|         msg = "Can't find model directory: %s" |         raise IOError(Errors.E052.format(path=path2str(data_path))) | ||||||
|         raise ValueError(msg % path2str(data_path)) |  | ||||||
|     return load_model_from_path(data_path, meta, **overrides) |     return load_model_from_path(data_path, meta, **overrides) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -186,16 +181,14 @@ def get_model_meta(path): | ||||||
|     """ |     """ | ||||||
|     model_path = ensure_path(path) |     model_path = ensure_path(path) | ||||||
|     if not model_path.exists(): |     if not model_path.exists(): | ||||||
|         msg = "Can't find model directory: %s" |         raise IOError(Errors.E052.format(path=path2str(model_path))) | ||||||
|         raise ValueError(msg % path2str(model_path)) |  | ||||||
|     meta_path = model_path / 'meta.json' |     meta_path = model_path / 'meta.json' | ||||||
|     if not meta_path.is_file(): |     if not meta_path.is_file(): | ||||||
|         raise IOError("Could not read meta.json from %s" % meta_path) |         raise IOError(Errors.E053.format(path=meta_path)) | ||||||
|     meta = read_json(meta_path) |     meta = read_json(meta_path) | ||||||
|     for setting in ['lang', 'name', 'version']: |     for setting in ['lang', 'name', 'version']: | ||||||
|         if setting not in meta or not meta[setting]: |         if setting not in meta or not meta[setting]: | ||||||
|             msg = "No valid '%s' setting found in model meta.json" |             raise ValueError(Errors.E054.format(setting=setting)) | ||||||
|             raise ValueError(msg % setting) |  | ||||||
|     return meta |     return meta | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -339,13 +332,10 @@ def update_exc(base_exceptions, *addition_dicts): | ||||||
|         for orth, token_attrs in additions.items(): |         for orth, token_attrs in additions.items(): | ||||||
|             if not all(isinstance(attr[ORTH], unicode_) |             if not all(isinstance(attr[ORTH], unicode_) | ||||||
|                        for attr in token_attrs): |                        for attr in token_attrs): | ||||||
|                 msg = "Invalid ORTH value in exception: key='%s', orths='%s'" |                 raise ValueError(Errors.E055.format(key=orth, orths=token_attrs)) | ||||||
|                 raise ValueError(msg % (orth, token_attrs)) |  | ||||||
|             described_orth = ''.join(attr[ORTH] for attr in token_attrs) |             described_orth = ''.join(attr[ORTH] for attr in token_attrs) | ||||||
|             if orth != described_orth: |             if orth != described_orth: | ||||||
|                 msg = ("Invalid tokenizer exception: ORTH values combined " |                 raise ValueError(Errors.E056.format(key=orth, orths=described_orth)) | ||||||
|                        "don't match original string. key='%s', orths='%s'") |  | ||||||
|                 raise ValueError(msg % (orth, described_orth)) |  | ||||||
|         exc.update(additions) |         exc.update(additions) | ||||||
|     exc = expand_exc(exc, "'", "’") |     exc = expand_exc(exc, "'", "’") | ||||||
|     return exc |     return exc | ||||||
|  | @ -375,8 +365,7 @@ def expand_exc(excs, search, replace): | ||||||
| 
 | 
 | ||||||
| def normalize_slice(length, start, stop, step=None): | def normalize_slice(length, start, stop, step=None): | ||||||
|     if not (step is None or step == 1): |     if not (step is None or step == 1): | ||||||
|         raise ValueError("Stepped slices not supported in Span objects." |         raise ValueError(Errors.E057) | ||||||
|                          "Try: list(tokens)[start:stop:step] instead.") |  | ||||||
|     if start is None: |     if start is None: | ||||||
|         start = 0 |         start = 0 | ||||||
|     elif start < 0: |     elif start < 0: | ||||||
|  | @ -387,7 +376,6 @@ def normalize_slice(length, start, stop, step=None): | ||||||
|     elif stop < 0: |     elif stop < 0: | ||||||
|         stop += length |         stop += length | ||||||
|     stop = min(length, max(start, stop)) |     stop = min(length, max(start, stop)) | ||||||
|     assert 0 <= start <= stop <= length |  | ||||||
|     return start, stop |     return start, stop | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -524,18 +512,6 @@ def from_disk(path, readers, exclude): | ||||||
|     return path |     return path | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def deprecated(message, filter='always'): |  | ||||||
|     """Show a deprecation warning. |  | ||||||
| 
 |  | ||||||
|     message (unicode): The message to display. |  | ||||||
|     filter (unicode): Filter value. |  | ||||||
|     """ |  | ||||||
|     stack = inspect.stack()[-1] |  | ||||||
|     with warnings.catch_warnings(): |  | ||||||
|         warnings.simplefilter(filter, DeprecationWarning) |  | ||||||
|         warnings.warn_explicit(message, DeprecationWarning, stack[1], stack[2]) |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| def print_table(data, title=None): | def print_table(data, title=None): | ||||||
|     """Print data in table format. |     """Print data in table format. | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
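The util.deprecated() helper removed above has a counterpart, deprecation_warning(), imported from spacy.errors at several call sites in this diff. Since that module is not shown here, the following is only a guess at a minimal equivalent, mirroring the removed helper:

    import warnings

    def deprecation_warning(message):
        # simplified stand-in for the relocated helper: emit the message as a
        # DeprecationWarning. The removed util.deprecated() additionally used
        # inspect.stack() to attribute the warning to the caller's call site.
        warnings.warn(message, DeprecationWarning, stacklevel=2)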
|  | @ -14,6 +14,7 @@ from thinc.neural._classes.model import Model | ||||||
| 
 | 
 | ||||||
| from .strings cimport StringStore, hash_string | from .strings cimport StringStore, hash_string | ||||||
| from .compat import basestring_, path2str | from .compat import basestring_, path2str | ||||||
|  | from .errors import Errors | ||||||
| from . import util | from . import util | ||||||
| 
 | 
 | ||||||
| from cython.operator cimport dereference as deref | from cython.operator cimport dereference as deref | ||||||
|  | @ -114,7 +115,7 @@ cdef class Vectors: | ||||||
|         """ |         """ | ||||||
|         i = self.key2row[key] |         i = self.key2row[key] | ||||||
|         if i is None: |         if i is None: | ||||||
|             raise KeyError(key) |             raise KeyError(Errors.E058.format(key=key)) | ||||||
|         else: |         else: | ||||||
|             return self.data[i] |             return self.data[i] | ||||||
| 
 | 
 | ||||||
|  | @ -215,7 +216,8 @@ cdef class Vectors: | ||||||
|         RETURNS: The requested key, keys, row or rows. |         RETURNS: The requested key, keys, row or rows. | ||||||
|         """ |         """ | ||||||
|         if sum(arg is None for arg in (key, keys, row, rows)) != 3: |         if sum(arg is None for arg in (key, keys, row, rows)) != 3: | ||||||
|             raise ValueError("One (and only one) keyword arg must be set.") |             bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows} | ||||||
|  |             raise ValueError(Errors.E059.format(kwargs=bad_kwargs)) | ||||||
|         xp = get_array_module(self.data) |         xp = get_array_module(self.data) | ||||||
|         if key is not None: |         if key is not None: | ||||||
|             if isinstance(key, basestring_): |             if isinstance(key, basestring_): | ||||||
|  | @ -254,9 +256,9 @@ cdef class Vectors: | ||||||
|             row = self.key2row[key] |             row = self.key2row[key] | ||||||
|         elif row is None: |         elif row is None: | ||||||
|             if self.is_full: |             if self.is_full: | ||||||
|                 raise ValueError("Cannot add new key to vectors -- full") |                 raise ValueError(Errors.E060.format(rows=self.data.shape[0], | ||||||
|  |                                                     cols=self.data.shape[1])) | ||||||
|             row = deref(self._unset.begin()) |             row = deref(self._unset.begin()) | ||||||
| 
 |  | ||||||
|         self.key2row[key] = row |         self.key2row[key] = row | ||||||
|         if vector is not None: |         if vector is not None: | ||||||
|             self.data[row] = vector |             self.data[row] = vector | ||||||
|  | @ -318,7 +320,7 @@ cdef class Vectors: | ||||||
|                 width = int(dims) |                 width = int(dims) | ||||||
|                 break |                 break | ||||||
|         else: |         else: | ||||||
|             raise IOError("Expected file named e.g. vectors.128.f.bin") |             raise IOError(Errors.E061.format(filename=path)) | ||||||
|         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, |         bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims, | ||||||
|                                                              dtype=dtype) |                                                              dtype=dtype) | ||||||
|         xp = get_array_module(self.data) |         xp = get_array_module(self.data) | ||||||
|  |  | ||||||
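In the vectors.pyx changes above (and the vocab.pyx changes that follow), inline message strings are swapped for numbered templates looked up on an `Errors` container imported from the new `spacy.errors` module. That module itself is not part of this diff view; the sketch below is only an illustration of the pattern: the attribute names and the `.format()` call sites come from the hunks, while the message wording is made up.

    # hypothetical sketch of the Errors container referenced in these hunks
    class Errors(object):
        # illustrative wording only; the real templates live in spacy/errors.py
        E058 = "Could not find key {key} in the vectors table."
        E059 = "One (and only one) keyword arg must be set. Got: {kwargs}"
        E060 = ("Cannot add a new key to a full vectors table "
                "({rows} rows, {cols} columns).")

    # call sites interpolate the details only when the error is raised, e.g.
    # raise KeyError(Errors.E058.format(key=key))
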
|  | @ -16,6 +16,7 @@ from .attrs cimport PROB, LANG, ORTH, TAG | ||||||
| from .structs cimport SerializedLexemeC | from .structs cimport SerializedLexemeC | ||||||
| 
 | 
 | ||||||
| from .compat import copy_reg, basestring_ | from .compat import copy_reg, basestring_ | ||||||
|  | from .errors import Errors | ||||||
| from .lemmatizer import Lemmatizer | from .lemmatizer import Lemmatizer | ||||||
| from .attrs import intify_attrs | from .attrs import intify_attrs | ||||||
| from .vectors import Vectors | from .vectors import Vectors | ||||||
|  | @ -100,15 +101,9 @@ cdef class Vocab: | ||||||
|                     flag_id = bit |                     flag_id = bit | ||||||
|                     break |                     break | ||||||
|             else: |             else: | ||||||
|                 raise ValueError( |                 raise ValueError(Errors.E062) | ||||||
|                     "Cannot find empty bit for new lexical flag. All bits " |  | ||||||
|                     "between 0 and 63 are occupied. You can replace one by " |  | ||||||
|                     "specifying the flag_id explicitly, e.g. " |  | ||||||
|                     "`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.") |  | ||||||
|         elif flag_id >= 64 or flag_id < 1: |         elif flag_id >= 64 or flag_id < 1: | ||||||
|             raise ValueError( |             raise ValueError(Errors.E063.format(value=flag_id)) | ||||||
|                 "Invalid value for flag_id: %d. Flag IDs must be between " |  | ||||||
|                 "1 and 63 (inclusive)" % flag_id) |  | ||||||
|         for lex in self: |         for lex in self: | ||||||
|             lex.set_flag(flag_id, flag_getter(lex.orth_)) |             lex.set_flag(flag_id, flag_getter(lex.orth_)) | ||||||
|         self.lex_attr_getters[flag_id] = flag_getter |         self.lex_attr_getters[flag_id] = flag_getter | ||||||
|  | @ -127,8 +122,9 @@ cdef class Vocab: | ||||||
|         cdef size_t addr |         cdef size_t addr | ||||||
|         if lex != NULL: |         if lex != NULL: | ||||||
|             if lex.orth != self.strings[string]: |             if lex.orth != self.strings[string]: | ||||||
|                 raise LookupError.mismatched_strings( |                 raise KeyError(Errors.E064.format(string=lex.orth, | ||||||
|                     lex.orth, self.strings[string], string) |                                                   orth=self.strings[string], | ||||||
|  |                                                   orth_id=string)) | ||||||
|             return lex |             return lex | ||||||
|         else: |         else: | ||||||
|             return self._new_lexeme(mem, string) |             return self._new_lexeme(mem, string) | ||||||
|  | @ -171,7 +167,8 @@ cdef class Vocab: | ||||||
|         if not is_oov: |         if not is_oov: | ||||||
|             key = hash_string(string) |             key = hash_string(string) | ||||||
|             self._add_lex_to_vocab(key, lex) |             self._add_lex_to_vocab(key, lex) | ||||||
|         assert lex != NULL, string |         if lex == NULL: | ||||||
|  |             raise ValueError(Errors.E085.format(string=string)) | ||||||
|         return lex |         return lex | ||||||
| 
 | 
 | ||||||
|     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: |     cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1: | ||||||
|  | @ -254,7 +251,7 @@ cdef class Vocab: | ||||||
|         width, you have to call this to change the size of the vectors. |         width, you have to call this to change the size of the vectors. | ||||||
|         """ |         """ | ||||||
|         if width is not None and shape is not None: |         if width is not None and shape is not None: | ||||||
|             raise ValueError("Only one of width and shape can be specified") |             raise ValueError(Errors.E065.format(width=width, shape=shape)) | ||||||
|         elif shape is not None: |         elif shape is not None: | ||||||
|             self.vectors = Vectors(shape=shape) |             self.vectors = Vectors(shape=shape) | ||||||
|         else: |         else: | ||||||
|  | @ -471,7 +468,10 @@ cdef class Vocab: | ||||||
|             if ptr == NULL: |             if ptr == NULL: | ||||||
|                 continue |                 continue | ||||||
|             py_str = self.strings[lexeme.orth] |             py_str = self.strings[lexeme.orth] | ||||||
|             assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth) |             if self.strings[py_str] != lexeme.orth: | ||||||
|  |                 raise ValueError(Errors.E086.format(string=py_str, | ||||||
|  |                                                     orth_id=lexeme.orth, | ||||||
|  |                                                     hash_id=self.strings[py_str])) | ||||||
|             key = hash_string(py_str) |             key = hash_string(py_str) | ||||||
|             self._by_hash.set(key, lexeme) |             self._by_hash.set(key, lexeme) | ||||||
|             self._by_orth.set(lexeme.orth, lexeme) |             self._by_orth.set(lexeme.orth, lexeme) | ||||||
|  | @ -512,16 +512,3 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir, | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) | copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab) | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| class LookupError(Exception): |  | ||||||
|     @classmethod |  | ||||||
|     def mismatched_strings(cls, id_, id_string, original_string): |  | ||||||
|         return cls( |  | ||||||
|             "Error fetching a Lexeme from the Vocab. When looking up a " |  | ||||||
|             "string, the lexeme returned had an orth ID that did not match " |  | ||||||
|             "the query string. This means that the cached lexeme structs are " |  | ||||||
|             "mismatched to the string encoding table. The mismatched:\n" |  | ||||||
|             "Query string: {}\n" |  | ||||||
|             "Orth cached: {}\n" |  | ||||||
|             "Orth ID: {}".format(repr(original_string), repr(id_string), id_)) |  | ||||||
|  |  | ||||||