Mirror of https://github.com/explosion/spaCy.git (synced 2024-12-24 17:06:29 +03:00)
💫 New system for error messages and warnings (#2163)
* Add spacy.errors module
* Update deprecation and user warnings
* Replace errors and asserts with new error message system
* Remove redundant asserts
* Fix whitespace
* Add messages for print/util.prints statements
* Fix typo
* Fix typos
* Move CLI messages to spacy.cli._messages
* Add decorator to display error code with message

  An implementation like this is nice because it only modifies the string when it's retrieved from the containing class – so we don't have to worry about manipulating tracebacks etc.

* Remove unused link in spacy.about
* Update errors for invalid pipeline components
* Improve error for unknown factories
* Add displaCy warnings
* Update formatting consistency
* Move error message to spacy.errors
* Update errors and check if doc returned by component is None
Parent: abf8b16d71
Commit: 3141e04822
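The decorator mentioned in the commit message attaches the error code at lookup time only, so the raw message templates on the class stay untouched and no traceback manipulation is needed. Below is a minimal sketch of that pattern; it mirrors the `add_codes` implementation added in spacy/errors.py in this commit, with the E001 template taken from the same file and the lookup values purely illustrative.

def add_codes(err_cls):
    """Add error codes to string messages via class attribute names."""
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    E001 = ("No component '{name}' found in pipeline. Available names: {opts}")


# The code is only prepended when the attribute is retrieved:
msg = Errors.E001.format(name='ner', opts=['tagger'])
# "[E001] No component 'ner' found in pipeline. Available names: ['tagger']"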
@ -4,18 +4,14 @@ from __future__ import unicode_literals
 from .cli.info import info as cli_info
 from .glossary import explain
 from .about import __version__
+from .errors import Warnings, deprecation_warning
 from . import util


 def load(name, **overrides):
     depr_path = overrides.get('path')
     if depr_path not in (True, False, None):
-        util.deprecated(
-            "As of spaCy v2.0, the keyword argument `path=` is deprecated. "
-            "You can now call spacy.load with the path as its first argument, "
-            "and the model's meta.json will be used to determine the language "
-            "to load. For example:\nnlp = spacy.load('{}')".format(depr_path),
-            'error')
+        deprecation_warning(Warnings.W001.format(path=depr_path))
     return util.load_model(name, **overrides)
|
@ -23,6 +23,7 @@ from thinc.neural._classes.affine import _set_dimensions_if_needed
|
|||
import thinc.extra.load_nlp
|
||||
|
||||
from .attrs import ID, ORTH, LOWER, NORM, PREFIX, SUFFIX, SHAPE
|
||||
from .errors import Errors
|
||||
from . import util
|
||||
|
||||
|
||||
|
@ -174,7 +175,7 @@ class PrecomputableAffine(Model):
|
|||
sgd(self._mem.weights, self._mem.gradient, key=self.id)
|
||||
return dXf.reshape((dXf.shape[0], self.nF, self.nI))
|
||||
return Yf, backward
|
||||
|
||||
|
||||
def _add_padding(self, Yf):
|
||||
Yf_padded = self.ops.xp.vstack((self.pad, Yf))
|
||||
return Yf_padded
|
||||
|
@ -340,10 +341,10 @@ def _divide_array(X, size):
|
|||
|
||||
|
||||
def get_col(idx):
|
||||
assert idx >= 0, idx
|
||||
if idx < 0:
|
||||
raise IndexError(Errors.E066.format(value=idx))
|
||||
|
||||
def forward(X, drop=0.):
|
||||
assert idx >= 0, idx
|
||||
if isinstance(X, numpy.ndarray):
|
||||
ops = NumpyOps()
|
||||
else:
|
||||
|
@ -351,7 +352,6 @@ def get_col(idx):
|
|||
output = ops.xp.ascontiguousarray(X[:, idx], dtype=X.dtype)
|
||||
|
||||
def backward(y, sgd=None):
|
||||
assert idx >= 0, idx
|
||||
dX = ops.allocate(X.shape)
|
||||
dX[:, idx] += y
|
||||
return dX
|
||||
|
|
@ -11,7 +11,6 @@ __email__ = 'contact@explosion.ai'
__license__ = 'MIT'
__release__ = True

__docs_models__ = 'https://spacy.io/usage/models'
__download_url__ = 'https://github.com/explosion/spacy-models/releases/download'
__compatibility__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/compatibility.json'
__shortcuts__ = 'https://raw.githubusercontent.com/explosion/spacy-models/master/shortcuts-v2.json'
spacy/cli/_messages.py (new file, 73 lines)
|
@ -0,0 +1,73 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
class Messages(object):
|
||||
M001 = ("Download successful but linking failed")
|
||||
M002 = ("Creating a shortcut link for 'en' didn't work (maybe you "
|
||||
"don't have admin permissions?), but you can still load the "
|
||||
"model via its full package name: nlp = spacy.load('{name}')")
|
||||
M003 = ("Server error ({code}: {desc})")
|
||||
M004 = ("Couldn't fetch {desc}. Please find a model for your spaCy "
|
||||
"installation (v{version}), and download it manually. For more "
|
||||
"details, see the documentation: https://spacy.io/usage/models")
|
||||
M005 = ("Compatibility error")
|
||||
M006 = ("No compatible models found for v{version} of spaCy.")
|
||||
M007 = ("No compatible model found for '{name}' (spaCy v{version}).")
|
||||
M008 = ("Can't locate model data")
|
||||
M009 = ("The data should be located in {path}")
|
||||
M010 = ("Can't find the spaCy data path to create model symlink")
|
||||
M011 = ("Make sure a directory `/data` exists within your spaCy "
|
||||
"installation and try again. The data directory should be "
|
||||
"located here:")
|
||||
M012 = ("Link '{name}' already exists")
|
||||
M013 = ("To overwrite an existing link, use the --force flag.")
|
||||
M014 = ("Can't overwrite symlink '{name}'")
|
||||
M015 = ("This can happen if your data directory contains a directory or "
|
||||
"file of the same name.")
|
||||
M016 = ("Error: Couldn't link model to '{name}'")
|
||||
M017 = ("Creating a symlink in spacy/data failed. Make sure you have the "
|
||||
"required permissions and try re-running the command as admin, or "
|
||||
"use a virtualenv. You can still import the model as a module and "
|
||||
"call its load() method, or create the symlink manually.")
|
||||
M018 = ("Linking successful")
|
||||
M019 = ("You can now load the model via spacy.load('{name}')")
|
||||
M020 = ("Can't find model meta.json")
|
||||
M021 = ("Couldn't fetch compatibility table.")
|
||||
M022 = ("Can't find spaCy v{version} in compatibility table")
|
||||
M023 = ("Installed models (spaCy v{version})")
|
||||
M024 = ("No models found in your current environment.")
|
||||
M025 = ("Use the following commands to update the model packages:")
|
||||
M026 = ("The following models are not available for spaCy "
|
||||
"v{version}: {models}")
|
||||
M027 = ("You may also want to overwrite the incompatible links using the "
|
||||
"`python -m spacy link` command with `--force`, or remove them "
|
||||
"from the data directory. Data path: {path}")
|
||||
M028 = ("Input file not found")
|
||||
M029 = ("Output directory not found")
|
||||
M030 = ("Unknown format")
|
||||
M031 = ("Can't find converter for {converter}")
|
||||
M032 = ("Generated output file {name}")
|
||||
M033 = ("Created {n_docs} documents")
|
||||
M034 = ("Evaluation data not found")
|
||||
M035 = ("Visualization output directory not found")
|
||||
M036 = ("Generated {n} parses as HTML")
|
||||
M037 = ("Can't find words frequencies file")
|
||||
M038 = ("Sucessfully compiled vocab")
|
||||
M039 = ("{entries} entries, {vectors} vectors")
|
||||
M040 = ("Output directory not found")
|
||||
M041 = ("Loaded meta.json from file")
|
||||
M042 = ("Successfully created package '{name}'")
|
||||
M043 = ("To build the package, run `python setup.py sdist` in this "
|
||||
"directory.")
|
||||
M044 = ("Package directory already exists")
|
||||
M045 = ("Please delete the directory and try again, or use the `--force` "
|
||||
"flag to overwrite existing directories.")
|
||||
M046 = ("Generating meta.json")
|
||||
M047 = ("Enter the package settings for your model. The following "
|
||||
"information will be read from your model data: pipeline, vectors.")
|
||||
M048 = ("No '{key}' setting found in meta.json")
|
||||
M049 = ("This setting is required to build your package.")
|
||||
M050 = ("Training data not found")
|
||||
M051 = ("Development data not found")
|
||||
M052 = ("Not a valid meta.json format")
|
||||
M053 = ("Expected dict but got: {meta_type}")
|
|
@ -5,6 +5,7 @@ import plac
 from pathlib import Path

 from .converters import conllu2json, iob2json, conll_ner2json
+from ._messages import Messages
 from ..util import prints

 # Converters are matched by file extension. To add a converter, add a new
@ -32,14 +33,14 @@ def convert(input_file, output_dir, n_sents=1, morphology=False, converter='auto
     input_path = Path(input_file)
     output_path = Path(output_dir)
     if not input_path.exists():
-        prints(input_path, title="Input file not found", exits=1)
+        prints(input_path, title=Messages.M028, exits=1)
     if not output_path.exists():
-        prints(output_path, title="Output directory not found", exits=1)
+        prints(output_path, title=Messages.M029, exits=1)
     if converter == 'auto':
         converter = input_path.suffix[1:]
     if converter not in CONVERTERS:
-        prints("Can't find converter for %s" % converter,
-               title="Unknown format", exits=1)
+        prints(Messages.M031.format(converter=converter),
+               title=Messages.M030, exits=1)
     func = CONVERTERS[converter]
     func(input_path, output_path,
          n_sents=n_sents, use_morphology=morphology)
|
@ -1,6 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
@ -18,8 +19,8 @@ def conll_ner2json(input_path, output_path, n_sents=10, use_morphology=False):
|
|||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
|
||||
|
||||
def read_conll_ner(input_path):
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
|
||||
|
@ -32,8 +33,8 @@ def conllu2json(input_path, output_path, n_sents=10, use_morphology=False):
|
|||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
|
||||
|
||||
def read_conllx(input_path, use_morphology=False, n=0):
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
from __future__ import unicode_literals
|
||||
from cytoolz import partition_all, concat
|
||||
|
||||
from .._messages import Messages
|
||||
from ...compat import json_dumps, path2str
|
||||
from ...util import prints
|
||||
from ...gold import iob_to_biluo
|
||||
|
@ -18,8 +19,8 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
|
|||
output_file = output_path / output_filename
|
||||
with output_file.open('w', encoding='utf-8') as f:
|
||||
f.write(json_dumps(docs))
|
||||
prints("Created %d documents" % len(docs),
|
||||
title="Generated output file %s" % path2str(output_file))
|
||||
prints(Messages.M033.format(n_docs=len(docs)),
|
||||
title=Messages.M032.format(name=path2str(output_file)))
|
||||
|
||||
|
||||
def read_iob(raw_sents):
|
||||
|
|
|
@ -8,6 +8,7 @@ import sys
|
|||
import ujson
|
||||
|
||||
from .link import link
|
||||
from ._messages import Messages
|
||||
from ..util import prints, get_package_path
|
||||
from ..compat import url_read, HTTPError
|
||||
from .. import about
|
||||
|
@ -32,9 +33,7 @@ def download(model, direct=False):
|
|||
version = get_version(model_name, compatibility)
|
||||
dl = download_model('{m}-{v}/{m}-{v}.tar.gz'.format(m=model_name,
|
||||
v=version))
|
||||
if dl != 0:
|
||||
# if download subprocess doesn't return 0, exit with the respective
|
||||
# exit code before doing anything else
|
||||
if dl != 0: # if download subprocess doesn't return 0, exit
|
||||
sys.exit(dl)
|
||||
try:
|
||||
# Get package path here because link uses
|
||||
|
@ -48,22 +47,15 @@ def download(model, direct=False):
|
|||
# Dirty, but since spacy.download and the auto-linking is
|
||||
# mostly a convenience wrapper, it's best to show a success
|
||||
# message and loading instructions, even if linking fails.
|
||||
prints(
|
||||
"Creating a shortcut link for 'en' didn't work (maybe "
|
||||
"you don't have admin permissions?), but you can still "
|
||||
"load the model via its full package name:",
|
||||
"nlp = spacy.load('%s')" % model_name,
|
||||
title="Download successful but linking failed")
|
||||
prints(Messages.M002.format(name=model_name), title=Messages.M001)
|
||||
|
||||
|
||||
def get_json(url, desc):
|
||||
try:
|
||||
data = url_read(url)
|
||||
except HTTPError as e:
|
||||
msg = ("Couldn't fetch %s. Please find a model for your spaCy "
|
||||
"installation (v%s), and download it manually.")
|
||||
prints(msg % (desc, about.__version__), about.__docs_models__,
|
||||
title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
|
||||
prints(Messages.M004.format(desc=desc, version=about.__version__),
title=Messages.M003.format(code=e.code, desc=e.reason), exits=1)
|
||||
return ujson.loads(data)
|
||||
|
||||
|
||||
|
@ -73,17 +65,16 @@ def get_compatibility():
|
|||
comp_table = get_json(about.__compatibility__, "compatibility table")
|
||||
comp = comp_table['spacy']
|
||||
if version not in comp:
|
||||
prints("No compatible models found for v%s of spaCy." % version,
|
||||
title="Compatibility error", exits=1)
|
||||
prints(Messages.M006.format(version=version), title=Messages.M005,
|
||||
exits=1)
|
||||
return comp[version]
|
||||
|
||||
|
||||
def get_version(model, comp):
|
||||
model = model.rsplit('.dev', 1)[0]
|
||||
if model not in comp:
|
||||
version = about.__version__
|
||||
msg = "No compatible model found for '%s' (spaCy v%s)."
|
||||
prints(msg % (model, version), title="Compatibility error", exits=1)
|
||||
prints(Messages.M007.format(name=model, version=about.__version__),
|
||||
title=Messages.M005, exits=1)
|
||||
return comp[model][0]
|
||||
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals, division, print_function
|
|||
import plac
|
||||
from timeit import default_timer as timer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..gold import GoldCorpus
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
@ -33,10 +34,9 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
|||
data_path = util.ensure_path(data_path)
|
||||
displacy_path = util.ensure_path(displacy_path)
|
||||
if not data_path.exists():
|
||||
prints(data_path, title="Evaluation data not found", exits=1)
|
||||
prints(data_path, title=Messages.M034, exits=1)
|
||||
if displacy_path and not displacy_path.exists():
|
||||
prints(displacy_path, title="Visualization output directory not found",
|
||||
exits=1)
|
||||
prints(displacy_path, title=Messages.M035, exits=1)
|
||||
corpus = GoldCorpus(data_path, data_path)
|
||||
nlp = util.load_model(model)
|
||||
dev_docs = list(corpus.dev_docs(nlp, gold_preproc=gold_preproc))
|
||||
|
@ -52,8 +52,7 @@ def evaluate(model, data_path, gpu_id=-1, gold_preproc=False, displacy_path=None
|
|||
render_ents = 'ner' in nlp.meta.get('pipeline', [])
|
||||
render_parses(docs, displacy_path, model_name=model,
|
||||
limit=displacy_limit, deps=render_deps, ents=render_ents)
|
||||
msg = "Generated %s parses as HTML" % displacy_limit
|
||||
prints(displacy_path, title=msg)
|
||||
prints(displacy_path, title=Messages.M036.format(n=displacy_limit))
|
||||
|
||||
|
||||
def render_parses(docs, output_path, model_name='', limit=250, deps=True,
|
||||
|
|
|
@ -5,9 +5,10 @@ import plac
|
|||
import platform
|
||||
from pathlib import Path
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str
|
||||
from .. import about
|
||||
from .. import util
|
||||
from .. import about
|
||||
|
||||
|
||||
@plac.annotations(
|
||||
|
@ -25,7 +26,7 @@ def info(model=None, markdown=False):
|
|||
model_path = util.get_data_path() / model
|
||||
meta_path = model_path / 'meta.json'
|
||||
if not meta_path.is_file():
|
||||
util.prints(meta_path, title="Can't find model meta.json", exits=1)
|
||||
util.prints(meta_path, title=Messages.M020, exits=1)
|
||||
meta = util.read_json(meta_path)
|
||||
if model_path.resolve() != model_path:
|
||||
meta['link'] = path2str(model_path)
|
||||
|
|
|
@ -11,7 +11,9 @@ from preshed.counter import PreshCounter
|
|||
import tarfile
|
||||
import gzip
|
||||
|
||||
from ._messages import Messages
|
||||
from ..vectors import Vectors
|
||||
from ..errors import Warnings, user_warning
|
||||
from ..util import prints, ensure_path, get_lang_class
|
||||
|
||||
try:
|
||||
|
@ -37,16 +39,13 @@ def init_model(lang, output_dir, freqs_loc=None, clusters_loc=None, vectors_loc=
|
|||
and word vectors.
|
||||
"""
|
||||
if freqs_loc is not None and not freqs_loc.exists():
|
||||
prints(freqs_loc, title="Can't find words frequencies file", exits=1)
|
||||
prints(freqs_loc, title=Messages.M037, exits=1)
|
||||
clusters_loc = ensure_path(clusters_loc)
|
||||
vectors_loc = ensure_path(vectors_loc)
|
||||
|
||||
probs, oov_prob = read_freqs(freqs_loc) if freqs_loc is not None else ({}, -20)
|
||||
vectors_data, vector_keys = read_vectors(vectors_loc) if vectors_loc else (None, None)
|
||||
clusters = read_clusters(clusters_loc) if clusters_loc else {}
|
||||
|
||||
nlp = create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, prune_vectors)
|
||||
|
||||
if not output_dir.exists():
|
||||
output_dir.mkdir()
|
||||
nlp.to_disk(output_dir)
|
||||
|
@ -69,7 +68,6 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
|
|||
nlp = lang_class()
|
||||
for lexeme in nlp.vocab:
|
||||
lexeme.rank = 0
|
||||
|
||||
lex_added = 0
|
||||
for i, (word, prob) in enumerate(tqdm(sorted(probs.items(), key=lambda item: item[1], reverse=True))):
|
||||
lexeme = nlp.vocab[word]
|
||||
|
@ -89,15 +87,13 @@ def create_model(lang, probs, oov_prob, clusters, vectors_data, vector_keys, pru
|
|||
lexeme = nlp.vocab[word]
|
||||
lexeme.is_oov = False
|
||||
lex_added += 1
|
||||
|
||||
if len(vectors_data):
|
||||
nlp.vocab.vectors = Vectors(data=vectors_data, keys=vector_keys)
|
||||
if prune_vectors >= 1:
|
||||
nlp.vocab.prune_vectors(prune_vectors)
|
||||
vec_added = len(nlp.vocab.vectors)
|
||||
|
||||
prints("{} entries, {} vectors".format(lex_added, vec_added),
|
||||
title="Sucessfully compiled vocab")
|
||||
prints(Messages.M039.format(entries=lex_added, vectors=vec_added),
|
||||
title=Messages.M038)
|
||||
return nlp
|
||||
|
||||
|
||||
|
@ -145,7 +141,7 @@ def read_clusters(clusters_loc):
|
|||
print("Reading clusters...")
|
||||
clusters = {}
|
||||
if ftfy is None:
|
||||
print("Warning: No text fixing. Run pip install ftfy if necessary")
|
||||
user_warning(Warnings.W004)
|
||||
with clusters_loc.open() as f:
|
||||
for line in tqdm(f):
|
||||
try:
|
||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
import plac
|
||||
from pathlib import Path
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import symlink_to, path2str
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
@ -24,40 +25,29 @@ def link(origin, link_name, force=False, model_path=None):
|
|||
else:
|
||||
model_path = Path(origin) if model_path is None else Path(model_path)
|
||||
if not model_path.exists():
|
||||
prints("The data should be located in %s" % path2str(model_path),
|
||||
title="Can't locate model data", exits=1)
|
||||
prints(Messages.M009.format(path=path2str(model_path)),
|
||||
title=Messages.M008, exits=1)
|
||||
data_path = util.get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
spacy_loc = Path(__file__).parent.parent
|
||||
prints("Make sure a directory `/data` exists within your spaCy "
|
||||
"installation and try again. The data directory should be "
|
||||
"located here:", path2str(spacy_loc), exits=1,
|
||||
title="Can't find the spaCy data path to create model symlink")
|
||||
prints(Messages.M011, spacy_loc, title=Messages.M010, exits=1)
|
||||
link_path = util.get_data_path() / link_name
|
||||
if link_path.is_symlink() and not force:
|
||||
prints("To overwrite an existing link, use the --force flag.",
|
||||
title="Link %s already exists" % link_name, exits=1)
|
||||
prints(Messages.M013, title=Messages.M012.format(name=link_name),
|
||||
exits=1)
|
||||
elif link_path.is_symlink(): # does a symlink exist?
|
||||
# NB: It's important to check for is_symlink here and not for exists,
|
||||
# because invalid/outdated symlinks would return False otherwise.
|
||||
link_path.unlink()
|
||||
elif link_path.exists(): # does it exist otherwise?
|
||||
# NB: Check this last because valid symlinks also "exist".
|
||||
prints("This can happen if your data directory contains a directory "
|
||||
"or file of the same name.", link_path,
|
||||
title="Can't overwrite symlink %s" % link_name, exits=1)
|
||||
prints(Messages.M015, link_path,
|
||||
title=Messages.M014.format(name=link_name), exits=1)
|
||||
msg = "%s --> %s" % (path2str(model_path), path2str(link_path))
|
||||
try:
|
||||
symlink_to(link_path, model_path)
|
||||
except:
|
||||
# This is quite dirty, but just making sure other errors are caught.
|
||||
prints("Creating a symlink in spacy/data failed. Make sure you have "
|
||||
"the required permissions and try re-running the command as "
|
||||
"admin, or use a virtualenv. You can still import the model as "
|
||||
"a module and call its load() method, or create the symlink "
|
||||
"manually.",
|
||||
"%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||
title="Error: Couldn't link model to '%s'" % link_name)
|
||||
prints(Messages.M017, msg, title=Messages.M016.format(name=link_name))
|
||||
raise
|
||||
prints("%s --> %s" % (path2str(model_path), path2str(link_path)),
|
||||
"You can now load the model via spacy.load('%s')" % link_name,
|
||||
title="Linking successful")
|
||||
prints(msg, Messages.M019.format(name=link_name), title=Messages.M018)
|
||||
|
|
|
@ -5,6 +5,7 @@ import plac
|
|||
import shutil
|
||||
from pathlib import Path
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str, json_dumps
|
||||
from ..util import prints
|
||||
from .. import util
|
||||
|
@ -31,17 +32,17 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
|||
output_path = util.ensure_path(output_dir)
|
||||
meta_path = util.ensure_path(meta_path)
|
||||
if not input_path or not input_path.exists():
|
||||
prints(input_path, title="Model directory not found", exits=1)
|
||||
prints(input_path, title=Messages.M008, exits=1)
|
||||
if not output_path or not output_path.exists():
|
||||
prints(output_path, title="Output directory not found", exits=1)
|
||||
prints(output_path, title=Messages.M040, exits=1)
|
||||
if meta_path and not meta_path.exists():
|
||||
prints(meta_path, title="meta.json not found", exits=1)
|
||||
prints(meta_path, title=Messages.M020, exits=1)
|
||||
|
||||
meta_path = meta_path or input_path / 'meta.json'
|
||||
if meta_path.is_file():
|
||||
meta = util.read_json(meta_path)
|
||||
if not create_meta: # only print this if user doesn't want to overwrite
|
||||
prints(meta_path, title="Loaded meta.json from file")
|
||||
prints(meta_path, title=Messages.M041)
|
||||
else:
|
||||
meta = generate_meta(input_dir, meta)
|
||||
meta = validate_meta(meta, ['lang', 'name', 'version'])
|
||||
|
@ -57,9 +58,8 @@ def package(input_dir, output_dir, meta_path=None, create_meta=False,
|
|||
create_file(main_path / 'setup.py', TEMPLATE_SETUP)
|
||||
create_file(main_path / 'MANIFEST.in', TEMPLATE_MANIFEST)
|
||||
create_file(package_path / '__init__.py', TEMPLATE_INIT)
|
||||
prints(main_path, "To build the package, run `python setup.py sdist` in "
|
||||
"this directory.",
|
||||
title="Successfully created package '%s'" % model_name_v)
|
||||
prints(main_path, Messages.M043,
|
||||
title=Messages.M042.format(name=model_name_v))
|
||||
|
||||
|
||||
def create_dirs(package_path, force):
|
||||
|
@ -67,10 +67,7 @@ def create_dirs(package_path, force):
|
|||
if force:
|
||||
shutil.rmtree(path2str(package_path))
|
||||
else:
|
||||
prints(package_path, "Please delete the directory and try again, "
|
||||
"or use the --force flag to overwrite existing "
|
||||
"directories.", title="Package directory already exists",
|
||||
exits=1)
|
||||
prints(package_path, Messages.M045, title=Messages.M044, exits=1)
|
||||
Path.mkdir(package_path, parents=True)
|
||||
|
||||
|
||||
|
@ -97,9 +94,7 @@ def generate_meta(model_path, existing_meta):
|
|||
meta['vectors'] = {'width': nlp.vocab.vectors_length,
|
||||
'vectors': len(nlp.vocab.vectors),
|
||||
'keys': nlp.vocab.vectors.n_keys}
|
||||
prints("Enter the package settings for your model. The following "
|
||||
"information will be read from your model data: pipeline, vectors.",
|
||||
title="Generating meta.json")
|
||||
prints(Messages.M047, title=Messages.M046)
|
||||
for setting, desc, default in settings:
|
||||
response = util.get_raw_input(desc, default)
|
||||
meta[setting] = default if response == '' and default else response
|
||||
|
@ -111,8 +106,7 @@ def generate_meta(model_path, existing_meta):
|
|||
def validate_meta(meta, keys):
|
||||
for key in keys:
|
||||
if key not in meta or meta[key] == '':
|
||||
prints("This setting is required to build your package.",
|
||||
title='No "%s" setting found in meta.json' % key, exits=1)
|
||||
prints(Messages.M049, title=Messages.M048.format(key=key), exits=1)
|
||||
return meta
|
||||
|
||||
|
||||
|
|
|
@ -7,6 +7,7 @@ import tqdm
|
|||
from thinc.neural._classes.model import Model
|
||||
from timeit import default_timer as timer
|
||||
|
||||
from ._messages import Messages
|
||||
from ..attrs import PROB, IS_OOV, CLUSTER, LANG
|
||||
from ..gold import GoldCorpus, minibatch
|
||||
from ..util import prints
|
||||
|
@ -54,15 +55,15 @@ def train(lang, output_dir, train_data, dev_data, n_iter=30, n_sents=0,
|
|||
if not output_path.exists():
|
||||
output_path.mkdir()
|
||||
if not train_path.exists():
|
||||
prints(train_path, title="Training data not found", exits=1)
|
||||
prints(train_path, title=Messages.M050, exits=1)
|
||||
if dev_path and not dev_path.exists():
|
||||
prints(dev_path, title="Development data not found", exits=1)
|
||||
prints(dev_path, title=Messages.M051, exits=1)
|
||||
if meta_path is not None and not meta_path.exists():
|
||||
prints(meta_path, title="meta.json not found", exits=1)
|
||||
prints(meta_path, title=Messages.M020, exits=1)
|
||||
meta = util.read_json(meta_path) if meta_path else {}
|
||||
if not isinstance(meta, dict):
|
||||
prints("Expected dict but got: {}".format(type(meta)),
|
||||
title="Not a valid meta.json format", exits=1)
|
||||
prints(Messages.M053.format(meta_type=type(meta)),
|
||||
title=Messages.M052, exits=1)
|
||||
meta.setdefault('lang', lang)
|
||||
meta.setdefault('name', 'unnamed')
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ from pathlib import Path
|
|||
import sys
|
||||
import ujson
|
||||
|
||||
from ._messages import Messages
|
||||
from ..compat import path2str, locale_escape, url_read, HTTPError
|
||||
from ..util import prints, get_data_path, read_json
|
||||
from .. import about
|
||||
|
@ -18,14 +19,13 @@ def validate():
|
|||
try:
|
||||
data = url_read(about.__compatibility__)
|
||||
except HTTPError as e:
|
||||
prints("Couldn't fetch compatibility table.",
|
||||
title="Server error (%d: %s)" % (e.code, e.reason), exits=1)
|
||||
title = Messages.M003.format(code=e.code, desc=e.reason)
|
||||
prints(Messages.M021, title=title, exits=1)
|
||||
compat = ujson.loads(data)['spacy']
|
||||
current_compat = compat.get(about.__version__)
|
||||
if not current_compat:
|
||||
prints(about.__compatibility__, exits=1,
|
||||
title="Can't find spaCy v{} in compatibility table"
|
||||
.format(about.__version__))
|
||||
title=Messages.M022.format(version=about.__version__))
|
||||
all_models = set()
|
||||
for spacy_v, models in dict(compat).items():
|
||||
all_models.update(models.keys())
|
||||
|
@ -42,7 +42,7 @@ def validate():
|
|||
update_models = [m for m in incompat_models if m in current_compat]
|
||||
|
||||
prints(path2str(Path(__file__).parent.parent),
|
||||
title="Installed models (spaCy v{})".format(about.__version__))
|
||||
title=Messages.M023.format(version=about.__version__))
|
||||
if model_links or model_pkgs:
|
||||
print(get_row('TYPE', 'NAME', 'MODEL', 'VERSION', ''))
|
||||
for name, data in model_pkgs.items():
|
||||
|
@ -50,23 +50,16 @@ def validate():
|
|||
for name, data in model_links.items():
|
||||
print(get_model_row(current_compat, name, data, 'link'))
|
||||
else:
|
||||
prints("No models found in your current environment.", exits=0)
|
||||
|
||||
prints(Messages.M024, exits=0)
|
||||
if update_models:
|
||||
cmd = ' python -m spacy download {}'
|
||||
print("\n Use the following commands to update the model packages:")
|
||||
print("\n " + Messages.M025)
|
||||
print('\n'.join([cmd.format(pkg) for pkg in update_models]))
|
||||
|
||||
if na_models:
|
||||
prints("The following models are not available for spaCy v{}: {}"
|
||||
.format(about.__version__, ', '.join(na_models)))
|
||||
|
||||
prints(Messages.M026.format(version=about.__version__,
models=', '.join(na_models)))
|
||||
if incompat_links:
|
||||
prints("You may also want to overwrite the incompatible links using "
|
||||
"the `python -m spacy link` command with `--force`, or remove "
|
||||
"them from the data directory. Data path: {}"
|
||||
.format(path2str(get_data_path())))
|
||||
|
||||
prints(Messages.M027.format(path=path2str(get_data_path())))
|
||||
if incompat_models or incompat_links:
|
||||
sys.exit(1)
|
||||
|
||||
|
|
|
@ -4,6 +4,7 @@ from __future__ import unicode_literals
|
|||
from .render import DependencyRenderer, EntityRenderer
|
||||
from ..tokens import Doc
|
||||
from ..compat import b_to_str
|
||||
from ..errors import Errors, Warnings, user_warning
|
||||
from ..util import prints, is_in_jupyter
|
||||
|
||||
|
||||
|
@ -27,7 +28,7 @@ def render(docs, style='dep', page=False, minify=False, jupyter=IS_JUPYTER,
|
|||
factories = {'dep': (DependencyRenderer, parse_deps),
|
||||
'ent': (EntityRenderer, parse_ents)}
|
||||
if style not in factories:
|
||||
raise ValueError("Unknown style: %s" % style)
|
||||
raise ValueError(Errors.E087.format(style=style))
|
||||
if isinstance(docs, Doc) or isinstance(docs, dict):
|
||||
docs = [docs]
|
||||
renderer, converter = factories[style]
|
||||
|
@ -57,12 +58,12 @@ def serve(docs, style='dep', page=True, minify=False, options={}, manual=False,
|
|||
render(docs, style=style, page=page, minify=minify, options=options,
|
||||
manual=manual)
|
||||
httpd = simple_server.make_server('0.0.0.0', port, app)
|
||||
prints("Using the '%s' visualizer" % style,
|
||||
title="Serving on port %d..." % port)
|
||||
prints("Using the '{}' visualizer".format(style),
|
||||
title="Serving on port {}...".format(port))
|
||||
try:
|
||||
httpd.serve_forever()
|
||||
except KeyboardInterrupt:
|
||||
prints("Shutting down server on port %d." % port)
|
||||
prints("Shutting down server on port {}.".format(port))
|
||||
finally:
|
||||
httpd.server_close()
|
||||
|
||||
|
@ -83,6 +84,8 @@ def parse_deps(orig_doc, options={}):
|
|||
RETURNS (dict): Generated dependency parse keyed by words and arcs.
|
||||
"""
|
||||
doc = Doc(orig_doc.vocab).from_bytes(orig_doc.to_bytes())
|
||||
if not doc.is_parsed:
|
||||
user_warning(Warnings.W005)
|
||||
if options.get('collapse_punct', True):
|
||||
spans = []
|
||||
for word in doc[:-1]:
|
||||
|
@ -120,6 +123,8 @@ def parse_ents(doc, options={}):
|
|||
"""
|
||||
ents = [{'start': ent.start_char, 'end': ent.end_char, 'label': ent.label_}
|
||||
for ent in doc.ents]
|
||||
if not ents:
|
||||
user_warning(Warnings.W006)
|
||||
title = (doc.user_data.get('title', None)
|
||||
if hasattr(doc, 'user_data') else None)
|
||||
return {'text': doc.text, 'ents': ents, 'title': title}
|
||||
|
|
spacy/errors.py (new file, 297 lines)
|
@ -0,0 +1,297 @@
|
|||
# coding: utf8
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import os
|
||||
import warnings
|
||||
import inspect
|
||||
|
||||
|
||||
def add_codes(err_cls):
|
||||
"""Add error codes to string messages via class attribute names."""
|
||||
class ErrorsWithCodes(object):
|
||||
def __getattribute__(self, code):
|
||||
msg = getattr(err_cls, code)
|
||||
return '[{code}] {msg}'.format(code=code, msg=msg)
|
||||
return ErrorsWithCodes()
|
||||
|
||||
|
||||
@add_codes
|
||||
class Warnings(object):
|
||||
W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
|
||||
"You can now call spacy.load with the path as its first argument, "
|
||||
"and the model's meta.json will be used to determine the language "
|
||||
"to load. For example:\nnlp = spacy.load('{path}')")
|
||||
W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
|
||||
"instead and pass in the strings as the `words` keyword argument, "
|
||||
"for example:\nfrom spacy.tokens import Doc\n"
|
||||
"doc = Doc(nlp.vocab, words=[...])")
|
||||
W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
|
||||
"the keyword arguments, for example tag=, lemma= or ent_type=.")
|
||||
W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
|
||||
"using ftfy.fix_text if necessary.")
|
||||
W005 = ("Doc object not parsed. This means displaCy won't be able to "
|
||||
"generate a dependency visualization for it. Make sure the Doc "
|
||||
"was processed with a model that supports dependency parsing, and "
|
||||
"not just a language class like `English()`. For more info, see "
|
||||
"the docs:\nhttps://spacy.io/usage/models")
|
||||
W006 = ("No entities to visualize found in Doc object. If this is "
|
||||
"surprising to you, make sure the Doc was processed using a model "
|
||||
"that supports named entity recognition, and check the `doc.ents` "
|
||||
"property manually if necessary.")
|
||||
|
||||
|
||||
@add_codes
|
||||
class Errors(object):
|
||||
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
||||
E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
|
||||
"calls `nlp.create_pipe` with a component name that's not built "
|
||||
"in - for example, when constructing the pipeline from a model's "
|
||||
"meta.json. If you're using a custom component, you can write to "
|
||||
"`Language.factories['{name}']` or remove it from the model meta "
|
||||
"and add it via `nlp.add_pipe` instead.")
|
||||
E003 = ("Not a valid pipeline component. Expected callable, but "
|
||||
"got {component} (name: '{name}').")
|
||||
E004 = ("If you meant to add a built-in component, use `create_pipe`: "
|
||||
"`nlp.add_pipe(nlp.create_pipe('{component}'))`")
|
||||
E005 = ("Pipeline component '{name}' returned None. If you're using a "
|
||||
"custom component, maybe you forgot to return the processed Doc?")
|
||||
E006 = ("Invalid constraints. You can only set one of the following: "
|
||||
"before, after, first, last.")
|
||||
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
|
||||
E008 = ("Some current components would be lost when restoring previous "
|
||||
"pipeline state. If you added components after calling "
|
||||
"`nlp.disable_pipes()`, you should remove them explicitly with "
|
||||
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
||||
"the new components: {names}")
|
||||
E009 = ("The `update` method expects same number of docs and golds, but "
|
||||
"got: {n_docs} docs, {n_golds} golds.")
|
||||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||
"a model installed or loaded, or because your model doesn't "
|
||||
"include word vectors. For more info, see the docs:\n"
|
||||
"https://spacy.io/usage/models")
|
||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||
E013 = ("Error selecting action in matcher")
|
||||
E014 = ("Uknown tag ID: {tag}")
|
||||
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
||||
"`force=True` to overwrite.")
|
||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||
"tag, ent, dep_tag_offset, ent_tag.")
|
||||
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
|
||||
E018 = ("Can't retrieve string for hash '{hash_value}'.")
|
||||
E019 = ("Can't create transition with unknown action ID: {action}. Action "
|
||||
"IDs are enumerated in spacy/syntax/{src}.pyx.")
|
||||
E020 = ("Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. The tree is non-projective (i.e. it has "
|
||||
"crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
|
||||
"The ArcEager transition system only supports projective trees. "
|
||||
"To learn non-projective representations, transform the data "
|
||||
"before training and after parsing. Either pass "
|
||||
"`make_projective=True` to the GoldParse class, or use "
|
||||
"spacy.syntax.nonproj.preprocess_training_data.")
|
||||
E021 = ("Could not find a gold-standard action to supervise the "
|
||||
"dependency parser. The GoldParse was projective. The transition "
|
||||
"system has {n_actions} actions. State at failure: {state}")
|
||||
E022 = ("Could not find a transition with the name '{name}' in the NER "
|
||||
"model.")
|
||||
E023 = ("Error cleaning up beam: The same state occurred twice at "
|
||||
"memory address {addr} and position {i}.")
|
||||
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
|
||||
"this means the GoldParse was not correct. For example, are all "
|
||||
"labels added to the model?")
|
||||
E025 = ("String is too long: {length} characters. Max is 2**30.")
|
||||
E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
|
||||
"length {length}.")
|
||||
E027 = ("Arguments 'words' and 'spaces' should be sequences of the same "
|
||||
"length, or 'spaces' should be left default at None. spaces "
|
||||
"should be a sequence of booleans, with True meaning that the "
|
||||
"word owns a ' ' character following it.")
|
||||
E028 = ("orths_and_spaces expects either a list of unicode string or a "
|
||||
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
||||
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
||||
"statistical model to be installed and loaded. For more info, see "
|
||||
"the documentation:\nhttps://spacy.io/usage/models")
|
||||
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
||||
"component to the pipeline with: "
|
||||
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
||||
"Alternatively, add the dependency parser, or set sentence "
|
||||
"boundaries by setting doc[i].is_sent_start.")
|
||||
E031 = ("Invalid token: empty string ('') at position {i}.")
|
||||
E032 = ("Conflicting attributes specified in doc.from_array(): "
|
||||
"(HEAD, SENT_START). The HEAD attribute currently sets sentence "
|
||||
"boundaries implicitly, based on the tree structure. This means "
|
||||
"the HEAD attribute would potentially override the sentence "
|
||||
"boundaries set by SENT_START.")
|
||||
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
||||
E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
|
||||
"either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
|
||||
"Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
|
||||
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
||||
"length {length}.")
|
||||
E036 = ("Error calculating span: Can't find a token starting at character "
|
||||
"offset {start}.")
|
||||
E037 = ("Error calculating span: Can't find a token ending at character "
|
||||
"offset {end}.")
|
||||
E038 = ("Error finding sentence for span. Infinite loop detected.")
|
||||
E039 = ("Array bounds exceeded while searching for root word. This likely "
|
||||
"means the parse tree is in an invalid state. Please report this "
|
||||
"issue here: http://github.com/explosion/spaCy/issues")
|
||||
E040 = ("Attempt to access token at {i}, max length {max_length}.")
|
||||
E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
|
||||
E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.")
|
||||
E043 = ("Refusing to write to token.sent_start if its document is parsed, "
|
||||
"because this may cause inconsistent state.")
|
||||
E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
|
||||
"None, True, False")
|
||||
E045 = ("Possibly infinite loop encountered while looking for {attr}.")
|
||||
E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did "
|
||||
"you forget to call the `set_extension` method?")
|
||||
E047 = ("Can't assign a value to unregistered extension attribute "
|
||||
"'{name}'. Did you forget to call the `set_extension` method?")
|
||||
E048 = ("Can't import language {lang} from spacy.lang.")
|
||||
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
|
||||
"installation and permissions, or use spacy.util.set_data_path "
|
||||
"to customise the location if necessary.")
|
||||
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
|
||||
"link, a Python package or a valid path to a data directory.")
|
||||
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
|
||||
"it points to a valid package (not just a data directory).")
|
||||
E052 = ("Can't find model directory: {path}")
|
||||
E053 = ("Could not read meta.json from {path}")
|
||||
E054 = ("No valid '{setting}' setting found in model meta.json.")
|
||||
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
|
||||
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
|
||||
"original string.\nKey: {key}\nOrths: {orths}")
|
||||
E057 = ("Stepped slices not supported in Span objects. Try: "
|
||||
"list(tokens)[start:stop:step] instead.")
|
||||
E058 = ("Could not retrieve vector for key {key}.")
|
||||
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
||||
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
||||
"({rows}, {cols}).")
|
||||
E061 = ("Bad file name: {filename}. Example of a valid file name: "
|
||||
"'vectors.128.f.bin'")
|
||||
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
|
||||
"and 63 are occupied. You can replace one by specifying the "
|
||||
"`flag_id` explicitly, e.g. "
|
||||
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
|
||||
E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 "
|
||||
"and 63 (inclusive).")
|
||||
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
|
||||
"string, the lexeme returned had an orth ID that did not match "
|
||||
"the query string. This means that the cached lexeme structs are "
|
||||
"mismatched to the string encoding table. The mismatched:\n"
|
||||
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
|
||||
E065 = ("Only one of the vector table's width and shape can be specified. "
|
||||
"Got width {width} and shape {shape}.")
|
||||
E066 = ("Error creating model helper for extracting columns. Can only "
|
||||
"extract columns by positive integer. Got: {value}.")
|
||||
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
|
||||
"an entity) without a preceding 'B' (beginning of an entity). "
|
||||
"Tag sequence:\n{tags}")
|
||||
E068 = ("Invalid BILUO tag: '{tag}'.")
|
||||
E069 = ("Invalid gold-standard parse tree. Found cycle between word "
|
||||
"IDs: {cycle}")
|
||||
E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
|
||||
"does not align with number of annotations ({n_annots}).")
|
||||
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
|
||||
"match the one in the vocab ({vocab_orth}).")
|
||||
E072 = ("Error serializing lexeme: expected data length {length}, "
|
||||
"got {bad_length}.")
|
||||
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
|
||||
"are of length {length}. You can use `vocab.reset_vectors` to "
|
||||
"clear the existing vectors and resize the table.")
|
||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||
E075 = ("Error accepting match: length ({length}) > maximum length "
|
||||
"({max_len}).")
|
||||
E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
|
||||
"has {words} words.")
|
||||
E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
|
||||
"equal number of GoldParse objects ({n_golds}) in batch.")
|
||||
E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
|
||||
"not equal number of words in GoldParse ({words_gold}).")
|
||||
E079 = ("Error computing states in beam: number of predicted beams "
|
||||
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
||||
E080 = ("Duplicate state found in beam: {key}.")
|
||||
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
||||
"does not equal number of losses ({losses}).")
|
||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||
"match.")
|
||||
E083 = ("Error setting extension: only one of default, getter, setter and "
|
||||
"method is allowed. {n_args} keyword arguments were specified.")
|
||||
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
|
||||
E085 = ("Can't create lexeme for string '{string}'.")
|
||||
E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
|
||||
"not match hash {hash_id} in StringStore.")
|
||||
E087 = ("Unknown displaCy style: {style}.")
|
||||
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
||||
"v2.x parser and NER models require roughly 1GB of temporary "
|
||||
"memory per 100,000 characters in the input. This means long "
|
||||
"texts may cause memory allocation errors. If you're not using "
|
||||
"the parser or NER, it's probably safe to increase the "
|
||||
"`nlp.max_length` limit. The limit is in number of characters, so "
|
||||
"you can check whether your inputs are too long by checking "
|
||||
"`len(text)`.")
|
||||
|
||||
|
||||
@add_codes
|
||||
class TempErrors(object):
|
||||
T001 = ("Max length currently 10 for phrase matching")
|
||||
T002 = ("Pattern length ({doc_len}) >= phrase_matcher.max_length "
|
||||
"({max_len}). Length can be set on initialization, up to 10.")
|
||||
T003 = ("Resizing pre-trained Tagger models is not currently supported.")
|
||||
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
|
||||
T005 = ("Currently history size is hard-coded to 0. Received: {value}.")
|
||||
T006 = ("Currently history width is hard-coded to 0. Received: {value}.")
|
||||
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
|
||||
|
||||
class ModelsWarning(UserWarning):
|
||||
pass
|
||||
|
||||
|
||||
WARNINGS = {
|
||||
'user': UserWarning,
|
||||
'deprecation': DeprecationWarning,
|
||||
'models': ModelsWarning,
|
||||
}
|
||||
|
||||
|
||||
def _get_warn_types(arg):
|
||||
if arg == '': # don't show any warnings
|
||||
return []
|
||||
if not arg or arg == 'all': # show all available warnings
|
||||
return WARNINGS.keys()
|
||||
return [w_type.strip() for w_type in arg.split(',')
|
||||
if w_type.strip() in WARNINGS]
|
||||
|
||||
|
||||
SPACY_WARNING_FILTER = os.environ.get('SPACY_WARNING_FILTER', 'always')
|
||||
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get('SPACY_WARNING_TYPES'))
|
||||
|
||||
|
||||
def user_warning(message):
|
||||
_warn(message, 'user')
|
||||
|
||||
|
||||
def deprecation_warning(message):
|
||||
_warn(message, 'deprecation')
|
||||
|
||||
|
||||
def models_warning(message):
|
||||
_warn(message, 'models')
|
||||
|
||||
|
||||
def _warn(message, warn_type='user'):
|
||||
"""
|
||||
message (unicode): The message to display.
|
||||
category (Warning): The Warning to show.
|
||||
"""
|
||||
if warn_type in SPACY_WARNING_TYPES:
|
||||
category = WARNINGS[warn_type]
|
||||
stack = inspect.stack()[-1]
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter(SPACY_WARNING_FILTER, category)
|
||||
warnings.warn_explicit(message, category, stack[1], stack[2])
|
|
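The warning helpers above all route through `_warn`, which consults two environment variables that are read once when spacy.errors is imported. A short sketch of how they could be set (the variable names and accepted values come from the code above; the concrete choices are only examples):

import os

# Must happen before spacy (and therefore spacy.errors) is imported,
# because SPACY_WARNING_FILTER and SPACY_WARNING_TYPES are read at import time.
os.environ['SPACY_WARNING_FILTER'] = 'once'             # any warnings.simplefilter action
os.environ['SPACY_WARNING_TYPES'] = 'user,deprecation'  # subset of: user, deprecation, models

import spacy  # noqa: E402

# With these settings, models_warning() calls are silenced, while
# user_warning() and deprecation_warning() messages are shown at most once.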
@ -10,6 +10,7 @@ import itertools
|
|||
|
||||
from .syntax import nonproj
|
||||
from .tokens import Doc
|
||||
from .errors import Errors
|
||||
from . import util
|
||||
from .util import minibatch
|
||||
|
||||
|
@ -28,7 +29,8 @@ def tags_to_entities(tags):
|
|||
elif tag == '-':
|
||||
continue
|
||||
elif tag.startswith('I'):
|
||||
assert start is not None, tags[:i]
|
||||
if start is None:
|
||||
raise ValueError(Errors.E067.format(tags=tags[:i]))
|
||||
continue
|
||||
if tag.startswith('U'):
|
||||
entities.append((tag[2:], i, i))
|
||||
|
@ -38,7 +40,7 @@ def tags_to_entities(tags):
|
|||
entities.append((tag[2:], start, i))
|
||||
start = None
|
||||
else:
|
||||
raise Exception(tag)
|
||||
raise ValueError(Errors.E068.format(tag=tag))
|
||||
return entities
|
||||
|
||||
|
||||
|
@ -238,7 +240,9 @@ class GoldCorpus(object):
|
|||
|
||||
@classmethod
|
||||
def _make_golds(cls, docs, paragraph_tuples):
|
||||
assert len(docs) == len(paragraph_tuples)
|
||||
if len(docs) != len(paragraph_tuples):
|
||||
raise ValueError(Errors.E070.format(n_docs=len(docs),
|
||||
n_annots=len(paragraph_tuples)))
|
||||
if len(docs) == 1:
|
||||
return [GoldParse.from_annot_tuples(docs[0],
|
||||
paragraph_tuples[0][0])]
|
||||
|
@ -461,7 +465,7 @@ cdef class GoldParse:
|
|||
|
||||
cycle = nonproj.contains_cycle(self.heads)
|
||||
if cycle is not None:
|
||||
raise Exception("Cycle found: %s" % cycle)
|
||||
raise ValueError(Errors.E069.format(cycle=cycle))
|
||||
|
||||
if make_projective:
|
||||
proj_heads, _ = nonproj.projectivize(self.heads, self.labels)
|
||||
|
|
|
@ -28,6 +28,7 @@ from .lang.punctuation import TOKENIZER_INFIXES
|
|||
from .lang.tokenizer_exceptions import TOKEN_MATCH
|
||||
from .lang.tag_map import TAG_MAP
|
||||
from .lang.lex_attrs import LEX_ATTRS, is_stop
|
||||
from .errors import Errors
|
||||
from . import util
|
||||
from . import about
|
||||
|
||||
|
@ -217,8 +218,7 @@ class Language(object):
|
|||
for pipe_name, component in self.pipeline:
|
||||
if pipe_name == name:
|
||||
return component
|
||||
msg = "No component '{}' found in pipeline. Available names: {}"
|
||||
raise KeyError(msg.format(name, self.pipe_names))
|
||||
raise KeyError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
|
||||
def create_pipe(self, name, config=dict()):
|
||||
"""Create a pipeline component from a factory.
|
||||
|
@ -228,7 +228,7 @@ class Language(object):
|
|||
RETURNS (callable): Pipeline component.
|
||||
"""
|
||||
if name not in self.factories:
|
||||
raise KeyError("Can't find factory for '{}'.".format(name))
|
||||
raise KeyError(Errors.E002.format(name=name))
|
||||
factory = self.factories[name]
|
||||
return factory(self, **config)
|
||||
|
||||
|
@ -253,12 +253,9 @@ class Language(object):
|
|||
>>> nlp.add_pipe(component, name='custom_name', last=True)
|
||||
"""
|
||||
if not hasattr(component, '__call__'):
|
||||
msg = ("Not a valid pipeline component. Expected callable, but "
|
||||
"got {}. ".format(repr(component)))
|
||||
msg = Errors.E003.format(component=repr(component), name=name)
|
||||
if isinstance(component, basestring_) and component in self.factories:
|
||||
msg += ("If you meant to add a built-in component, use "
|
||||
"create_pipe: nlp.add_pipe(nlp.create_pipe('{}'))"
|
||||
.format(component))
|
||||
msg += Errors.E004.format(component=component)
|
||||
raise ValueError(msg)
|
||||
if name is None:
|
||||
if hasattr(component, 'name'):
|
||||
|
@ -271,11 +268,9 @@ class Language(object):
|
|||
else:
|
||||
name = repr(component)
|
||||
if name in self.pipe_names:
|
||||
raise ValueError("'{}' already exists in pipeline.".format(name))
|
||||
raise ValueError(Errors.E007.format(name=name, opts=self.pipe_names))
|
||||
if sum([bool(before), bool(after), bool(first), bool(last)]) >= 2:
|
||||
msg = ("Invalid constraints. You can only set one of the "
|
||||
"following: before, after, first, last.")
|
||||
raise ValueError(msg)
|
||||
raise ValueError(Errors.E006)
|
||||
pipe = (name, component)
|
||||
if last or not any([first, before, after]):
|
||||
self.pipeline.append(pipe)
|
||||
|
@ -286,9 +281,8 @@ class Language(object):
|
|||
elif after and after in self.pipe_names:
|
||||
self.pipeline.insert(self.pipe_names.index(after) + 1, pipe)
|
||||
else:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
unfound = before or after
|
||||
raise ValueError(msg.format(unfound, self.pipe_names))
|
||||
raise ValueError(Errors.E001.format(name=before or after,
|
||||
opts=self.pipe_names))
|
||||
|
||||
def has_pipe(self, name):
|
||||
"""Check if a component name is present in the pipeline. Equivalent to
|
||||
|
@ -306,8 +300,7 @@ class Language(object):
|
|||
component (callable): Pipeline component.
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(name, self.pipe_names))
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
self.pipeline[self.pipe_names.index(name)] = (name, component)
|
||||
|
||||
def rename_pipe(self, old_name, new_name):
|
||||
|
@ -317,11 +310,9 @@ class Language(object):
|
|||
new_name (unicode): New name of the component.
|
||||
"""
|
||||
if old_name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(old_name, self.pipe_names))
|
||||
raise ValueError(Errors.E001.format(name=old_name, opts=self.pipe_names))
|
||||
if new_name in self.pipe_names:
|
||||
msg = "'{}' already exists in pipeline. Existing names: {}"
|
||||
raise ValueError(msg.format(new_name, self.pipe_names))
|
||||
raise ValueError(Errors.E007.format(name=new_name, opts=self.pipe_names))
|
||||
i = self.pipe_names.index(old_name)
|
||||
self.pipeline[i] = (new_name, self.pipeline[i][1])
|
||||
|
||||
|
@ -332,8 +323,7 @@ class Language(object):
|
|||
RETURNS (tuple): A `(name, component)` tuple of the removed component.
|
||||
"""
|
||||
if name not in self.pipe_names:
|
||||
msg = "Can't find '{}' in pipeline. Available names: {}"
|
||||
raise ValueError(msg.format(name, self.pipe_names))
|
||||
raise ValueError(Errors.E001.format(name=name, opts=self.pipe_names))
|
||||
return self.pipeline.pop(self.pipe_names.index(name))
|
||||
|
||||
def __call__(self, text, disable=[]):
|
||||
|
@ -351,21 +341,17 @@ class Language(object):
|
|||
('An', 'NN')
|
||||
"""
|
||||
if len(text) >= self.max_length:
|
||||
msg = (
|
||||
"Text of length {length} exceeds maximum of {max_length}. "
|
||||
"The v2 parser and NER models require roughly 1GB of temporary "
|
||||
"memory per 100,000 characters in the input. This means long "
|
||||
"texts may cause memory allocation errors. If you're not using "
|
||||
"the parser or NER, it's probably safe to increase the "
|
||||
"nlp.max_length limit. The limit is in number of characters, "
|
||||
"so you can check whether your inputs are too long by checking "
|
||||
"len(text).")
|
||||
raise ValueError(msg.format(length=len(text), max_length=self.max_length))
|
||||
raise ValueError(Errors.E088.format(length=len(text),
|
||||
max_length=self.max_length))
|
||||
doc = self.make_doc(text)
|
||||
for name, proc in self.pipeline:
|
||||
if name in disable:
|
||||
continue
|
||||
if not hasattr(proc, '__call__'):
|
||||
raise ValueError(Errors.E003.format(component=type(proc), name=name))
|
||||
doc = proc(doc)
|
||||
if doc is None:
|
||||
raise ValueError(Errors.E005.format(name=name))
|
||||
return doc
|
||||
|
||||
def disable_pipes(self, *names):

@@ -407,8 +393,7 @@ class Language(object):
>>> state = nlp.update(docs, golds, sgd=optimizer)
"""
if len(docs) != len(golds):
raise IndexError("Update expects same number of docs and golds "
"Got: %d, %d" % (len(docs), len(golds)))
raise IndexError(Errors.E009.format(n_docs=len(docs), n_golds=len(golds)))
if len(docs) == 0:
return
if sgd is None:

@@ -757,14 +742,7 @@ class DisabledPipes(list):
if unexpected:
# Don't change the pipeline if we're raising an error.
self.nlp.pipeline = current
msg = (
"Some current components would be lost when restoring "
"previous pipeline state. If you added components after "
"calling nlp.disable_pipes(), you should remove them "
"explicitly with nlp.remove_pipe() before the pipeline is "
"restore. Names of the new components: %s"
)
raise ValueError(msg % unexpected)
raise ValueError(Errors.E008.format(names=unexpected))
self[:] = []
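Throughout the diff, inline strings are replaced by attributes such as Errors.E001 or Warnings.W002 that are looked up on a container class and formatted only at raise time. A minimal sketch of one plausible shape for such a container follows; the add_codes decorator and the message text are illustrative assumptions, not the actual spacy.errors module:

def add_codes(err_cls):
    # Prefix each message with its code when it is looked up, so the raw
    # class can keep defining plain string attributes.
    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            msg = getattr(err_cls, code)
            return '[{code}] {msg}'.format(code=code, msg=msg)
    return ErrorsWithCodes()


@add_codes
class Errors(object):
    # Illustrative message text only.
    E001 = "No component '{name}' found in pipeline. Available names: {opts}"


try:
    raise ValueError(Errors.E001.format(name='ner', opts=['tagger', 'parser']))
except ValueError as err:
    print(err)   # -> [E001] No component 'ner' found in pipeline. ...
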
|
||||
|
||||
|
||||
|
|
|
@ -15,7 +15,7 @@ from .attrs cimport IS_TITLE, IS_UPPER, LIKE_URL, LIKE_NUM, LIKE_EMAIL, IS_STOP
|
|||
from .attrs cimport IS_BRACKET, IS_QUOTE, IS_LEFT_PUNCT, IS_RIGHT_PUNCT, IS_CURRENCY, IS_OOV
|
||||
from .attrs cimport PROB
|
||||
from .attrs import intify_attrs
|
||||
from . import about
|
||||
from .errors import Errors
|
||||
|
||||
|
||||
memset(&EMPTY_LEXEME, 0, sizeof(LexemeC))
|
||||
|
@ -37,7 +37,8 @@ cdef class Lexeme:
|
|||
self.vocab = vocab
|
||||
self.orth = orth
|
||||
self.c = <LexemeC*><void*>vocab.get_by_orth(vocab.mem, orth)
|
||||
assert self.c.orth == orth
|
||||
if self.c.orth != orth:
|
||||
raise ValueError(Errors.E071.format(orth=orth, vocab_orth=self.c.orth))
|
||||
|
||||
def __richcmp__(self, other, int op):
|
||||
if other is None:
|
||||
|
@ -129,20 +130,25 @@ cdef class Lexeme:
|
|||
lex_data = Lexeme.c_to_bytes(self.c)
|
||||
start = <const char*>&self.c.flags
|
||||
end = <const char*>&self.c.sentiment + sizeof(self.c.sentiment)
|
||||
assert (end-start) == sizeof(lex_data.data), (end-start, sizeof(lex_data.data))
|
||||
if (end-start) != sizeof(lex_data.data):
|
||||
raise ValueError(Errors.E072.format(length=end-start,
|
||||
bad_length=sizeof(lex_data.data)))
|
||||
byte_string = b'\0' * sizeof(lex_data.data)
|
||||
byte_chars = <char*>byte_string
|
||||
for i in range(sizeof(lex_data.data)):
|
||||
byte_chars[i] = lex_data.data[i]
|
||||
assert len(byte_string) == sizeof(lex_data.data), (len(byte_string),
|
||||
sizeof(lex_data.data))
|
||||
if len(byte_string) != sizeof(lex_data.data):
|
||||
raise ValueError(Errors.E072.format(length=len(byte_string),
|
||||
bad_length=sizeof(lex_data.data)))
|
||||
return byte_string
|
||||
|
||||
def from_bytes(self, bytes byte_string):
|
||||
# This method doesn't really have a use-case --- wrote it for testing.
|
||||
# Possibly delete? It puts the Lexeme out of synch with the vocab.
|
||||
cdef SerializedLexemeC lex_data
|
||||
assert len(byte_string) == sizeof(lex_data.data)
|
||||
if len(byte_string) != sizeof(lex_data.data):
|
||||
raise ValueError(Errors.E072.format(length=len(byte_string),
|
||||
bad_length=sizeof(lex_data.data)))
|
||||
for i in range(len(byte_string)):
|
||||
lex_data.data[i] = byte_string[i]
|
||||
Lexeme.c_from_bytes(self.c, lex_data)
|
||||
|
@ -169,16 +175,13 @@ cdef class Lexeme:
|
|||
def __get__(self):
|
||||
cdef int length = self.vocab.vectors_length
|
||||
if length == 0:
|
||||
raise ValueError(
|
||||
"Word vectors set to length 0. This may be because you "
|
||||
"don't have a model installed or loaded, or because your "
|
||||
"model doesn't include word vectors. For more info, see "
|
||||
"the documentation: \n%s\n" % about.__docs_models__
|
||||
)
|
||||
raise ValueError(Errors.E010)
|
||||
return self.vocab.get_vector(self.c.orth)
|
||||
|
||||
def __set__(self, vector):
|
||||
assert len(vector) == self.vocab.vectors_length
|
||||
if len(vector) != self.vocab.vectors_length:
|
||||
raise ValueError(Errors.E073.format(new_length=len(vector),
|
||||
length=self.vocab.vectors_length))
|
||||
self.vocab.set_vector(self.c.orth, vector)
|
||||
|
||||
property rank:
|
||||
|
|
|
@ -16,6 +16,7 @@ from .typedefs cimport hash_t
|
|||
from .structs cimport TokenC
|
||||
from .tokens.doc cimport Doc, get_token_attr
|
||||
from .vocab cimport Vocab
|
||||
from .errors import Errors, TempErrors
|
||||
|
||||
from .attrs import IDS
|
||||
from .attrs cimport attr_id_t, ID, NULL_ATTR
|
||||
|
@ -109,7 +110,8 @@ cdef attr_t get_pattern_key(const TokenPatternC* pattern) except 0:
|
|||
while pattern.nr_attr != 0:
|
||||
pattern += 1
|
||||
id_attr = pattern[0].attrs[0]
|
||||
assert id_attr.attr == ID
|
||||
if id_attr.attr != ID:
|
||||
raise ValueError(Errors.E074.format(attr=ID, bad_attr=id_attr.attr))
|
||||
return id_attr.value
|
||||
|
||||
|
||||
|
@ -161,8 +163,8 @@ def _convert_strings(token_specs, string_store):
|
|||
if value in operators:
|
||||
ops = operators[value]
|
||||
else:
|
||||
msg = "Unknown operator '%s'. Options: %s"
|
||||
raise KeyError(msg % (value, ', '.join(operators.keys())))
|
||||
keys = ', '.join(operators.keys())
|
||||
raise KeyError(Errors.E011.format(op=value, opts=keys))
|
||||
if isinstance(attr, basestring):
|
||||
attr = IDS.get(attr.upper())
|
||||
if isinstance(value, basestring):
|
||||
|
@ -264,9 +266,7 @@ cdef class Matcher:
|
|||
"""
|
||||
for pattern in patterns:
|
||||
if len(pattern) == 0:
|
||||
msg = ("Cannot add pattern for zero tokens to matcher.\n"
|
||||
"key: {key}\n")
|
||||
raise ValueError(msg.format(key=key))
|
||||
raise ValueError(Errors.E012.format(key=key))
|
||||
key = self._normalize_key(key)
|
||||
for pattern in patterns:
|
||||
specs = _convert_strings(pattern, self.vocab.strings)
|
||||
|
@ -348,13 +348,12 @@ cdef class Matcher:
|
|||
for state in partials:
|
||||
action = get_action(state.second, token)
|
||||
if action == PANIC:
|
||||
raise Exception("Error selecting action in matcher")
|
||||
raise ValueError(Errors.E013)
|
||||
while action == ADVANCE_ZERO:
|
||||
state.second += 1
|
||||
action = get_action(state.second, token)
|
||||
if action == PANIC:
|
||||
raise Exception("Error selecting action in matcher")
|
||||
|
||||
raise ValueError(Errors.E013)
|
||||
if action == REPEAT:
|
||||
# Leave the state in the queue, and advance to next slot
|
||||
# (i.e. we don't overwrite -- we want to greedily match
|
||||
|
@ -380,7 +379,7 @@ cdef class Matcher:
|
|||
for pattern in self.patterns:
|
||||
action = get_action(pattern, token)
|
||||
if action == PANIC:
|
||||
raise Exception("Error selecting action in matcher")
|
||||
raise ValueError(Errors.E013)
|
||||
while action == ADVANCE_ZERO:
|
||||
pattern += 1
|
||||
action = get_action(pattern, token)
|
||||
|
@ -447,7 +446,7 @@ def get_bilou(length):
|
|||
return [B10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT, I10_ENT,
|
||||
I10_ENT, I10_ENT, L10_ENT]
|
||||
else:
|
||||
raise ValueError("Max length currently 10 for phrase matching")
|
||||
raise ValueError(TempErrors.T001)
|
||||
|
||||
|
||||
cdef class PhraseMatcher:
|
||||
|
@ -506,11 +505,8 @@ cdef class PhraseMatcher:
|
|||
cdef Doc doc
|
||||
for doc in docs:
|
||||
if len(doc) >= self.max_length:
|
||||
msg = (
|
||||
"Pattern length (%d) >= phrase_matcher.max_length (%d). "
|
||||
"Length can be set on initialization, up to 10."
|
||||
)
|
||||
raise ValueError(msg % (len(doc), self.max_length))
|
||||
raise ValueError(TempErrors.T002.format(doc_len=len(doc),
|
||||
max_len=self.max_length))
|
||||
cdef hash_t ent_id = self.matcher._normalize_key(key)
|
||||
self._callbacks[ent_id] = on_match
|
||||
cdef int length
|
||||
|
@@ -562,7 +558,9 @@ cdef class PhraseMatcher:
yield doc

def accept_match(self, Doc doc, int start, int end):
assert (end - start) < self.max_length
if (end - start) >= self.max_length:
raise ValueError(Errors.E075.format(length=end - start,
max_len=self.max_length))
cdef int i, j
for i in range(self.max_length):
self._phrase_key[i] = 0
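The matcher checks above reject empty token patterns (E012), unknown 'OP' values (E011) and phrase patterns longer than max_length (E075/T002). A small usage sketch under the v2 Matcher API, with a made-up pattern name:

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank('en')
matcher = Matcher(nlp.vocab)
# A valid pattern: the optional punctuation token uses a known operator.
matcher.add('HELLO_WORLD', None,
            [{'LOWER': 'hello'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'world'}])
# matcher.add('EMPTY', None, [])          # zero-token pattern -> E012
# {'LOWER': 'hi', 'OP': '$'}              # unknown operator   -> E011
matches = matcher(nlp(u'Hello, world!'))
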
|
||||
|
|
|
@ -9,6 +9,7 @@ from .attrs import LEMMA, intify_attrs
|
|||
from .parts_of_speech cimport SPACE
|
||||
from .parts_of_speech import IDS as POS_IDS
|
||||
from .lexeme cimport Lexeme
|
||||
from .errors import Errors
|
||||
|
||||
|
||||
def _normalize_props(props):
|
||||
|
@ -93,7 +94,7 @@ cdef class Morphology:
|
|||
|
||||
cdef int assign_tag_id(self, TokenC* token, int tag_id) except -1:
|
||||
if tag_id > self.n_tags:
|
||||
raise ValueError("Unknown tag ID: %s" % tag_id)
|
||||
raise ValueError(Errors.E014.format(tag=tag_id))
|
||||
# TODO: It's pretty arbitrary to put this logic here. I guess the
|
||||
# justification is that this is where the specific word and the tag
|
||||
# interact. Still, we should have a better way to enforce this rule, or
|
||||
|
@ -129,7 +130,7 @@ cdef class Morphology:
|
|||
tag (unicode): The part-of-speech tag to key the exception.
|
||||
orth (unicode): The word-form to key the exception.
|
||||
"""
|
||||
# TODO: Currently we've assumed that we know the number of tags --
|
||||
# TODO: Currently we've assumed that we know the number of tags --
|
||||
# RichTagC is an array, and _cache is a PreshMapArray
|
||||
# This is really bad: it makes the morphology typed to the tagger
|
||||
# classes, which is all wrong.
|
||||
|
@ -147,9 +148,7 @@ cdef class Morphology:
|
|||
elif force:
|
||||
memset(cached, 0, sizeof(cached[0]))
|
||||
else:
|
||||
raise ValueError(
|
||||
"Conflicting morphology exception for (%s, %s). Use "
|
||||
"force=True to overwrite." % (tag_str, orth_str))
|
||||
raise ValueError(Errors.E015.format(tag=tag_str, orth=orth_str))
|
||||
|
||||
cached.tag = rich_tag
|
||||
# TODO: Refactor this to take arbitrary attributes.
|
||||
|
|
|
@ -33,6 +33,7 @@ from .parts_of_speech import X
|
|||
from ._ml import Tok2Vec, build_text_classifier, build_tagger_model
|
||||
from ._ml import link_vectors_to_models, zero_init, flatten
|
||||
from ._ml import create_default_optimizer
|
||||
from .errors import Errors, TempErrors
|
||||
from . import util
|
||||
|
||||
|
||||
|
@ -169,7 +170,7 @@ class Pipe(object):
|
|||
problem.
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def create_optimizer(self):
|
||||
return create_default_optimizer(self.model.ops,
|
||||
**self.cfg.get('optimizer', {}))
|
||||
|
@ -336,7 +337,8 @@ class Tensorizer(Pipe):
|
|||
tensors (object): Vector representation for each token in the docs.
|
||||
"""
|
||||
for doc, tensor in zip(docs, tensors):
|
||||
assert tensor.shape[0] == len(doc)
|
||||
if tensor.shape[0] != len(doc):
|
||||
raise ValueError(Errors.E076.format(rows=tensor.shape[0], words=len(doc)))
|
||||
doc.tensor = tensor
|
||||
|
||||
def update(self, docs, golds, state=None, drop=0., sgd=None, losses=None):
|
||||
|
@ -550,9 +552,7 @@ class Tagger(Pipe):
|
|||
# copy_array(larger.W[:smaller.nO], smaller.W)
|
||||
# copy_array(larger.b[:smaller.nO], smaller.b)
|
||||
# self.model._layers[-1] = larger
|
||||
raise ValueError(
|
||||
"Resizing pre-trained Tagger models is not "
|
||||
"currently supported.")
|
||||
raise ValueError(TempErrors.T003)
|
||||
tag_map = dict(self.vocab.morphology.tag_map)
|
||||
if values is None:
|
||||
values = {POS: "X"}
|
||||
|
@ -671,8 +671,7 @@ class MultitaskObjective(Tagger):
|
|||
elif hasattr(target, '__call__'):
|
||||
self.make_label = target
|
||||
else:
|
||||
raise ValueError("MultitaskObjective target should be function or "
|
||||
"one of: dep, tag, ent, dep_tag_offset, ent_tag.")
|
||||
raise ValueError(Errors.E016)
|
||||
self.cfg = dict(cfg)
|
||||
self.cfg.setdefault('cnn_maxout_pieces', 2)
|
||||
|
||||
|
@ -723,7 +722,9 @@ class MultitaskObjective(Tagger):
|
|||
return tokvecs, scores
|
||||
|
||||
def get_loss(self, docs, golds, scores):
|
||||
assert len(docs) == len(golds)
|
||||
if len(docs) != len(golds):
|
||||
raise ValueError(Errors.E077.format(value='loss', n_docs=len(docs),
|
||||
n_golds=len(golds)))
|
||||
cdef int idx = 0
|
||||
correct = numpy.zeros((scores.shape[0],), dtype='i')
|
||||
guesses = scores.argmax(axis=1)
|
||||
|
@ -936,7 +937,7 @@ cdef class DependencyParser(Parser):
|
|||
@property
|
||||
def postprocesses(self):
|
||||
return [nonproj.deprojectivize]
|
||||
|
||||
|
||||
def add_multitask_objective(self, target):
|
||||
labeller = MultitaskObjective(self.vocab, target=target)
|
||||
self._multitasks.append(labeller)
|
||||
|
@ -957,7 +958,7 @@ cdef class EntityRecognizer(Parser):
|
|||
TransitionSystem = BiluoPushDown
|
||||
|
||||
nr_feature = 6
|
||||
|
||||
|
||||
def add_multitask_objective(self, target):
|
||||
labeller = MultitaskObjective(self.vocab, target=target)
|
||||
self._multitasks.append(labeller)
|
||||
|
|
|
@@ -2,6 +2,7 @@
from __future__ import division, print_function, unicode_literals

from .gold import tags_to_entities
from .errors import Errors


class PRFScore(object):

@@ -84,7 +85,8 @@ class Scorer(object):
}

def score(self, tokens, gold, verbose=False, punct_labels=('p', 'punct')):
assert len(tokens) == len(gold)
if len(tokens) != len(gold):
raise ValueError(Errors.E078.format(words_doc=len(tokens), words_gold=len(gold)))
gold_deps = set()
gold_tags = set()
gold_ents = set(tags_to_entities([annot[-1]
|
||||
|
|
|
@ -13,6 +13,7 @@ from .symbols import IDS as SYMBOLS_BY_STR
|
|||
from .symbols import NAMES as SYMBOLS_BY_INT
|
||||
from .typedefs cimport hash_t
|
||||
from .compat import json_dumps
|
||||
from .errors import Errors
|
||||
from . import util
|
||||
|
||||
|
||||
|
@ -59,7 +60,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
|||
string.p = <unsigned char*>mem.alloc(length + 1, sizeof(unsigned char))
|
||||
string.p[0] = length
|
||||
memcpy(&string.p[1], chars, length)
|
||||
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||
return string
|
||||
else:
|
||||
i = 0
|
||||
|
@ -69,7 +69,6 @@ cdef Utf8Str* _allocate(Pool mem, const unsigned char* chars, uint32_t length) e
|
|||
string.p[i] = 255
|
||||
string.p[n_length_bytes-1] = length % 255
|
||||
memcpy(&string.p[n_length_bytes], chars, length)
|
||||
assert string.s[0] >= sizeof(string.s) or string.s[0] == 0, string.s[0]
|
||||
return string
|
||||
|
||||
|
||||
|
@ -115,7 +114,7 @@ cdef class StringStore:
|
|||
self.hits.insert(key)
|
||||
utf8str = <Utf8Str*>self._map.get(key)
|
||||
if utf8str is NULL:
|
||||
raise KeyError(string_or_id)
|
||||
raise KeyError(Errors.E018.format(hash_value=string_or_id))
|
||||
else:
|
||||
return decode_Utf8Str(utf8str)
|
||||
|
||||
|
@ -136,8 +135,7 @@ cdef class StringStore:
|
|||
key = hash_utf8(string, len(string))
|
||||
self._intern_utf8(string, len(string))
|
||||
else:
|
||||
raise TypeError(
|
||||
"Can only add unicode or bytes. Got type: %s" % type(string))
|
||||
raise TypeError(Errors.E017.format(value_type=type(string)))
|
||||
return key
|
||||
|
||||
def __len__(self):
|
||||
|
|
|
@ -10,6 +10,7 @@ from thinc.extra.search cimport MaxViolation
|
|||
|
||||
from .transition_system cimport TransitionSystem, Transition
|
||||
from ..gold cimport GoldParse
|
||||
from ..errors import Errors
|
||||
from .stateclass cimport StateC, StateClass
|
||||
|
||||
|
||||
|
@ -220,7 +221,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
|||
p_indices = []
|
||||
g_indices = []
|
||||
cdef Beam pbeam, gbeam
|
||||
assert len(pbeams) == len(gbeams)
|
||||
if len(pbeams) != len(gbeams):
|
||||
raise ValueError(Errors.E079.format(pbeams=len(pbeams), gbeams=len(gbeams)))
|
||||
for eg_id, (pbeam, gbeam) in enumerate(zip(pbeams, gbeams)):
|
||||
p_indices.append([])
|
||||
g_indices.append([])
|
||||
|
@ -228,7 +230,8 @@ def get_states(pbeams, gbeams, beam_map, nr_update):
|
|||
state = StateClass.borrow(<StateC*>pbeam.at(i))
|
||||
if not state.is_final():
|
||||
key = tuple([eg_id] + pbeam.histories[i])
|
||||
assert key not in seen, (key, seen)
|
||||
if key in seen:
|
||||
raise ValueError(Errors.E080.format(key=key))
|
||||
seen[key] = len(states)
|
||||
p_indices[-1].append(len(states))
|
||||
states.append(state)
|
||||
|
@ -271,7 +274,8 @@ def get_gradient(nr_class, beam_maps, histories, losses):
|
|||
for i in range(nr_step):
|
||||
grads.append(numpy.zeros((max(beam_maps[i].values())+1, nr_class),
|
||||
dtype='f'))
|
||||
assert len(histories) == len(losses)
|
||||
if len(histories) != len(losses):
|
||||
raise ValueError(Errors.E081.format(n_hist=len(histories), losses=len(losses)))
|
||||
for eg_id, hists in enumerate(histories):
|
||||
for loss, hist in zip(losses[eg_id], hists):
|
||||
if loss == 0.0 or numpy.isnan(loss):
|
||||
|
|
|
@ -15,6 +15,7 @@ from .nonproj import is_nonproj_tree
|
|||
from .transition_system cimport move_cost_func_t, label_cost_func_t
|
||||
from ..gold cimport GoldParse, GoldParseC
|
||||
from ..structs cimport TokenC
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
DEF NON_MONOTONIC = True
|
||||
|
@ -455,7 +456,7 @@ cdef class ArcEager(TransitionSystem):
|
|||
t.do = Break.transition
|
||||
t.get_cost = Break.cost
|
||||
else:
|
||||
raise Exception(move)
|
||||
raise ValueError(Errors.E019.format(action=move, src='arc_eager'))
|
||||
return t
|
||||
|
||||
cdef int initialize_state(self, StateC* st) nogil:
|
||||
|
@@ -529,28 +530,11 @@ cdef class ArcEager(TransitionSystem):
if n_gold < 1:
# Check projectivity --- leading cause
if is_nonproj_tree(gold.heads):
raise ValueError(
"Could not find a gold-standard action to supervise the "
"dependency parser. Likely cause: the tree is "
"non-projective (i.e. it has crossing arcs -- see "
"spacy/syntax/nonproj.pyx for definitions). The ArcEager "
"transition system only supports projective trees. To "
"learn non-projective representations, transform the data "
"before training and after parsing. Either pass "
"make_projective=True to the GoldParse class, or use "
"spacy.syntax.nonproj.preprocess_training_data.")
raise ValueError(Errors.E020)
else:
print(gold.orig_annot)
print(gold.words)
print(gold.heads)
print(gold.labels)
print(gold.sent_starts)
raise ValueError(
"Could not find a gold-standard action to supervise the"
"dependency parser. The GoldParse was projective. The "
"transition system has %d actions. State at failure: %s"
% (self.n_moves, stcls.print_state(gold.words)))
assert n_gold >= 1
failure_state = stcls.print_state(gold.words)
raise ValueError(Errors.E021.format(n_actions=self.n_moves,
state=failure_state))
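The removed message (now E020) names the usual cause, a non-projective gold tree that ArcEager cannot produce, and suggests either make_projective=True on GoldParse or nonproj.preprocess_training_data. A rough sketch of the first option, with a toy sentence and made-up head indices:

import spacy
from spacy.gold import GoldParse

nlp = spacy.blank('en')
doc = nlp.make_doc(u'I like green eggs')            # toy example, 4 tokens
heads = [1, 1, 3, 1]                                # assumed toy head indices
deps = ['nsubj', 'ROOT', 'amod', 'dobj']
# Projectivise the tree up front so ArcEager can always find a gold action.
gold = GoldParse(doc, heads=heads, deps=deps, make_projective=True)
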
|
||||
|
||||
def get_beam_annot(self, Beam beam):
|
||||
length = (<StateC*>beam.at(0)).length
|
||||
|
|
|
@ -10,6 +10,7 @@ from ._state cimport StateC
|
|||
from .transition_system cimport Transition
|
||||
from .transition_system cimport do_func_t
|
||||
from ..gold cimport GoldParseC, GoldParse
|
||||
from ..errors import Errors
|
||||
|
||||
|
||||
cdef enum:
|
||||
|
@ -173,7 +174,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
if self.c[i].move == move and self.c[i].label == label:
|
||||
return self.c[i]
|
||||
else:
|
||||
raise KeyError(name)
|
||||
raise KeyError(Errors.E022.format(name=name))
|
||||
|
||||
cdef Transition init_transition(self, int clas, int move, attr_t label) except *:
|
||||
# TODO: Apparent Cython bug here when we try to use the Transition()
|
||||
|
@ -208,7 +209,7 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
t.do = Out.transition
|
||||
t.get_cost = Out.cost
|
||||
else:
|
||||
raise Exception(move)
|
||||
raise ValueError(Errors.E019.format(action=move, src='ner'))
|
||||
return t
|
||||
|
||||
def add_action(self, int action, label_name):
|
||||
|
@ -230,7 +231,6 @@ cdef class BiluoPushDown(TransitionSystem):
|
|||
self._size *= 2
|
||||
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
assert self.c[self.n_moves].label == label_id
|
||||
self.n_moves += 1
|
||||
return 1
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ from .._ml import link_vectors_to_models, create_default_optimizer
|
|||
from ..compat import json_dumps, copy_array
|
||||
from ..tokens.doc cimport Doc
|
||||
from ..gold cimport GoldParse
|
||||
from ..errors import Errors, TempErrors
|
||||
from .. import util
|
||||
from .stateclass cimport StateClass
|
||||
from ._state cimport StateC
|
||||
|
@ -242,7 +243,7 @@ cdef class Parser:
|
|||
def Model(cls, nr_class, **cfg):
|
||||
depth = util.env_opt('parser_hidden_depth', cfg.get('hidden_depth', 1))
|
||||
if depth != 1:
|
||||
raise ValueError("Currently parser depth is hard-coded to 1.")
|
||||
raise ValueError(TempErrors.T004.format(value=depth))
|
||||
parser_maxout_pieces = util.env_opt('parser_maxout_pieces',
|
||||
cfg.get('maxout_pieces', 2))
|
||||
token_vector_width = util.env_opt('token_vector_width',
|
||||
|
@ -252,9 +253,9 @@ cdef class Parser:
|
|||
hist_size = util.env_opt('history_feats', cfg.get('hist_size', 0))
|
||||
hist_width = util.env_opt('history_width', cfg.get('hist_width', 0))
|
||||
if hist_size != 0:
|
||||
raise ValueError("Currently history size is hard-coded to 0")
|
||||
raise ValueError(TempErrors.T005.format(value=hist_size))
|
||||
if hist_width != 0:
|
||||
raise ValueError("Currently history width is hard-coded to 0")
|
||||
raise ValueError(TempErrors.T006.format(value=hist_width))
|
||||
pretrained_vectors = cfg.get('pretrained_vectors', None)
|
||||
tok2vec = Tok2Vec(token_vector_width, embed_size,
|
||||
pretrained_vectors=pretrained_vectors)
|
||||
|
@ -431,7 +432,7 @@ cdef class Parser:
|
|||
[len(doc) for doc in docs])
|
||||
return state_objs, tokvecs
|
||||
|
||||
cdef void _parseC(self, StateC* state,
|
||||
cdef void _parseC(self, StateC* state,
|
||||
const float* feat_weights, const float* bias,
|
||||
const float* hW, const float* hb,
|
||||
int nr_class, int nr_hidden, int nr_feat, int nr_piece) nogil:
|
||||
|
@ -542,7 +543,9 @@ cdef class Parser:
|
|||
def update(self, docs, golds, drop=0., sgd=None, losses=None):
|
||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||
return None
|
||||
assert len(docs) == len(golds)
|
||||
if len(docs) != len(golds):
|
||||
raise ValueError(Errors.E077.format(value='update', n_docs=len(docs),
|
||||
n_golds=len(golds)))
|
||||
if self.cfg.get('beam_width', 1) >= 2 and numpy.random.random() >= 0.0:
|
||||
return self.update_beam(docs, golds,
|
||||
self.cfg['beam_width'], self.cfg['beam_density'],
|
||||
|
@ -608,7 +611,7 @@ cdef class Parser:
|
|||
break
|
||||
self._make_updates(d_tokvecs,
|
||||
bp_tokvecs, backprops, sgd, cuda_stream)
|
||||
|
||||
|
||||
def update_beam(self, docs, golds, width=None, density=None,
|
||||
drop=0., sgd=None, losses=None):
|
||||
if not any(self.moves.has_gold(gold) for gold in golds):
|
||||
|
@ -622,7 +625,6 @@ cdef class Parser:
|
|||
if losses is not None and self.name not in losses:
|
||||
losses[self.name] = 0.
|
||||
lengths = [len(d) for d in docs]
|
||||
assert min(lengths) >= 1
|
||||
states = self.moves.init_batch(docs)
|
||||
for gold in golds:
|
||||
self.moves.preprocess_gold(gold)
|
||||
|
@ -851,7 +853,7 @@ cdef class Parser:
|
|||
def add_multitask_objective(self, target):
|
||||
# Defined in subclasses, to avoid circular import
|
||||
raise NotImplementedError
|
||||
|
||||
|
||||
def init_multitask_objectives(self, gold_tuples, pipeline, **cfg):
|
||||
'''Setup models for secondary objectives, to benefit from multi-task
|
||||
learning. This method is intended to be overridden by subclasses.
|
||||
|
@ -1021,15 +1023,11 @@ def _cleanup(Beam beam):
|
|||
del state
|
||||
seen.add(addr)
|
||||
else:
|
||||
print(i, addr)
|
||||
print(seen)
|
||||
raise Exception
|
||||
raise ValueError(Errors.E023.format(addr=addr, i=i))
|
||||
addr = <size_t>beam._states[i].content
|
||||
if addr not in seen:
|
||||
state = <StateC*>addr
|
||||
del state
|
||||
seen.add(addr)
|
||||
else:
|
||||
print(i, addr)
|
||||
print(seen)
|
||||
raise Exception
|
||||
raise ValueError(Errors.E023.format(addr=addr, i=i))
|
||||
|
|
|
@@ -10,6 +10,7 @@ from __future__ import unicode_literals
from copy import copy

from ..tokens.doc cimport Doc
from ..errors import Errors


DELIMITER = '||'

@@ -131,7 +132,10 @@ cpdef deprojectivize(Doc doc):

def _decorate(heads, proj_heads, labels):
# uses decoration scheme HEAD from Nivre & Nilsson 2005
assert(len(heads) == len(proj_heads) == len(labels))
if (len(heads) != len(proj_heads)) or (len(proj_heads) != len(labels)):
raise ValueError(Errors.E082.format(n_heads=len(heads),
n_proj_heads=len(proj_heads),
n_labels=len(labels)))
deco_labels = []
for tokenid, head in enumerate(heads):
if head != proj_heads[tokenid]:
|
||||
|
|
|
@ -12,6 +12,7 @@ from ..structs cimport TokenC
|
|||
from .stateclass cimport StateClass
|
||||
from ..typedefs cimport attr_t
|
||||
from ..compat import json_dumps
|
||||
from ..errors import Errors
|
||||
from .. import util
|
||||
|
||||
|
||||
|
@ -80,10 +81,7 @@ cdef class TransitionSystem:
|
|||
action.do(state.c, action.label)
|
||||
break
|
||||
else:
|
||||
print(gold.words)
|
||||
print(gold.ner)
|
||||
print(history)
|
||||
raise ValueError("Could not find gold move")
|
||||
raise ValueError(Errors.E024)
|
||||
return history
|
||||
|
||||
cdef int initialize_state(self, StateC* state) nogil:
|
||||
|
@ -130,17 +128,7 @@ cdef class TransitionSystem:
|
|||
else:
|
||||
costs[i] = 9000
|
||||
if n_gold <= 0:
|
||||
print(gold.words)
|
||||
print(gold.ner)
|
||||
print([gold.c.ner[i].clas for i in range(gold.length)])
|
||||
print([gold.c.ner[i].move for i in range(gold.length)])
|
||||
print([gold.c.ner[i].label for i in range(gold.length)])
|
||||
print("Self labels",
|
||||
[self.c[i].label for i in range(self.n_moves)])
|
||||
raise ValueError(
|
||||
"Could not find a gold-standard action to supervise "
|
||||
"the entity recognizer. The transition system has "
|
||||
"%d actions." % (self.n_moves))
|
||||
raise ValueError(Errors.E024)
|
||||
|
||||
def get_class_name(self, int clas):
|
||||
act = self.c[clas]
|
||||
|
@ -162,7 +150,6 @@ cdef class TransitionSystem:
|
|||
self._size *= 2
|
||||
self.c = <Transition*>self.mem.realloc(self.c, self._size * sizeof(self.c[0]))
|
||||
self.c[self.n_moves] = self.init_transition(self.n_moves, action, label_id)
|
||||
assert self.c[self.n_moves].label == label_id
|
||||
self.n_moves += 1
|
||||
return 1
|
||||
|
||||
|
|
|
@@ -13,6 +13,7 @@ cimport cython

from .tokens.doc cimport Doc
from .strings cimport hash_string
from .errors import Errors, Warnings, deprecation_warning
from . import util


@@ -63,11 +64,7 @@ cdef class Tokenizer:
return (self.__class__, args, None, None)

cpdef Doc tokens_from_list(self, list strings):
util.deprecated(
"Tokenizer.from_list is now deprecated. Create a new Doc "
"object instead and pass in the strings as the `words` keyword "
"argument, for example:\nfrom spacy.tokens import Doc\n"
"doc = Doc(nlp.vocab, words=[...])")
deprecation_warning(Warnings.W002)
return Doc(self.vocab, words=strings)
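The deprecation text now behind Warnings.W002 already shows the recommended replacement; spelled out as a runnable snippet (blank English pipeline and toy tokens assumed):

import spacy
from spacy.tokens import Doc

nlp = spacy.blank('en')
# Build a Doc from pre-tokenized text instead of Tokenizer.tokens_from_list().
doc = Doc(nlp.vocab, words=[u'Hello', u',', u'world', u'!'],
          spaces=[False, True, False, False])
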

@cython.boundscheck(False)

@@ -78,8 +75,7 @@ cdef class Tokenizer:
RETURNS (Doc): A container for linguistic annotations.
"""
if len(string) >= (2 ** 30):
msg = "String is too long: %d characters. Max is 2**30."
raise ValueError(msg % len(string))
raise ValueError(Errors.E025.format(length=len(string)))
cdef int length = len(string)
cdef Doc doc = Doc(self.vocab)
if length == 0:
|
||||
|
|
|
@ -31,7 +31,7 @@ from ..attrs cimport ENT_TYPE, SENT_START
|
|||
from ..parts_of_speech cimport CCONJ, PUNCT, NOUN, univ_pos_t
|
||||
from ..util import normalize_slice
|
||||
from ..compat import is_config, copy_reg, pickle, basestring_
|
||||
from .. import about
|
||||
from ..errors import Errors, Warnings, deprecation_warning
|
||||
from .. import util
|
||||
from .underscore import Underscore
|
||||
from ._retokenize import Retokenizer
|
||||
|
@ -41,9 +41,9 @@ DEF PADDING = 5
|
|||
|
||||
cdef int bounds_check(int i, int length, int padding) except -1:
|
||||
if (i + padding) < 0:
|
||||
raise IndexError
|
||||
raise IndexError(Errors.E026.format(i=i, length=length))
|
||||
if (i - padding) >= length:
|
||||
raise IndexError
|
||||
raise IndexError(Errors.E026.format(i=i, length=length))
|
||||
|
||||
|
||||
cdef attr_t get_token_attr(const TokenC* token, attr_id_t feat_name) nogil:
|
||||
|
@ -98,7 +98,8 @@ cdef class Doc:
|
|||
def set_extension(cls, name, default=None, method=None,
|
||||
getter=None, setter=None):
|
||||
nr_defined = sum(t is not None for t in (default, getter, setter, method))
|
||||
assert nr_defined == 1
|
||||
if nr_defined != 1:
|
||||
raise ValueError(Errors.E083.format(n_args=nr_defined))
|
||||
Underscore.doc_extensions[name] = (default, method, getter, setter)
|
||||
|
||||
@classmethod
|
||||
|
@ -155,11 +156,7 @@ cdef class Doc:
|
|||
if spaces is None:
|
||||
spaces = [True] * len(words)
|
||||
elif len(spaces) != len(words):
|
||||
raise ValueError(
|
||||
"Arguments 'words' and 'spaces' should be sequences of "
|
||||
"the same length, or 'spaces' should be left default at "
|
||||
"None. spaces should be a sequence of booleans, with True "
|
||||
"meaning that the word owns a ' ' character following it.")
|
||||
raise ValueError(Errors.E027)
|
||||
orths_and_spaces = zip(words, spaces)
|
||||
if orths_and_spaces is not None:
|
||||
for orth_space in orths_and_spaces:
|
||||
|
@ -167,10 +164,7 @@ cdef class Doc:
|
|||
orth = orth_space
|
||||
has_space = True
|
||||
elif isinstance(orth_space, bytes):
|
||||
raise ValueError(
|
||||
"orths_and_spaces expects either List(unicode) or "
|
||||
"List((unicode, bool)). "
|
||||
"Got bytes instance: %s" % (str(orth_space)))
|
||||
raise ValueError(Errors.E028.format(value=orth_space))
|
||||
else:
|
||||
orth, has_space = orth_space
|
||||
# Note that we pass self.mem here --- we have ownership, if LexemeC
|
||||
|
@@ -504,11 +498,7 @@ cdef class Doc:
"""
def __get__(self):
if not self.is_parsed:
raise ValueError(
"noun_chunks requires the dependency parse, which "
"requires a statistical model to be installed and loaded. "
"For more info, see the "
"documentation: \n%s\n" % about.__docs_models__)
raise ValueError(Errors.E029)
# Accumulate the result before beginning to iterate over it. This
# prevents the tokenisation from being changed out from under us
# during the iteration. The tricky thing here is that Span accepts

@@ -533,12 +523,7 @@ cdef class Doc:
"""
def __get__(self):
if not self.is_sentenced:
raise ValueError(
"Sentence boundaries unset. You can add the 'sentencizer' "
"component to the pipeline with: "
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
"Alternatively, add the dependency parser, or set "
"sentence boundaries by setting doc[i].sent_start")
raise ValueError(Errors.E030)
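The old message, kept as the text behind E030, already names the fix; as a small sketch:

import spacy

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe('sentencizer'))   # rule-based sentence boundaries
doc = nlp(u'This is a sentence. This is another.')
sents = list(doc.sents)                        # no E030 once boundaries are set
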
|
||||
if 'sents' in self.user_hooks:
|
||||
yield from self.user_hooks['sents'](self)
|
||||
else:
|
||||
|
@ -568,7 +553,8 @@ cdef class Doc:
|
|||
t.idx = (t-1).idx + (t-1).lex.length + (t-1).spacy
|
||||
t.l_edge = self.length
|
||||
t.r_edge = self.length
|
||||
assert t.lex.orth != 0
|
||||
if t.lex.orth == 0:
|
||||
raise ValueError(Errors.E031.format(i=self.length))
|
||||
t.spacy = has_space
|
||||
self.length += 1
|
||||
return t.idx + t.lex.length + t.spacy
|
||||
|
@ -684,13 +670,7 @@ cdef class Doc:
|
|||
|
||||
def from_array(self, attrs, array):
|
||||
if SENT_START in attrs and HEAD in attrs:
|
||||
raise ValueError(
|
||||
"Conflicting attributes specified in doc.from_array(): "
|
||||
"(HEAD, SENT_START)\n"
|
||||
"The HEAD attribute currently sets sentence boundaries "
|
||||
"implicitly, based on the tree structure. This means the HEAD "
|
||||
"attribute would potentially override the sentence boundaries "
|
||||
"set by SENT_START.")
|
||||
raise ValueError(Errors.E032)
|
||||
cdef int i, col
|
||||
cdef attr_id_t attr_id
|
||||
cdef TokenC* tokens = self.c
|
||||
|
@ -828,7 +808,7 @@ cdef class Doc:
|
|||
RETURNS (Doc): Itself.
|
||||
"""
|
||||
if self.length != 0:
|
||||
raise ValueError("Cannot load into non-empty Doc")
|
||||
raise ValueError(Errors.E033.format(length=self.length))
|
||||
deserializers = {
|
||||
'text': lambda b: None,
|
||||
'array_head': lambda b: None,
|
||||
|
@ -916,10 +896,7 @@ cdef class Doc:
|
|||
"""
|
||||
cdef unicode tag, lemma, ent_type
|
||||
if len(args) == 3:
|
||||
util.deprecated(
|
||||
"Positional arguments to Doc.merge are deprecated. Instead, "
|
||||
"use the keyword arguments, for example tag=, lemma= or "
|
||||
"ent_type=.")
|
||||
deprecation_warning(Warnings.W003)
|
||||
tag, lemma, ent_type = args
|
||||
attributes[TAG] = tag
|
||||
attributes[LEMMA] = lemma
|
||||
|
@ -933,13 +910,9 @@ cdef class Doc:
|
|||
if 'ent_type' in attributes:
|
||||
attributes[ENT_TYPE] = attributes['ent_type']
|
||||
elif args:
|
||||
raise ValueError(
|
||||
"Doc.merge received %d non-keyword arguments. Expected either "
|
||||
"3 arguments (deprecated), or 0 (use keyword arguments). "
|
||||
"Arguments supplied:\n%s\n"
|
||||
"Keyword arguments: %s\n" % (len(args), repr(args),
|
||||
repr(attributes)))
|
||||
|
||||
raise ValueError(Errors.E034.format(n_args=len(args),
|
||||
args=repr(args),
|
||||
kwargs=repr(attributes)))
|
||||
# More deprecated attribute handling =/
|
||||
if 'label' in attributes:
|
||||
attributes['ent_type'] = attributes.pop('label')
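Both deprecation paths above (W003 and E034) push callers toward keyword arguments for Doc.merge; a brief sketch of merging a span that way, with placeholder tag, lemma and entity label values:

import spacy

nlp = spacy.blank('en')
doc = nlp(u'I visited New York last week')
# Keyword arguments instead of the deprecated positional (tag, lemma, ent_type).
doc.merge(doc[2].idx, doc[3].idx + len(doc[3]),
          tag=u'NNP', lemma=u'New York', ent_type=u'GPE')
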
|
||||
|
|
|
@ -16,7 +16,7 @@ from ..util import normalize_slice
|
|||
from ..attrs cimport IS_PUNCT, IS_SPACE
|
||||
from ..lexeme cimport Lexeme
|
||||
from ..compat import is_config
|
||||
from .. import about
|
||||
from ..errors import Errors, TempErrors
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
|
@ -48,8 +48,7 @@ cdef class Span:
|
|||
RETURNS (Span): The newly constructed object.
|
||||
"""
|
||||
if not (0 <= start <= end <= len(doc)):
|
||||
raise IndexError
|
||||
|
||||
raise IndexError(Errors.E035.format(start=start, end=end, length=len(doc)))
|
||||
self.doc = doc
|
||||
self.start = start
|
||||
self.start_char = self.doc[start].idx if start < self.doc.length else 0
|
||||
|
@ -58,7 +57,8 @@ cdef class Span:
|
|||
self.end_char = self.doc[end - 1].idx + len(self.doc[end - 1])
|
||||
else:
|
||||
self.end_char = 0
|
||||
assert label in doc.vocab.strings, label
|
||||
if label not in doc.vocab.strings:
|
||||
raise ValueError(Errors.E084.format(label=label))
|
||||
self.label = label
|
||||
self._vector = vector
|
||||
self._vector_norm = vector_norm
|
||||
|
@ -267,11 +267,10 @@ cdef class Span:
|
|||
or (self.doc.c[self.end-1].idx + self.doc.c[self.end-1].lex.length) != self.end_char:
|
||||
start = token_by_start(self.doc.c, self.doc.length, self.start_char)
|
||||
if self.start == -1:
|
||||
raise IndexError("Error calculating span: Can't find start")
|
||||
raise IndexError(Errors.E036.format(start=self.start_char))
|
||||
end = token_by_end(self.doc.c, self.doc.length, self.end_char)
|
||||
if end == -1:
|
||||
raise IndexError("Error calculating span: Can't find end")
|
||||
|
||||
raise IndexError(Errors.E037.format(end=self.end_char))
|
||||
self.start = start
|
||||
self.end = end + 1
|
||||
|
||||
|
@ -293,7 +292,7 @@ cdef class Span:
|
|||
root += root.head
|
||||
n += 1
|
||||
if n >= self.doc.length:
|
||||
raise RuntimeError
|
||||
raise RuntimeError(Errors.E038)
|
||||
return self.doc[root.l_edge:root.r_edge + 1]
|
||||
|
||||
property has_vector:
|
||||
|
@ -376,11 +375,7 @@ cdef class Span:
|
|||
"""
|
||||
def __get__(self):
|
||||
if not self.doc.is_parsed:
|
||||
raise ValueError(
|
||||
"noun_chunks requires the dependency parse, which "
|
||||
"requires a statistical model to be installed and loaded. "
|
||||
"For more info, see the "
|
||||
"documentation: \n%s\n" % about.__docs_models__)
|
||||
raise ValueError(Errors.E029)
|
||||
# Accumulate the result before beginning to iterate over it. This
|
||||
# prevents the tokenisation from being changed out from under us
|
||||
# during the iteration. The tricky thing here is that Span accepts
|
||||
|
@ -526,9 +521,7 @@ cdef class Span:
|
|||
return self.root.ent_id
|
||||
|
||||
def __set__(self, hash_t key):
|
||||
raise NotImplementedError(
|
||||
"Can't yet set ent_id from Span. Vote for this feature on "
|
||||
"the issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
raise NotImplementedError(TempErrors.T007.format(attr='ent_id'))
|
||||
|
||||
property ent_id_:
|
||||
"""RETURNS (unicode): The (string) entity ID."""
|
||||
|
@ -536,9 +529,7 @@ cdef class Span:
|
|||
return self.root.ent_id_
|
||||
|
||||
def __set__(self, hash_t key):
|
||||
raise NotImplementedError(
|
||||
"Can't yet set ent_id_ from Span. Vote for this feature on the "
|
||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||
raise NotImplementedError(TempErrors.T007.format(attr='ent_id_'))
|
||||
|
||||
property orth_:
|
||||
"""Verbatim text content (identical to Span.text). Exists mostly for
|
||||
|
@ -586,9 +577,5 @@ cdef int _count_words_to_root(const TokenC* token, int sent_length) except -1:
|
|||
token += token.head
|
||||
n += 1
|
||||
if n >= sent_length:
|
||||
raise RuntimeError(
|
||||
"Array bounds exceeded while searching for root word. This "
|
||||
"likely means the parse tree is in an invalid state. Please "
|
||||
"report this issue here: "
|
||||
"http://github.com/explosion/spaCy/issues")
|
||||
raise RuntimeError(Errors.E039)
|
||||
return n
|
||||
|
|
|
@@ -6,6 +6,7 @@ from ..typedefs cimport attr_t, flags_t
from ..parts_of_speech cimport univ_pos_t
from .doc cimport Doc
from ..lexeme cimport Lexeme
from ..errors import Errors


cdef class Token:

@@ -17,8 +18,7 @@ cdef class Token:
@staticmethod
cdef inline Token cinit(Vocab vocab, const TokenC* token, int offset, Doc doc):
if offset < 0 or offset >= doc.length:
msg = "Attempt to access token at %d, max length %d"
raise IndexError(msg % (offset, doc.length))
raise IndexError(Errors.E040.format(i=offset, max_length=doc.length))
cdef Token self = Token.__new__(Token, vocab, doc, offset)
return self
|
||||
|
||||
|
|
|
@ -19,8 +19,8 @@ from ..attrs cimport IS_OOV, IS_TITLE, IS_UPPER, IS_CURRENCY, LIKE_URL, LIKE_NUM
|
|||
from ..attrs cimport IS_STOP, ID, ORTH, NORM, LOWER, SHAPE, PREFIX, SUFFIX
|
||||
from ..attrs cimport LENGTH, CLUSTER, LEMMA, POS, TAG, DEP
|
||||
from ..compat import is_config
|
||||
from ..errors import Errors
|
||||
from .. import util
|
||||
from .. import about
|
||||
from .underscore import Underscore
|
||||
|
||||
|
||||
|
@ -106,7 +106,7 @@ cdef class Token:
|
|||
elif op == 5:
|
||||
return my >= their
|
||||
else:
|
||||
raise ValueError(op)
|
||||
raise ValueError(Errors.E041.format(op=op))
|
||||
|
||||
@property
|
||||
def _(self):
|
||||
|
@ -135,8 +135,7 @@ cdef class Token:
|
|||
RETURNS (Token): The token at position `self.doc[self.i+i]`.
|
||||
"""
|
||||
if self.i+i < 0 or (self.i+i >= len(self.doc)):
|
||||
msg = "Error accessing doc[%d].nbor(%d), for doc of length %d"
|
||||
raise IndexError(msg % (self.i, i, len(self.doc)))
|
||||
raise IndexError(Errors.E042.format(i=self.i, j=i, length=len(self.doc)))
|
||||
return self.doc[self.i+i]
|
||||
|
||||
def similarity(self, other):
|
||||
|
@ -352,14 +351,7 @@ cdef class Token:
|
|||
|
||||
property sent_start:
|
||||
def __get__(self):
|
||||
# Raising a deprecation warning causes errors for autocomplete
|
||||
#util.deprecated(
|
||||
# "Token.sent_start is now deprecated. Use Token.is_sent_start "
|
||||
# "instead, which returns a boolean value or None if the answer "
|
||||
# "is unknown – instead of a misleading 0 for False and 1 for "
|
||||
# "True. It also fixes a quirk in the old logic that would "
|
||||
# "always set the property to 0 for the first word of the "
|
||||
# "document.")
|
||||
# Raising a deprecation warning here causes errors for autocomplete
|
||||
# Handle broken backwards compatibility case: doc[0].sent_start
|
||||
# was False.
|
||||
if self.i == 0:
|
||||
|
@ -384,9 +376,7 @@ cdef class Token:
|
|||
|
||||
def __set__(self, value):
|
||||
if self.doc.is_parsed:
|
||||
raise ValueError(
|
||||
"Refusing to write to token.sent_start if its document "
|
||||
"is parsed, because this may cause inconsistent state.")
|
||||
raise ValueError(Errors.E043)
|
||||
if value is None:
|
||||
self.c.sent_start = 0
|
||||
elif value is True:
|
||||
|
@ -394,8 +384,7 @@ cdef class Token:
|
|||
elif value is False:
|
||||
self.c.sent_start = -1
|
||||
else:
|
||||
raise ValueError("Invalid value for token.sent_start. Must be "
|
||||
"one of: None, True, False")
|
||||
raise ValueError(Errors.E044.format(value=value))
|
||||
|
||||
property lefts:
|
||||
"""The leftward immediate children of the word, in the syntactic
|
||||
|
@ -413,8 +402,7 @@ cdef class Token:
|
|||
nr_iter += 1
|
||||
# This is ugly, but it's a way to guard out infinite loops
|
||||
if nr_iter >= 10000000:
|
||||
raise RuntimeError("Possibly infinite loop encountered "
|
||||
"while looking for token.lefts")
|
||||
raise RuntimeError(Errors.E045.format(attr='token.lefts'))
|
||||
|
||||
property rights:
|
||||
"""The rightward immediate children of the word, in the syntactic
|
||||
|
@ -432,8 +420,7 @@ cdef class Token:
|
|||
ptr -= 1
|
||||
nr_iter += 1
|
||||
if nr_iter >= 10000000:
|
||||
raise RuntimeError("Possibly infinite loop encountered "
|
||||
"while looking for token.rights")
|
||||
raise RuntimeError(Errors.E045.format(attr='token.rights'))
|
||||
tokens.reverse()
|
||||
for t in tokens:
|
||||
yield t
|
||||
|
|
|
@@ -3,6 +3,8 @@ from __future__ import unicode_literals

import functools

from ..errors import Errors


class Underscore(object):
doc_extensions = {}

@@ -23,7 +25,7 @@ class Underscore(object):

def __getattr__(self, name):
if name not in self._extensions:
raise AttributeError(name)
raise AttributeError(Errors.E046.format(name=name))
default, method, getter, setter = self._extensions[name]
if getter is not None:
return getter(self._obj)

@@ -34,7 +36,7 @@ class Underscore(object):

def __setattr__(self, name, value):
if name not in self._extensions:
raise AttributeError(name)
raise AttributeError(Errors.E047.format(name=name))
default, method, getter, setter = self._extensions[name]
if setter is not None:
return setter(self._obj, value)
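These Underscore getters and setters back the custom extension attributes checked above (E046/E047 for unregistered names, and E083 requiring exactly one of default, method, getter or setter on Doc.set_extension). A short sketch with a made-up attribute name:

import spacy
from spacy.tokens import Doc

nlp = spacy.blank('en')
Doc.set_extension('is_greeting', default=False)   # exactly one of default/method/getter/setter
doc = nlp(u'Hello world')
doc._.is_greeting = True                           # routed through Underscore.__setattr__
# doc._.not_registered                             # unregistered name -> AttributeError (E046)
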
|
||||
|
|
|
@ -11,8 +11,6 @@ import sys
|
|||
import textwrap
|
||||
import random
|
||||
from collections import OrderedDict
|
||||
import inspect
|
||||
import warnings
|
||||
from thinc.neural._classes.model import Model
|
||||
import functools
|
||||
import cytoolz
|
||||
|
@ -22,6 +20,7 @@ import numpy.random
|
|||
from .symbols import ORTH
|
||||
from .compat import cupy, CudaStream, path2str, basestring_, input_, unicode_
|
||||
from .compat import import_file
|
||||
from .errors import Errors
|
||||
|
||||
# Import these directly from Thinc, so that we're sure we always have the
|
||||
# same version.
|
||||
|
@ -50,8 +49,7 @@ def get_lang_class(lang):
|
|||
try:
|
||||
module = importlib.import_module('.lang.%s' % lang, 'spacy')
|
||||
except ImportError:
|
||||
msg = "Can't import language %s from spacy.lang."
|
||||
raise ImportError(msg % lang)
|
||||
raise ImportError(Errors.E048.format(lang=lang))
|
||||
LANGUAGES[lang] = getattr(module, module.__all__[0])
|
||||
return LANGUAGES[lang]
|
||||
|
||||
|
@ -108,7 +106,7 @@ def load_model(name, **overrides):
|
|||
"""
|
||||
data_path = get_data_path()
|
||||
if not data_path or not data_path.exists():
|
||||
raise IOError("Can't find spaCy data path: %s" % path2str(data_path))
|
||||
raise IOError(Errors.E049.format(path=path2str(data_path)))
|
||||
if isinstance(name, basestring_): # in data dir / shortcut
|
||||
if name in set([d.name for d in data_path.iterdir()]):
|
||||
return load_model_from_link(name, **overrides)
|
||||
|
@ -118,7 +116,7 @@ def load_model(name, **overrides):
|
|||
return load_model_from_path(Path(name), **overrides)
|
||||
elif hasattr(name, 'exists'): # Path or Path-like to model data
|
||||
return load_model_from_path(name, **overrides)
|
||||
raise IOError("Can't find model '%s'" % name)
|
||||
raise IOError(Errors.E050.format(name=name))
|
||||
|
||||
|
||||
def load_model_from_link(name, **overrides):
|
||||
|
@ -127,9 +125,7 @@ def load_model_from_link(name, **overrides):
|
|||
try:
|
||||
cls = import_file(name, path)
|
||||
except AttributeError:
|
||||
raise IOError(
|
||||
"Cant' load '%s'. If you're using a shortcut link, make sure it "
|
||||
"points to a valid package (not just a data directory)." % name)
|
||||
raise IOError(Errors.E051.format(name=name))
|
||||
return cls.load(**overrides)
|
||||
|
||||
|
||||
|
@ -173,8 +169,7 @@ def load_model_from_init_py(init_file, **overrides):
|
|||
data_dir = '%s_%s-%s' % (meta['lang'], meta['name'], meta['version'])
|
||||
data_path = model_path / data_dir
|
||||
if not model_path.exists():
|
||||
msg = "Can't find model directory: %s"
|
||||
raise ValueError(msg % path2str(data_path))
|
||||
raise IOError(Errors.E052.format(path=path2str(data_path)))
|
||||
return load_model_from_path(data_path, meta, **overrides)
|
||||
|
||||
|
||||
|
@ -186,16 +181,14 @@ def get_model_meta(path):
|
|||
"""
|
||||
model_path = ensure_path(path)
|
||||
if not model_path.exists():
|
||||
msg = "Can't find model directory: %s"
|
||||
raise ValueError(msg % path2str(model_path))
|
||||
raise IOError(Errors.E052.format(path=path2str(model_path)))
|
||||
meta_path = model_path / 'meta.json'
|
||||
if not meta_path.is_file():
|
||||
raise IOError("Could not read meta.json from %s" % meta_path)
|
||||
raise IOError(Errors.E053.format(path=meta_path))
|
||||
meta = read_json(meta_path)
|
||||
for setting in ['lang', 'name', 'version']:
|
||||
if setting not in meta or not meta[setting]:
|
||||
msg = "No valid '%s' setting found in model meta.json"
|
||||
raise ValueError(msg % setting)
|
||||
raise ValueError(Errors.E054.format(setting=setting))
|
||||
return meta
|
||||
|
||||
|
||||
|
@@ -339,13 +332,10 @@ def update_exc(base_exceptions, *addition_dicts):
for orth, token_attrs in additions.items():
if not all(isinstance(attr[ORTH], unicode_)
for attr in token_attrs):
msg = "Invalid ORTH value in exception: key='%s', orths='%s'"
raise ValueError(msg % (orth, token_attrs))
raise ValueError(Errors.E055.format(key=orth, orths=token_attrs))
described_orth = ''.join(attr[ORTH] for attr in token_attrs)
if orth != described_orth:
msg = ("Invalid tokenizer exception: ORTH values combined "
"don't match original string. key='%s', orths='%s'")
raise ValueError(msg % (orth, described_orth))
raise ValueError(Errors.E056.format(key=orth, orths=described_orth))
exc.update(additions)
exc = expand_exc(exc, "'", "’")
return exc
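The two checks above (E055/E056) require every ORTH value to be a unicode string and the ORTH pieces to concatenate back exactly to the exception key. A toy addition that satisfies both, merged into the English exceptions for illustration:

from spacy.attrs import ORTH, LEMMA
from spacy.util import update_exc
from spacy.lang.en.tokenizer_exceptions import TOKENIZER_EXCEPTIONS

extra = {
    u"dont": [{ORTH: u"do"}, {ORTH: u"nt", LEMMA: u"not"}],   # "do" + "nt" == "dont"
}
exc = update_exc(TOKENIZER_EXCEPTIONS, extra)
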
|
||||
|
@ -375,8 +365,7 @@ def expand_exc(excs, search, replace):
|
|||
|
||||
def normalize_slice(length, start, stop, step=None):
|
||||
if not (step is None or step == 1):
|
||||
raise ValueError("Stepped slices not supported in Span objects."
|
||||
"Try: list(tokens)[start:stop:step] instead.")
|
||||
raise ValueError(Errors.E057)
|
||||
if start is None:
|
||||
start = 0
|
||||
elif start < 0:
|
||||
|
@ -387,7 +376,6 @@ def normalize_slice(length, start, stop, step=None):
|
|||
elif stop < 0:
|
||||
stop += length
|
||||
stop = min(length, max(start, stop))
|
||||
assert 0 <= start <= stop <= length
|
||||
return start, stop
|
||||
|
||||
|
||||
|
@ -524,18 +512,6 @@ def from_disk(path, readers, exclude):
|
|||
return path
|
||||
|
||||
|
||||
def deprecated(message, filter='always'):
|
||||
"""Show a deprecation warning.
|
||||
|
||||
message (unicode): The message to display.
|
||||
filter (unicode): Filter value.
|
||||
"""
|
||||
stack = inspect.stack()[-1]
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter(filter, DeprecationWarning)
|
||||
warnings.warn_explicit(message, DeprecationWarning, stack[1], stack[2])
|
||||
|
||||
|
||||
def print_table(data, title=None):
|
||||
"""Print data in table format.
|
||||
|
||||
|
|
|
@ -14,6 +14,7 @@ from thinc.neural._classes.model import Model
|
|||
|
||||
from .strings cimport StringStore, hash_string
|
||||
from .compat import basestring_, path2str
|
||||
from .errors import Errors
|
||||
from . import util
|
||||
|
||||
from cython.operator cimport dereference as deref
|
||||
|
@ -114,7 +115,7 @@ cdef class Vectors:
|
|||
"""
|
||||
i = self.key2row[key]
|
||||
if i is None:
|
||||
raise KeyError(key)
|
||||
raise KeyError(Errors.E058.format(key=key))
|
||||
else:
|
||||
return self.data[i]
|
||||
|
||||
|
@ -215,7 +216,8 @@ cdef class Vectors:
|
|||
RETURNS: The requested key, keys, row or rows.
|
||||
"""
|
||||
if sum(arg is None for arg in (key, keys, row, rows)) != 3:
|
||||
raise ValueError("One (and only one) keyword arg must be set.")
|
||||
bad_kwargs = {'key': key, 'keys': keys, 'row': row, 'rows': rows}
|
||||
raise ValueError(Errors.E059.format(kwargs=bad_kwargs))
|
||||
xp = get_array_module(self.data)
|
||||
if key is not None:
|
||||
if isinstance(key, basestring_):
|
||||
|
@ -254,9 +256,9 @@ cdef class Vectors:
|
|||
row = self.key2row[key]
|
||||
elif row is None:
|
||||
if self.is_full:
|
||||
raise ValueError("Cannot add new key to vectors -- full")
|
||||
raise ValueError(Errors.E060.format(rows=self.data.shape[0],
|
||||
cols=self.data.shape[1]))
|
||||
row = deref(self._unset.begin())
|
||||
|
||||
self.key2row[key] = row
|
||||
if vector is not None:
|
||||
self.data[row] = vector
|
||||
|
@ -318,7 +320,7 @@ cdef class Vectors:
|
|||
width = int(dims)
|
||||
break
|
||||
else:
|
||||
raise IOError("Expected file named e.g. vectors.128.f.bin")
|
||||
raise IOError(Errors.E061.format(filename=path))
|
||||
bin_loc = path / 'vectors.{dims}.{dtype}.bin'.format(dims=dims,
|
||||
dtype=dtype)
|
||||
xp = get_array_module(self.data)
|
||||
|
|
|
@ -16,6 +16,7 @@ from .attrs cimport PROB, LANG, ORTH, TAG
|
|||
from .structs cimport SerializedLexemeC
|
||||
|
||||
from .compat import copy_reg, basestring_
|
||||
from .errors import Errors
|
||||
from .lemmatizer import Lemmatizer
|
||||
from .attrs import intify_attrs
|
||||
from .vectors import Vectors
|
||||
|
@@ -100,15 +101,9 @@ cdef class Vocab:
flag_id = bit
break
else:
raise ValueError(
"Cannot find empty bit for new lexical flag. All bits "
"between 0 and 63 are occupied. You can replace one by "
"specifying the flag_id explicitly, e.g. "
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
raise ValueError(Errors.E062)
elif flag_id >= 64 or flag_id < 1:
raise ValueError(
"Invalid value for flag_id: %d. Flag IDs must be between "
"1 and 63 (inclusive)" % flag_id)
raise ValueError(Errors.E063.format(value=flag_id))
for lex in self:
lex.set_flag(flag_id, flag_getter(lex.orth_))
self.lex_attr_getters[flag_id] = flag_getter
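The removed message documents the contract now captured by E062/E063: flag IDs must fall between 1 and 63, or spaCy picks a free bit itself. A small sketch with a made-up flag getter:

import spacy

nlp = spacy.blank('en')
COLOURS = set([u'red', u'green', u'blue'])

def is_colour(text):
    # Made-up lexical flag getter for illustration.
    return text.lower() in COLOURS

IS_COLOUR = nlp.vocab.add_flag(is_colour)     # spaCy assigns a free bit (1-63)
doc = nlp(u'I like red apples')
print(doc[2].check_flag(IS_COLOUR))           # True
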
|
||||
|
@ -127,8 +122,9 @@ cdef class Vocab:
|
|||
cdef size_t addr
|
||||
if lex != NULL:
|
||||
if lex.orth != self.strings[string]:
|
||||
raise LookupError.mismatched_strings(
|
||||
lex.orth, self.strings[string], string)
|
||||
raise KeyError(Errors.E064.format(string=lex.orth,
|
||||
orth=self.strings[string],
|
||||
orth_id=string))
|
||||
return lex
|
||||
else:
|
||||
return self._new_lexeme(mem, string)
|
||||
|
@ -171,7 +167,8 @@ cdef class Vocab:
|
|||
if not is_oov:
|
||||
key = hash_string(string)
|
||||
self._add_lex_to_vocab(key, lex)
|
||||
assert lex != NULL, string
|
||||
if lex == NULL:
|
||||
raise ValueError(Errors.E085.format(string=string))
|
||||
return lex
|
||||
|
||||
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1:
|
||||
|
@ -254,7 +251,7 @@ cdef class Vocab:
|
|||
width, you have to call this to change the size of the vectors.
|
||||
"""
|
||||
if width is not None and shape is not None:
|
||||
raise ValueError("Only one of width and shape can be specified")
|
||||
raise ValueError(Errors.E065.format(width=width, shape=shape))
|
||||
elif shape is not None:
|
||||
self.vectors = Vectors(shape=shape)
|
||||
else:
|
||||
|
@ -471,7 +468,10 @@ cdef class Vocab:
|
|||
if ptr == NULL:
|
||||
continue
|
||||
py_str = self.strings[lexeme.orth]
|
||||
assert self.strings[py_str] == lexeme.orth, (py_str, lexeme.orth)
|
||||
if self.strings[py_str] != lexeme.orth:
|
||||
raise ValueError(Errors.E086.format(string=py_str,
|
||||
orth_id=lexeme.orth,
|
||||
hash_id=self.strings[py_str]))
|
||||
key = hash_string(py_str)
|
||||
self._by_hash.set(key, lexeme)
|
||||
self._by_orth.set(lexeme.orth, lexeme)
|
||||
|
@ -512,16 +512,3 @@ def unpickle_vocab(sstore, vectors, morphology, data_dir,
|
|||
|
||||
|
||||
copy_reg.pickle(Vocab, pickle_vocab, unpickle_vocab)
|
||||
|
||||
|
||||
class LookupError(Exception):
|
||||
@classmethod
|
||||
def mismatched_strings(cls, id_, id_string, original_string):
|
||||
return cls(
|
||||
"Error fetching a Lexeme from the Vocab. When looking up a "
|
||||
"string, the lexeme returned had an orth ID that did not match "
|
||||
"the query string. This means that the cached lexeme structs are "
|
||||
"mismatched to the string encoding table. The mismatched:\n"
|
||||
"Query string: {}\n"
|
||||
"Orth cached: {}\n"
|
||||
"Orth ID: {}".format(repr(original_string), repr(id_string), id_))