2018-04-03 16:50:31 +03:00
|
|
|
# coding: utf8
|
|
|
|
from __future__ import unicode_literals
|
|
|
|
|
|
|
|
import os
|
|
|
|
import warnings
|
|
|
|
import inspect
|
|
|
|
|
|
|
|
|
|
|
|
def add_codes(err_cls):
    """Add error codes to string messages via class attribute names.

    err_cls (type): Class whose attributes (e.g. `E001`) hold message strings.
    RETURNS (object): An instance whose attribute lookups return the message
        stored on `err_cls`, prefixed with the attribute name in square
        brackets, e.g. "[E001] ...".
    """

    class ErrorsWithCodes(object):
        def __getattribute__(self, code):
            # Dunder names (__class__, __doc__, ...) are Python system
            # attributes, not message codes. Resolve them with the default
            # lookup instead of formatting them as "[__x__] ..." strings.
            if code.startswith("__"):
                return object.__getattribute__(self, code)
            msg = getattr(err_cls, code)
            return "[{code}] {msg}".format(code=code, msg=msg)

    return ErrorsWithCodes()
|
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
# fmt: off
|
|
|
|
|
2018-04-03 16:50:31 +03:00
|
|
|
@add_codes
class Warnings(object):
    """Warning message templates, one class attribute per warning code.

    The class is passed through `add_codes`, so looking up an attribute such
    as `Warnings.W001` returns the message prefixed with its code. Messages
    containing `{placeholders}` are meant to be filled in with `str.format`.
    """

    W001 = ("As of spaCy v2.0, the keyword argument `path=` is deprecated. "
            "You can now call spacy.load with the path as its first argument, "
            "and the model's meta.json will be used to determine the language "
            "to load. For example:\nnlp = spacy.load('{path}')")
    W002 = ("Tokenizer.from_list is now deprecated. Create a new Doc object "
            "instead and pass in the strings as the `words` keyword argument, "
            "for example:\nfrom spacy.tokens import Doc\n"
            "doc = Doc(nlp.vocab, words=[...])")
    W003 = ("Positional arguments to Doc.merge are deprecated. Instead, use "
            "the keyword arguments, for example tag=, lemma= or ent_type=.")
    W004 = ("No text fixing enabled. Run `pip install ftfy` to enable fixing "
            "using ftfy.fix_text if necessary.")
    W005 = ("Doc object not parsed. This means displaCy won't be able to "
            "generate a dependency visualization for it. Make sure the Doc "
            "was processed with a model that supports dependency parsing, and "
            "not just a language class like `English()`. For more info, see "
            "the docs:\nhttps://spacy.io/usage/models")
    W006 = ("No entities to visualize found in Doc object. If this is "
            "surprising to you, make sure the Doc was processed using a model "
            "that supports named entity recognition, and check the `doc.ents` "
            "property manually if necessary.")
    W007 = ("The model you're using has no word vectors loaded, so the result "
            "of the {obj}.similarity method will be based on the tagger, "
            "parser and NER, which may not give useful similarity judgements. "
            "This may happen if you're using one of the small models, e.g. "
            "`en_core_web_sm`, which don't ship with word vectors and only "
            "use context-sensitive tensors. You can always add your own word "
            "vectors, or use one of the larger models instead if available.")
    W008 = ("Evaluating {obj}.similarity based on empty vectors.")
    W009 = ("Custom factory '{name}' provided by entry points of another "
            "package overwrites built-in factory.")
    W010 = ("As of v2.1.0, the PhraseMatcher doesn't have a phrase length "
            "limit anymore, so the max_length argument is now deprecated.")
    W011 = ("It looks like you're calling displacy.serve from within a "
            "Jupyter notebook or a similar environment. This likely means "
            "you're already running a local web server, so there's no need to "
            "make displaCy start another one. Instead, you should be able to "
            "replace displacy.serve with displacy.render to show the "
            "visualization.")
    W012 = ("A Doc object you're adding to the PhraseMatcher for pattern "
            "'{key}' is parsed and/or tagged, but to match on '{attr}', you "
            "don't actually need this information. This means that creating "
            "the patterns is potentially much slower, because all pipeline "
            "components are applied. To only create tokenized Doc objects, "
            "try using `nlp.make_doc(text)` or process all texts as a stream "
            "using `list(nlp.tokenizer.pipe(all_texts))`.")
    W013 = ("As of v2.1.0, {obj}.merge is deprecated. Please use the more "
            "efficient and less error-prone Doc.retokenize context manager "
            "instead.")
    # W014: the original wording dropped the word "deprecated" and garbled
    # the final clause; repaired so the message reads as a full sentence.
    W014 = ("As of v2.1.0, the `disable` keyword argument on the serialization "
            "methods is deprecated and should be replaced with `exclude`. This "
            "makes it consistent with the other serializable objects.")
    W015 = ("As of v2.1.0, the use of keyword arguments to exclude fields from "
            "being serialized or deserialized is deprecated. Please use the "
            "`exclude` argument instead. For example: exclude=['{arg}'].")
    # W016: removed the dangling "on the" left over in the original message.
    W016 = ("The keyword argument `n_threads` is now deprecated, as "
            "the v2.x models cannot release the global interpreter lock. "
            "Future versions may introduce a `n_process` argument for "
            "parallel inference via multiprocessing.")
    W017 = ("Alias '{alias}' already exists in the Knowledge base.")
    W018 = ("Entity '{entity}' already exists in the Knowledge base.")
    W019 = ("Changing vectors name from {old} to {new}, to avoid clash with "
            "previously loaded vectors. See Issue #3853.")
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
|
|
|
@add_codes
|
|
|
|
class Errors(object):
|
|
|
|
E001 = ("No component '{name}' found in pipeline. Available names: {opts}")
|
|
|
|
E002 = ("Can't find factory for '{name}'. This usually happens when spaCy "
|
|
|
|
"calls `nlp.create_pipe` with a component name that's not built "
|
|
|
|
"in - for example, when constructing the pipeline from a model's "
|
|
|
|
"meta.json. If you're using a custom component, you can write to "
|
|
|
|
"`Language.factories['{name}']` or remove it from the model meta "
|
|
|
|
"and add it via `nlp.add_pipe` instead.")
|
|
|
|
E003 = ("Not a valid pipeline component. Expected callable, but "
|
|
|
|
"got {component} (name: '{name}').")
|
|
|
|
E004 = ("If you meant to add a built-in component, use `create_pipe`: "
|
|
|
|
"`nlp.add_pipe(nlp.create_pipe('{component}'))`")
|
|
|
|
E005 = ("Pipeline component '{name}' returned None. If you're using a "
|
|
|
|
"custom component, maybe you forgot to return the processed Doc?")
|
|
|
|
E006 = ("Invalid constraints. You can only set one of the following: "
|
|
|
|
"before, after, first, last.")
|
|
|
|
E007 = ("'{name}' already exists in pipeline. Existing names: {opts}")
|
|
|
|
E008 = ("Some current components would be lost when restoring previous "
|
|
|
|
"pipeline state. If you added components after calling "
|
|
|
|
"`nlp.disable_pipes()`, you should remove them explicitly with "
|
|
|
|
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
|
|
|
"the new components: {names}")
|
|
|
|
E009 = ("The `update` method expects same number of docs and golds, but "
|
|
|
|
"got: {n_docs} docs, {n_golds} golds.")
|
|
|
|
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
|
|
|
"a model installed or loaded, or because your model doesn't "
|
|
|
|
"include word vectors. For more info, see the docs:\n"
|
|
|
|
"https://spacy.io/usage/models")
|
|
|
|
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
|
|
|
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
|
|
|
E013 = ("Error selecting action in matcher")
|
|
|
|
E014 = ("Uknown tag ID: {tag}")
|
|
|
|
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
|
|
|
"`force=True` to overwrite.")
|
|
|
|
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
|
|
|
"tag, ent, dep_tag_offset, ent_tag.")
|
|
|
|
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
|
2019-08-13 16:38:59 +03:00
|
|
|
E018 = ("Can't retrieve string for hash '{hash_value}'. This usually refers "
|
|
|
|
"to an issue with the `Vocab` or `StringStore`.")
|
2018-04-03 16:50:31 +03:00
|
|
|
E019 = ("Can't create transition with unknown action ID: {action}. Action "
|
|
|
|
"IDs are enumerated in spacy/syntax/{src}.pyx.")
|
|
|
|
E020 = ("Could not find a gold-standard action to supervise the "
|
|
|
|
"dependency parser. The tree is non-projective (i.e. it has "
|
|
|
|
"crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
|
|
|
|
"The ArcEager transition system only supports projective trees. "
|
|
|
|
"To learn non-projective representations, transform the data "
|
|
|
|
"before training and after parsing. Either pass "
|
|
|
|
"`make_projective=True` to the GoldParse class, or use "
|
|
|
|
"spacy.syntax.nonproj.preprocess_training_data.")
|
|
|
|
E021 = ("Could not find a gold-standard action to supervise the "
|
|
|
|
"dependency parser. The GoldParse was projective. The transition "
|
|
|
|
"system has {n_actions} actions. State at failure: {state}")
|
|
|
|
E022 = ("Could not find a transition with the name '{name}' in the NER "
|
|
|
|
"model.")
|
|
|
|
E023 = ("Error cleaning up beam: The same state occurred twice at "
|
|
|
|
"memory address {addr} and position {i}.")
|
|
|
|
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
|
2019-06-01 15:37:27 +03:00
|
|
|
"this means that the model can't be updated in a way that's valid "
|
|
|
|
"and satisfies the correct annotations specified in the GoldParse. "
|
|
|
|
"For example, are all labels added to the model? If you're "
|
|
|
|
"training a named entity recognizer, also make sure that none of "
|
|
|
|
"your annotated entity spans have leading or trailing whitespace. "
|
|
|
|
"You can also use the experimental `debug-data` command to "
|
|
|
|
"validate your JSON-formatted training data. For details, run:\n"
|
|
|
|
"python -m spacy debug-data --help")
|
2018-04-03 16:50:31 +03:00
|
|
|
E025 = ("String is too long: {length} characters. Max is 2**30.")
|
|
|
|
E026 = ("Error accessing token at position {i}: out of bounds in Doc of "
|
|
|
|
"length {length}.")
|
|
|
|
E027 = ("Arguments 'words' and 'spaces' should be sequences of the same "
|
|
|
|
"length, or 'spaces' should be left default at None. spaces "
|
|
|
|
"should be a sequence of booleans, with True meaning that the "
|
|
|
|
"word owns a ' ' character following it.")
|
|
|
|
E028 = ("orths_and_spaces expects either a list of unicode string or a "
|
|
|
|
"list of (unicode, bool) tuples. Got bytes instance: {value}")
|
|
|
|
E029 = ("noun_chunks requires the dependency parse, which requires a "
|
|
|
|
"statistical model to be installed and loaded. For more info, see "
|
|
|
|
"the documentation:\nhttps://spacy.io/usage/models")
|
|
|
|
E030 = ("Sentence boundaries unset. You can add the 'sentencizer' "
|
|
|
|
"component to the pipeline with: "
|
|
|
|
"nlp.add_pipe(nlp.create_pipe('sentencizer')) "
|
|
|
|
"Alternatively, add the dependency parser, or set sentence "
|
|
|
|
"boundaries by setting doc[i].is_sent_start.")
|
|
|
|
E031 = ("Invalid token: empty string ('') at position {i}.")
|
|
|
|
E032 = ("Conflicting attributes specified in doc.from_array(): "
|
|
|
|
"(HEAD, SENT_START). The HEAD attribute currently sets sentence "
|
|
|
|
"boundaries implicitly, based on the tree structure. This means "
|
|
|
|
"the HEAD attribute would potentially override the sentence "
|
|
|
|
"boundaries set by SENT_START.")
|
|
|
|
E033 = ("Cannot load into non-empty Doc of length {length}.")
|
|
|
|
E034 = ("Doc.merge received {n_args} non-keyword arguments. Expected "
|
|
|
|
"either 3 arguments (deprecated), or 0 (use keyword arguments).\n"
|
|
|
|
"Arguments supplied:\n{args}\nKeyword arguments:{kwargs}")
|
|
|
|
E035 = ("Error creating span with start {start} and end {end} for Doc of "
|
|
|
|
"length {length}.")
|
|
|
|
E036 = ("Error calculating span: Can't find a token starting at character "
|
|
|
|
"offset {start}.")
|
|
|
|
E037 = ("Error calculating span: Can't find a token ending at character "
|
|
|
|
"offset {end}.")
|
|
|
|
E038 = ("Error finding sentence for span. Infinite loop detected.")
|
|
|
|
E039 = ("Array bounds exceeded while searching for root word. This likely "
|
|
|
|
"means the parse tree is in an invalid state. Please report this "
|
|
|
|
"issue here: http://github.com/explosion/spaCy/issues")
|
|
|
|
E040 = ("Attempt to access token at {i}, max length {max_length}.")
|
|
|
|
E041 = ("Invalid comparison operator: {op}. Likely a Cython bug?")
|
|
|
|
E042 = ("Error accessing doc[{i}].nbor({j}), for doc of length {length}.")
|
|
|
|
E043 = ("Refusing to write to token.sent_start if its document is parsed, "
|
|
|
|
"because this may cause inconsistent state.")
|
|
|
|
E044 = ("Invalid value for token.sent_start: {value}. Must be one of: "
|
|
|
|
"None, True, False")
|
|
|
|
E045 = ("Possibly infinite loop encountered while looking for {attr}.")
|
|
|
|
E046 = ("Can't retrieve unregistered extension attribute '{name}'. Did "
|
|
|
|
"you forget to call the `set_extension` method?")
|
|
|
|
E047 = ("Can't assign a value to unregistered extension attribute "
|
|
|
|
"'{name}'. Did you forget to call the `set_extension` method?")
|
2019-02-13 18:52:25 +03:00
|
|
|
E048 = ("Can't import language {lang} from spacy.lang: {err}")
|
2018-04-03 16:50:31 +03:00
|
|
|
E049 = ("Can't find spaCy data directory: '{path}'. Check your "
|
|
|
|
"installation and permissions, or use spacy.util.set_data_path "
|
|
|
|
"to customise the location if necessary.")
|
|
|
|
E050 = ("Can't find model '{name}'. It doesn't seem to be a shortcut "
|
|
|
|
"link, a Python package or a valid path to a data directory.")
|
|
|
|
E051 = ("Cant' load '{name}'. If you're using a shortcut link, make sure "
|
|
|
|
"it points to a valid package (not just a data directory).")
|
|
|
|
E052 = ("Can't find model directory: {path}")
|
|
|
|
E053 = ("Could not read meta.json from {path}")
|
|
|
|
E054 = ("No valid '{setting}' setting found in model meta.json.")
|
|
|
|
E055 = ("Invalid ORTH value in exception:\nKey: {key}\nOrths: {orths}")
|
|
|
|
E056 = ("Invalid tokenizer exception: ORTH values combined don't match "
|
|
|
|
"original string.\nKey: {key}\nOrths: {orths}")
|
|
|
|
E057 = ("Stepped slices not supported in Span objects. Try: "
|
|
|
|
"list(tokens)[start:stop:step] instead.")
|
|
|
|
E058 = ("Could not retrieve vector for key {key}.")
|
|
|
|
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
|
|
|
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
|
|
|
"({rows}, {cols}).")
|
|
|
|
E061 = ("Bad file name: {filename}. Example of a valid file name: "
|
|
|
|
"'vectors.128.f.bin'")
|
|
|
|
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
|
|
|
|
"and 63 are occupied. You can replace one by specifying the "
|
|
|
|
"`flag_id` explicitly, e.g. "
|
|
|
|
"`nlp.vocab.add_flag(your_func, flag_id=IS_ALPHA`.")
|
|
|
|
E063 = ("Invalid value for flag_id: {value}. Flag IDs must be between 1 "
|
|
|
|
"and 63 (inclusive).")
|
|
|
|
E064 = ("Error fetching a Lexeme from the Vocab. When looking up a "
|
|
|
|
"string, the lexeme returned had an orth ID that did not match "
|
|
|
|
"the query string. This means that the cached lexeme structs are "
|
|
|
|
"mismatched to the string encoding table. The mismatched:\n"
|
|
|
|
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
|
|
|
|
E065 = ("Only one of the vector table's width and shape can be specified. "
|
|
|
|
"Got width {width} and shape {shape}.")
|
|
|
|
E066 = ("Error creating model helper for extracting columns. Can only "
|
|
|
|
"extract columns by positive integer. Got: {value}.")
|
|
|
|
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
|
|
|
|
"an entity) without a preceding 'B' (beginning of an entity). "
|
|
|
|
"Tag sequence:\n{tags}")
|
|
|
|
E068 = ("Invalid BILUO tag: '{tag}'.")
|
|
|
|
E069 = ("Invalid gold-standard parse tree. Found cycle between word "
|
2019-08-15 19:08:28 +03:00
|
|
|
"IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
|
|
|
|
"with tokens: {doc_tokens}.")
|
2018-04-03 16:50:31 +03:00
|
|
|
E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
|
|
|
|
"does not align with number of annotations ({n_annots}).")
|
|
|
|
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
|
|
|
|
"match the one in the vocab ({vocab_orth}).")
|
|
|
|
E072 = ("Error serializing lexeme: expected data length {length}, "
|
|
|
|
"got {bad_length}.")
|
|
|
|
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
|
|
|
|
"are of length {length}. You can use `vocab.reset_vectors` to "
|
|
|
|
"clear the existing vectors and resize the table.")
|
|
|
|
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
|
|
|
"to end with the attribute {attr}. Got: {bad_attr}.")
|
|
|
|
E075 = ("Error accepting match: length ({length}) > maximum length "
|
|
|
|
"({max_len}).")
|
|
|
|
E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
|
|
|
|
"has {words} words.")
|
|
|
|
E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
|
|
|
|
"equal number of GoldParse objects ({n_golds}) in batch.")
|
|
|
|
E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
|
|
|
|
"not equal number of words in GoldParse ({words_gold}).")
|
|
|
|
E079 = ("Error computing states in beam: number of predicted beams "
|
|
|
|
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
|
|
|
E080 = ("Duplicate state found in beam: {key}.")
|
|
|
|
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
|
|
|
"does not equal number of losses ({losses}).")
|
|
|
|
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
|
|
|
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
|
|
|
"match.")
|
2018-04-03 19:30:17 +03:00
|
|
|
E083 = ("Error setting extension: only one of `default`, `method`, or "
|
|
|
|
"`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
|
2018-04-03 16:50:31 +03:00
|
|
|
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
|
|
|
|
E085 = ("Can't create lexeme for string '{string}'.")
|
|
|
|
E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
|
|
|
|
"not match hash {hash_id} in StringStore.")
|
|
|
|
E087 = ("Unknown displaCy style: {style}.")
|
|
|
|
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
|
|
|
"v2.x parser and NER models require roughly 1GB of temporary "
|
|
|
|
"memory per 100,000 characters in the input. This means long "
|
|
|
|
"texts may cause memory allocation errors. If you're not using "
|
|
|
|
"the parser or NER, it's probably safe to increase the "
|
|
|
|
"`nlp.max_length` limit. The limit is in number of characters, so "
|
|
|
|
"you can check whether your inputs are too long by checking "
|
|
|
|
"`len(text)`.")
|
2018-04-03 19:30:17 +03:00
|
|
|
E089 = ("Extensions can't have a setter argument without a getter "
|
|
|
|
"argument. Check the keyword arguments on `set_extension`.")
|
|
|
|
E090 = ("Extension '{name}' already exists on {obj}. To overwrite the "
|
|
|
|
"existing extension, set `force=True` on `{obj}.set_extension`.")
|
|
|
|
E091 = ("Invalid extension attribute {name}: expected callable or None, "
|
|
|
|
"but got: {value}")
|
2018-04-03 22:40:29 +03:00
|
|
|
E092 = ("Could not find or assign name for word vectors. Ususally, the "
|
|
|
|
"name is read from the model's meta.json in vector.name. "
|
|
|
|
"Alternatively, it is built from the 'lang' and 'name' keys in "
|
|
|
|
"the meta.json. Vector names are required to avoid issue #1660.")
|
|
|
|
E093 = ("token.ent_iob values make invalid sequence: I without B\n{seq}")
|
2018-04-10 22:26:37 +03:00
|
|
|
E094 = ("Error reading line {line_num} in vectors file {loc}.")
|
2018-05-20 16:13:37 +03:00
|
|
|
E095 = ("Can't write to frozen dictionary. This is likely an internal "
|
|
|
|
"error. Are you writing to a default function argument?")
|
2018-06-25 15:55:16 +03:00
|
|
|
E096 = ("Invalid object passed to displaCy: Can only visualize Doc or "
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
"Span objects, or dicts if set to manual=True.")
|
2018-07-18 20:43:16 +03:00
|
|
|
E097 = ("Invalid pattern: expected token pattern (list of dicts) or "
|
|
|
|
"phrase pattern (string) but got:\n{pattern}")
|
2018-10-30 01:21:39 +03:00
|
|
|
E098 = ("Invalid pattern specified: expected both SPEC and PATTERN.")
|
|
|
|
E099 = ("First node of pattern should be a root node. The root should "
|
|
|
|
"only contain NODE_NAME.")
|
|
|
|
E100 = ("Nodes apart from the root should contain NODE_NAME, NBOR_NAME and "
|
|
|
|
"NBOR_RELOP.")
|
|
|
|
E101 = ("NODE_NAME should be a new node and NBOR_NAME should already have "
|
|
|
|
"have been declared in previous edges.")
|
2019-02-24 17:11:28 +03:00
|
|
|
E102 = ("Can't merge non-disjoint spans. '{token}' is already part of "
|
|
|
|
"tokens to merge.")
|
💫 Port master changes over to develop (#2979)
* Create aryaprabhudesai.md (#2681)
* Update _install.jade (#2688)
Typo fix: "models" -> "model"
* Add FAC to spacy.explain (resolves #2706)
* Remove docstrings for deprecated arguments (see #2703)
* When calling getoption() in conftest.py, pass a default option (#2709)
* When calling getoption() in conftest.py, pass a default option
This is necessary to allow testing an installed spacy by running:
pytest --pyargs spacy
* Add contributor agreement
* update bengali token rules for hyphen and digits (#2731)
* Less norm computations in token similarity (#2730)
* Less norm computations in token similarity
* Contributor agreement
* Remove ')' for clarity (#2737)
Sorry, don't mean to be nitpicky, I just noticed this when going through the CLI and thought it was a quick fix. That said, if this was intention than please let me know.
* added contributor agreement for mbkupfer (#2738)
* Basic support for Telugu language (#2751)
* Lex _attrs for polish language (#2750)
* Signed spaCy contributor agreement
* Added polish version of english lex_attrs
* Introduces a bulk merge function, in order to solve issue #653 (#2696)
* Fix comment
* Introduce bulk merge to increase performance on many span merges
* Sign contributor agreement
* Implement pull request suggestions
* Describe converters more explicitly (see #2643)
* Add multi-threading note to Language.pipe (resolves #2582) [ci skip]
* Fix formatting
* Fix dependency scheme docs (closes #2705) [ci skip]
* Don't set stop word in example (closes #2657) [ci skip]
* Add words to portuguese language _num_words (#2759)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Update Indonesian model (#2752)
* adding e-KTP in tokenizer exceptions list
* add exception token
* removing lines with containing space as it won't matter since we use .split() method in the end, added new tokens in exception
* add tokenizer exceptions list
* combining base_norms with norm_exceptions
* adding norm_exception
* fix double key in lemmatizer
* remove unused import on punctuation.py
* reformat stop_words to reduce number of lines, improve readibility
* updating tokenizer exception
* implement is_currency for lang/id
* adding orth_first_upper in tokenizer_exceptions
* update the norm_exception list
* remove bunch of abbreviations
* adding contributors file
* Fixed spaCy+Keras example (#2763)
* bug fixes in keras example
* created contributor agreement
* Adding French hyphenated first name (#2786)
* Fix typo (closes #2784)
* Fix typo (#2795) [ci skip]
Fixed typo on line 6 "regcognizer --> recognizer"
* Adding basic support for Sinhala language. (#2788)
* adding Sinhala language package, stop words, examples and lex_attrs.
* Adding contributor agreement
* Updating contributor agreement
* Also include lowercase norm exceptions
* Fix error (#2802)
* Fix error
ValueError: cannot resize an array that references or is referenced
by another array in this way. Use the resize function
* added spaCy Contributor Agreement
* Add charlax's contributor agreement (#2805)
* agreement of contributor, may I introduce a tiny pl languge contribution (#2799)
* Contributors agreement
* Contributors agreement
* Contributors agreement
* Add jupyter=True to displacy.render in documentation (#2806)
* Revert "Also include lowercase norm exceptions"
This reverts commit 70f4e8adf37cfcfab60be2b97d6deae949b30e9e.
* Remove deprecated encoding argument to msgpack
* Set up dependency tree pattern matching skeleton (#2732)
* Fix bug when too many entity types. Fixes #2800
* Fix Python 2 test failure
* Require older msgpack-numpy
* Restore encoding arg on msgpack-numpy
* Try to fix version pin for msgpack-numpy
* Update Portuguese Language (#2790)
* Add words to portuguese language _num_words
* Add words to portuguese language _num_words
* Portuguese - Add/remove stopwords, fix tokenizer, add currency symbols
* Extended punctuation and norm_exceptions in the Portuguese language
* Correct error in spacy universe docs concerning spacy-lookup (#2814)
* Update Keras Example for (Parikh et al, 2016) implementation (#2803)
* bug fixes in keras example
* created contributor agreement
* baseline for Parikh model
* initial version of parikh 2016 implemented
* tested asymmetric models
* fixed grevious error in normalization
* use standard SNLI test file
* begin to rework parikh example
* initial version of running example
* start to document the new version
* start to document the new version
* Update Decompositional Attention.ipynb
* fixed calls to similarity
* updated the README
* import sys package duh
* simplified indexing on mapping word to IDs
* stupid python indent error
* added code from https://github.com/tensorflow/tensorflow/issues/3388 for tf bug workaround
* Fix typo (closes #2815) [ci skip]
* Update regex version dependency
* Set version to 2.0.13.dev3
* Skip seemingly problematic test
* Remove problematic test
* Try previous version of regex
* Revert "Remove problematic test"
This reverts commit bdebbef45552d698d390aa430b527ee27830f11b.
* Unskip test
* Try older version of regex
* 💫 Update training examples and use minibatching (#2830)
<!--- Provide a general summary of your changes in the title. -->
## Description
Update the training examples in `/examples/training` to show usage of spaCy's `minibatch` and `compounding` helpers ([see here](https://spacy.io/usage/training#tips-batch-size) for details). The lack of batching in the examples has caused some confusion in the past, especially for beginners who would copy-paste the examples, update them with large training sets and experienced slow and unsatisfying results.
### Types of change
enhancements
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Visual C++ link updated (#2842) (closes #2841) [ci skip]
* New landing page
* Add contribution agreement
* Correcting lang/ru/examples.py (#2845)
* Correct some grammatical inaccuracies in lang\ru\examples.py; filled Contributor Agreement
* Correct some grammatical inaccuracies in lang\ru\examples.py
* Move contributor agreement to separate file
* Set version to 2.0.13.dev4
* Add Persian(Farsi) language support (#2797)
* Also include lowercase norm exceptions
* Remove in favour of https://github.com/explosion/spaCy/graphs/contributors
* Rule-based French Lemmatizer (#2818)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
Add a rule-based French Lemmatizer following the english one and the excellent PR for [greek language optimizations](https://github.com/explosion/spaCy/pull/2558) to adapt the Lemmatizer class.
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
- Lemma dictionary used can be found [here](http://infolingu.univ-mlv.fr/DonneesLinguistiques/Dictionnaires/telechargement.html), I used the XML version.
- Add several files containing exhaustive list of words for each part of speech
- Add some lemma rules
- Add POS that are not checked in the standard Lemmatizer, i.e PRON, DET, ADV and AUX
- Modify the Lemmatizer class to check in lookup table as a last resort if POS not mentioned
- Modify the lemmatize function to check in lookup table as a last resort
- Init files are updated so the model can support all the functionalities mentioned above
- Add words to tokenizer_exceptions_list.py in respect to regex used in tokenizer_exceptions.py
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [X] I have submitted the spaCy Contributor Agreement.
- [X] I ran the tests, and all new and existing tests passed.
- [X] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Set version to 2.0.13
* Fix formatting and consistency
* Update docs for new version [ci skip]
* Increment version [ci skip]
* Add info on wheels [ci skip]
* Adding "This is a sentence" example to Sinhala (#2846)
* Add wheels badge
* Update badge [ci skip]
* Update README.rst [ci skip]
* Update murmurhash pin
* Increment version to 2.0.14.dev0
* Update GPU docs for v2.0.14
* Add wheel to setup_requires
* Import prefer_gpu and require_gpu functions from Thinc
* Add tests for prefer_gpu() and require_gpu()
* Update requirements and setup.py
* Workaround bug in thinc require_gpu
* Set version to v2.0.14
* Update push-tag script
* Unhack prefer_gpu
* Require thinc 6.10.6
* Update prefer_gpu and require_gpu docs [ci skip]
* Fix specifiers for GPU
* Set version to 2.0.14.dev1
* Set version to 2.0.14
* Update Thinc version pin
* Increment version
* Fix msgpack-numpy version pin
* Increment version
* Update version to 2.0.16
* Update version [ci skip]
* Redundant ')' in the Stop words' example (#2856)
<!--- Provide a general summary of your changes in the title. -->
## Description
<!--- Use this section to describe your changes. If your changes required
testing, include information about the testing environment and the tests you
ran. If your test fixes a bug reported in an issue, don't forget to include the
issue number. If your PR is still a work in progress, that's totally fine – just
include a note to let us know. -->
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [ ] I have submitted the spaCy Contributor Agreement.
- [ ] I ran the tests, and all new and existing tests passed.
- [ ] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Documentation improvement regarding joblib and SO (#2867)
Some documentation improvements
## Description
1. Fixed the dead URL to joblib
2. Fixed Stack Overflow brand name (with space)
### Types of change
Documentation
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* raise error when setting overlapping entities as doc.ents (#2880)
* Fix out-of-bounds access in NER training
The helper method state.B(1) gets the index of the first token of the
buffer, or -1 if no such token exists. Normally this is safe because we
pass this to functions like state.safe_get(), which returns an empty
token. Here we used it directly as an array index, which is not okay!
This error may have been the cause of out-of-bounds access errors during
training. Similar errors may still be around, so must be hunted down.
Hunting this one down took a long time...I printed out values across
training runs and diffed, looking for points of divergence between
runs, when no randomness should be allowed.
* Change PyThaiNLP Url (#2876)
* Fix missing comma
* Add example showing a fix-up rule for space entities
* Set version to 2.0.17.dev0
* Update regex version
* Revert "Update regex version"
This reverts commit 62358dd867d15bc6a475942dff34effba69dd70a.
* Try setting older regex version, to align with conda
* Set version to 2.0.17
* Add spacy-js to universe [ci-skip]
* Add spacy-raspberry to universe (closes #2889)
* Add script to validate universe json [ci skip]
* Removed space in docs + added contributor info (#2909)
* - removed unneeded space in documentation
* - added contributor info
* Allow input text of length up to max_length, inclusive (#2922)
* Include universe spec for spacy-wordnet component (#2919)
* feat: include universe spec for spacy-wordnet component
* chore: include spaCy contributor agreement
* Minor formatting changes [ci skip]
* Fix image [ci skip]
Twitter URL doesn't work on live site
* Check if the word is in one of the regular lists specific to each POS (#2886)
* 💫 Create random IDs for SVGs to prevent ID clashes (#2927)
Resolves #2924.
## Description
Fixes problem where multiple visualizations in Jupyter notebooks would have clashing arc IDs, resulting in weirdly positioned arc labels. Generating a random ID prefix so even identical parses won't receive the same IDs for consistency (even if effect of ID clash isn't noticeable here.)
### Types of change
bug fix
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix typo [ci skip]
* fixes symbolic link on py3 and windows (#2949)
* fixes symbolic link on py3 and windows
during setup of spacy using command
python -m spacy link en_core_web_sm en
closes #2948
* Update spacy/compat.py
Co-Authored-By: cicorias <cicorias@users.noreply.github.com>
* Fix formatting
* Update universe [ci skip]
* Catalan Language Support (#2940)
* Catalan language Support
* Adding Catalan to documentation
* Sort languages alphabetically [ci skip]
* Update tests for pytest 4.x (#2965)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Replace marks in params for pytest 4.0 compat ([see here](https://docs.pytest.org/en/latest/deprecations.html#marks-in-pytest-mark-parametrize))
- [x] Un-xfail passing tests (some fixes in a recent update resolved a bunch of issues, but tests were apparently never updated here)
### Types of change
<!-- What type of change does your PR cover? Is it a bug fix, an enhancement
or new feature, or a change to the documentation? -->
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
* Fix regex pin to harmonize with conda (#2964)
* Update README.rst
* Fix bug where Vocab.prune_vector did not use 'batch_size' (#2977)
Fixes #2976
* Fix typo
* Fix typo
* Remove duplicate file
* Require thinc 7.0.0.dev2
Fixes bug in gpu_ops that would use cupy instead of numpy on CPU
* Add missing import
* Fix error IDs
* Fix tests
2018-11-29 18:30:29 +03:00
|
|
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A token"
|
|
|
|
" can only be part of one entity, so make sure the entities you're "
|
|
|
|
"setting don't overlap.")
|
2018-11-30 22:16:14 +03:00
|
|
|
E104 = ("Can't find JSON schema for '{name}'.")
|
|
|
|
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
2019-02-08 20:09:23 +03:00
|
|
|
"Doc.to_json() instead or write your own function.")
|
2018-11-30 22:16:14 +03:00
|
|
|
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
|
|
|
"settings: {opts}")
|
|
|
|
E107 = ("Value of doc._.{attr} is not JSON-serializable: {value}")
|
2018-11-30 23:22:40 +03:00
|
|
|
# Raised for the deprecated `sbd` pipe name. The suggested call uses
# `nlp.create_pipe`, the same method the E135 message refers to; the
# previous text pointed at `create_pipeline`.
E108 = ("As of spaCy v2.1, the pipe name `sbd` has been deprecated "
        "in favor of the pipe name `sentencizer`, which does the same "
        "thing. For example, use `nlp.create_pipe('sentencizer')`")
|
2018-12-20 17:54:53 +03:00
|
|
|
E109 = ("Model for component '{name}' not initialized. Did you forget to load "
|
|
|
|
"a model, or forget to call begin_training()?")
|
2018-12-20 19:32:04 +03:00
|
|
|
E110 = ("Invalid displaCy render wrapper. Expected callable, got: {obj}")
|
2019-02-13 13:27:04 +03:00
|
|
|
E111 = ("Pickling a token is not supported, because tokens are only views "
|
|
|
|
"of the parent Doc and can't exist on their own. A pickled token "
|
|
|
|
"would always have to include its Doc and Vocab, which has "
|
|
|
|
"practically no advantage over pickling the parent Doc directly. "
|
|
|
|
"So instead of pickling the token, pickle the Doc it belongs to.")
|
2019-02-13 15:22:05 +03:00
|
|
|
E112 = ("Pickling a span is not supported, because spans are only views "
|
|
|
|
"of the parent Doc and can't exist on their own. A pickled span "
|
|
|
|
"would always have to include its Doc and Vocab, which has "
|
|
|
|
"practically no advantage over pickling the parent Doc directly. "
|
|
|
|
"So instead of pickling the span, pickle the Doc it belongs to or "
|
|
|
|
"use Span.as_doc to convert the span to a standalone Doc object.")
|
2019-02-14 17:35:20 +03:00
|
|
|
E113 = ("The newly split token can only have one root (head = 0).")
|
2019-02-24 17:11:28 +03:00
|
|
|
E114 = ("The newly split token needs to have a root (head = 0).")
|
|
|
|
E115 = ("All subtokens must have associated heads.")
|
2019-02-14 22:03:19 +03:00
|
|
|
E116 = ("Cannot currently add labels to pre-trained text classifier. Add "
|
|
|
|
"labels before training begins. This functionality was available "
|
|
|
|
"in previous versions, but had significant bugs that led to poor "
|
2019-02-24 17:11:28 +03:00
|
|
|
"performance.")
|
2019-02-15 19:32:31 +03:00
|
|
|
E117 = ("The newly split tokens must match the text of the original token. "
|
2019-02-17 14:22:07 +03:00
|
|
|
"New orths: {new}. Old text: {old}.")
|
2019-02-24 20:38:47 +03:00
|
|
|
E118 = ("The custom extension attribute '{attr}' is not registered on the "
|
|
|
|
"Token object so it can't be set during retokenization. To "
|
|
|
|
"register an attribute, use the Token.set_extension classmethod.")
|
|
|
|
E119 = ("Can't set custom extension attribute '{attr}' during retokenization "
|
|
|
|
"because it's not writable. This usually means it was registered "
|
|
|
|
"with a getter function (and no setter) or as a method extension, "
|
|
|
|
"so the value is computed dynamically. To overwrite a custom "
|
|
|
|
"attribute manually, it should be registered with a default value "
|
|
|
|
"or with a getter AND setter.")
|
|
|
|
E120 = ("Can't set custom extension attributes during retokenization. "
|
|
|
|
"Expected dict mapping attribute names to values, but got: {value}")
|
2019-03-08 13:42:26 +03:00
|
|
|
E121 = ("Can't bulk merge spans. Attribute length {attr_len} should be "
|
|
|
|
"equal to span length ({span_len}).")
|
|
|
|
E122 = ("Cannot find token to be split. Did it get merged?")
|
|
|
|
E123 = ("Cannot find head of token to be split. Did it get merged?")
|
2019-03-09 01:15:23 +03:00
|
|
|
E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
|
2019-03-08 13:42:26 +03:00
|
|
|
E125 = ("Unexpected value: {value}")
|
|
|
|
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
|
|
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
|
|
|
E127 = ("Cannot create phrase pattern representation for length 0. This "
|
|
|
|
"is likely a bug in spaCy.")
|
2019-03-10 21:16:45 +03:00
|
|
|
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
|
|
|
"arguments to exclude fields from being serialized or deserialized "
|
|
|
|
"is now deprecated. Please use the `exclude` argument instead. "
|
|
|
|
"For example: exclude=['{arg}'].")
|
2019-03-15 02:46:45 +03:00
|
|
|
E129 = ("Cannot write the label of an existing Span object because a Span "
|
|
|
|
"is a read-only view of the underlying Token objects stored in the Doc. "
|
|
|
|
"Instead, create a new Span object and specify the `label` keyword argument, "
|
|
|
|
"for example:\nfrom spacy.tokens import Span\n"
|
|
|
|
"span = Span(doc, start={start}, end={end}, label='{label}')")
|
2019-03-20 11:55:45 +03:00
|
|
|
E130 = ("You are running a narrow unicode build, which is incompatible "
|
|
|
|
"with spacy >= 2.1.0. To fix this, reinstall Python and use a wide "
|
|
|
|
"unicode build instead. You can also rebuild Python and set the "
|
|
|
|
"--enable-unicode=ucs4 flag.")
|
2019-03-22 14:05:35 +03:00
|
|
|
E131 = ("Cannot write the kb_id of an existing Span object because a Span "
|
|
|
|
"is a read-only view of the underlying Token objects stored in the Doc. "
|
|
|
|
"Instead, create a new Span object and specify the `kb_id` keyword argument, "
|
|
|
|
"for example:\nfrom spacy.tokens import Span\n"
|
|
|
|
"span = Span(doc, start={start}, end={end}, label='{label}', kb_id='{kb_id}')")
|
2019-03-22 18:55:05 +03:00
|
|
|
E132 = ("The vectors for entities and probabilities for alias '{alias}' should have equal length, "
|
|
|
|
"but found {entities_length} and {probabilities_length} respectively.")
|
|
|
|
E133 = ("The sum of prior probabilities for alias '{alias}' should not exceed 1, "
|
|
|
|
"but found {sum}.")
|
|
|
|
E134 = ("Alias '{alias}' defined for unknown entity '{entity}'.")
|
2019-05-14 17:59:31 +03:00
|
|
|
E135 = ("If you meant to replace a built-in component, use `create_pipe`: "
|
|
|
|
"`nlp.replace_pipe('{name}', nlp.create_pipe('{name}'))`")
|
2019-05-30 15:34:58 +03:00
|
|
|
E136 = ("This additional feature requires the jsonschema library to be "
|
|
|
|
"installed:\npip install jsonschema")
|
2019-06-16 14:22:57 +03:00
|
|
|
E137 = ("Expected 'dict' type, but got '{type}' from '{line}'. Make sure to provide a valid JSON "
|
|
|
|
"object as input with either the `text` or `tokens` key. For more info, see the docs:\n"
|
|
|
|
"https://spacy.io/api/cli#pretrain-jsonl")
|
|
|
|
E138 = ("Invalid JSONL format for raw text '{text}'. Make sure the input includes either the "
|
|
|
|
"`text` or `tokens` key. For more info, see the docs:\n"
|
|
|
|
"https://spacy.io/api/cli#pretrain-jsonl")
|
2019-06-19 13:35:26 +03:00
|
|
|
E139 = ("Knowledge base for component '{name}' not initialized. Did you forget to call set_kb()?")
|
|
|
|
E140 = ("The list of entities, prior probabilities and entity vectors should be of equal length.")
|
|
|
|
E141 = ("Entity vectors should be of length {required} instead of the provided {found}.")
|
2019-06-20 11:35:37 +03:00
|
|
|
E142 = ("Unsupported loss_function '{loss_func}'. Use either 'L2' or 'cosine'")
|
2019-07-10 20:39:38 +03:00
|
|
|
E143 = ("Labels for component '{name}' not initialized. Did you forget to call add_label()?")
|
2019-07-15 12:42:50 +03:00
|
|
|
E144 = ("Could not find parameter `{param}` when building the entity linker model.")
|
2019-07-22 15:36:07 +03:00
|
|
|
E145 = ("Error reading `{param}` from input file.")
|
2019-07-22 15:56:13 +03:00
|
|
|
E146 = ("Could not access `{path}`.")
|
2019-07-23 12:52:48 +03:00
|
|
|
E147 = ("Unexpected error in the {method} functionality of the EntityLinker: {msg}. "
|
|
|
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
|
|
|
E148 = ("Expected {ents} KB identifiers but got {ids}. Make sure that each entity in `doc.ents` "
|
|
|
|
"is assigned to a KB identifier.")
|
2019-07-24 12:27:34 +03:00
|
|
|
E149 = ("Error deserializing model. Check that the config used to create the "
|
|
|
|
"component matches the model being loaded.")
|
2019-08-01 18:13:01 +03:00
|
|
|
E150 = ("The language of the `nlp` object and the `vocab` should be the same, "
|
|
|
|
"but found '{nlp}' and '{vocab}' respectively.")
|
2019-08-06 12:01:25 +03:00
|
|
|
E151 = ("Trying to call nlp.update without required annotation types. "
|
|
|
|
"Expected top-level keys: {expected_keys}."
|
|
|
|
" Got: {unexpected_keys}.")
|
2019-08-13 16:38:59 +03:00
|
|
|
E152 = ("The `nlp` object should have a pre-trained `ner` component.")
|
|
|
|
E153 = ("Either provide a path to a preprocessed training directory, "
|
|
|
|
"or to the original Wikipedia XML dump.")
|
|
|
|
E154 = ("Either the `nlp` model or the `vocab` should be specified.")
|
|
|
|
E155 = ("The `nlp` object should have access to pre-trained word vectors, cf. "
|
|
|
|
"https://spacy.io/usage/models#languages.")
|
2019-02-13 17:29:08 +03:00
|
|
|
|
2018-04-03 16:50:31 +03:00
|
|
|
@add_codes
class TempErrors(object):
    """Messages for limitations that are expected to be temporary.

    Each attribute name is a `T`-prefixed code and each value is the message
    template for that code. The class is wrapped by `add_codes`, which is
    documented as adding the code to the message via the attribute name.
    """

    T003 = "Resizing pre-trained Tagger models is not currently supported."
    T004 = "Currently parser depth is hard-coded to 1. Received: {value}."
    T007 = (
        "Can't yet set {attr} from Span. "
        "Vote for this feature on the issue tracker: "
        "http://github.com/explosion/spaCy/issues"
    )
    T008 = (
        "Bad configuration of Tagger. "
        "This is probably a bug within spaCy. "
        "We changed the name of an internal attribute for loading pre-trained "
        "vectors, and the class has been passed the old name (pretrained_dims) "
        "but not the new name (pretrained_vectors)."
    )
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
# fmt: on
|
|
|
|
|
|
|
|
|
2019-02-12 17:47:26 +03:00
|
|
|
class MatchPatternError(ValueError):
    def __init__(self, key, errors):
        """Build a readable report of invalid match patterns.

        key (unicode): Name of the matcher rule the patterns belong to.
        errors (dict): Maps each pattern ID (the index of the added pattern)
            to a sequence of validation error strings.
        """
        # Header first, then one section per pattern with a bullet per error.
        sections = ["Invalid token patterns for matcher rule '{}'\n".format(key)]
        for idx, problems in errors.items():
            bullets = "\n".join("- {}".format(problem) for problem in problems)
            sections.append("\nPattern {}:\n{}\n".format(idx, bullets))
        ValueError.__init__(self, "".join(sections))
|
|
|
|
|
|
|
|
|
2018-04-03 16:50:31 +03:00
|
|
|
class ModelsWarning(UserWarning):
    """Warning category registered under the "models" key of WARNINGS."""
|
|
|
|
|
|
|
|
|
|
|
|
# Maps each warning type name to its warning category class. The keys are the
# names that _get_warn_types accepts; any other name is filtered out there.
WARNINGS = {
    "user": UserWarning,
    "deprecation": DeprecationWarning,
    "models": ModelsWarning,
}
|
|
|
|
|
|
|
|
|
|
|
|
def _get_warn_types(arg):
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
if arg == "": # don't show any warnings
|
2018-04-03 16:50:31 +03:00
|
|
|
return []
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
if not arg or arg == "all": # show all available warnings
|
2018-04-03 16:50:31 +03:00
|
|
|
return WARNINGS.keys()
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
return [w_type.strip() for w_type in arg.split(",") if w_type.strip() in WARNINGS]
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
2018-05-21 02:22:38 +03:00
|
|
|
def _get_warn_excl(arg):
|
|
|
|
if not arg:
|
|
|
|
return []
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
return [w_id.strip() for w_id in arg.split(",")]
|
2018-05-21 02:22:38 +03:00
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` actually get meaningful results.
At the moment, the code style and linting isn't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
# Warning settings, read from the environment once at import time.
# Raw value of SPACY_WARNING_FILTER; None when the variable is unset.
# NOTE(review): presumably used as a `warnings` filter action by _warn - confirm.
SPACY_WARNING_FILTER = os.environ.get("SPACY_WARNING_FILTER")
# Warning type names to show, parsed from a comma-separated string.
SPACY_WARNING_TYPES = _get_warn_types(os.environ.get("SPACY_WARNING_TYPES"))
# Warning IDs to exclude, parsed from a comma-separated string.
SPACY_WARNING_IGNORE = _get_warn_excl(os.environ.get("SPACY_WARNING_IGNORE"))
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
|
|
|
def user_warning(message):
    """Show a warning of type "user".

    message (unicode): The message to display.
    """
    _warn(message, warn_type="user")
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
|
|
|
def deprecation_warning(message):
    """Show a warning of type "deprecation".

    message (unicode): The message to display.
    """
    _warn(message, warn_type="deprecation")
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
|
|
|
def models_warning(message):
    """Show a warning of type "models".

    message (unicode): The message to display.
    """
    _warn(message, warn_type="models")
|
2018-04-03 16:50:31 +03:00
|
|
|
|
|
|
|
|
💫 Tidy up and auto-format .py files (#2983)
<!--- Provide a general summary of your changes in the title. -->
## Description
- [x] Use [`black`](https://github.com/ambv/black) to auto-format all `.py` files.
- [x] Update flake8 config to exclude very large files (lemmatization tables etc.)
- [x] Update code to be compatible with flake8 rules
- [x] Fix various small bugs, inconsistencies and messy stuff in the language data
- [x] Update docs to explain new code style (`black`, `flake8`, when to use `# fmt: off` and `# fmt: on` and what `# noqa` means)
Once #2932 is merged, which auto-formats and tidies up the CLI, we'll be able to run `flake8 spacy` and actually get meaningful results.
At the moment, the code style and linting aren't applied automatically, but I'm hoping that the new [GitHub Actions](https://github.com/features/actions) will let us auto-format pull requests and post comments with relevant linting information.
### Types of change
enhancement, code style
## Checklist
<!--- Before you submit the PR, go over this checklist and make sure you can
tick off all the boxes. [] -> [x] -->
- [x] I have submitted the spaCy Contributor Agreement.
- [x] I ran the tests, and all new and existing tests passed.
- [x] My changes don't require a change to the documentation, or if they do, I've added all required information.
2018-11-30 19:03:03 +03:00
|
|
|
def _warn(message, warn_type="user"):
    """Show a warning, unless its type or ID has been filtered out.

    The warning is emitted only if `warn_type` is listed in
    SPACY_WARNING_TYPES and the message's ID (if it has one) is not listed
    in SPACY_WARNING_IGNORE.

    message (unicode): The message to display. If it starts with "[", the
        text up to the first "]" is treated as the warning ID.
    warn_type (unicode): Key used to look up the warning category in
        WARNINGS.
    """
    # Pull the ID out of a leading "[...]" prefix, if the message has one.
    code = message[1:].partition("]")[0] if message.startswith("[") else None
    if code and code in SPACY_WARNING_IGNORE:
        return
    if warn_type not in SPACY_WARNING_TYPES:
        return
    warning_cls = WARNINGS[warn_type]
    # Report the warning against the last entry of inspect.stack(), i.e. the
    # outermost call on the stack, rather than against this helper.
    outermost = inspect.stack()[-1]
    filename, lineno = outermost[1], outermost[2]
    # catch_warnings() restores the global filter state on exit, so the
    # optional simplefilter() only applies to this one warning.
    with warnings.catch_warnings():
        if SPACY_WARNING_FILTER:
            warnings.simplefilter(SPACY_WARNING_FILTER, warning_cls)
        warnings.warn_explicit(message, warning_cls, filename, lineno)
|