refactor fixes (#5664)

* fixes in ud_train, UX for morphs

* update pyproject with new version of thinc

* fixes in debug_data script

* cleanup of old unused error messages

* remove obsolete TempErrors

* move error messages to errors.py

* add ENT_KB_ID to default DocBin serialization

* few fixes to simple_ner

* fix tags
Sofie Van Landeghem 2020-06-29 14:33:00 +02:00 committed by GitHub
parent fc3cb1fa9e
commit 8d3c0306e1
13 changed files with 68 additions and 139 deletions

View File

@@ -78,8 +78,7 @@ def read_data(
             head = int(head) - 1 if head != "0" else id_
             sent["words"].append(word)
             sent["tags"].append(tag)
-            sent["morphology"].append(_parse_morph_string(morph))
-            sent["morphology"][-1].add("POS_%s" % pos)
+            sent["morphs"].append(_compile_morph_string(morph, pos))
             sent["heads"].append(head)
             sent["deps"].append("ROOT" if dep == "root" else dep)
             sent["spaces"].append(space_after == "_")
@@ -88,12 +87,12 @@ def read_data(
         if oracle_segments:
             docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
             golds.append(sent)
-            assert golds[-1].morphology is not None
+            assert golds[-1]["morphs"] is not None
         sent_annots.append(sent)
         if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
             doc, gold = _make_gold(nlp, None, sent_annots)
-            assert gold.morphology is not None
+            assert gold["morphs"] is not None
             sent_annots = []
             docs.append(doc)
             golds.append(gold)
@@ -109,17 +108,10 @@ def read_data(
     return golds_to_gold_data(docs, golds)


-def _parse_morph_string(morph_string):
+def _compile_morph_string(morph_string, pos):
     if morph_string == '_':
-        return set()
-    output = []
-    replacements = {'1': 'one', '2': 'two', '3': 'three'}
-    for feature in morph_string.split('|'):
-        key, value = feature.split('=')
-        value = replacements.get(value, value)
-        value = value.split(',')[0]
-        output.append('%s_%s' % (key, value.lower()))
-    return set(output)
+        return f"POS={pos}"
+    return morph_string + f"|POS={pos}"


 def read_conllu(file_):
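Reviewer note: the new helper simply folds the UPOS tag into the raw UD feature string instead of lowercasing features into a set. A minimal sketch of the expected behavior, re-implemented standalone for illustration:

def _compile_morph_string(morph_string, pos):
    # "_" is the CoNLL-U marker for "no features": only the POS survives.
    if morph_string == '_':
        return f"POS={pos}"
    return morph_string + f"|POS={pos}"

assert _compile_morph_string("_", "NOUN") == "POS=NOUN"
assert _compile_morph_string("Case=Nom|Number=Sing", "NOUN") == "Case=Nom|Number=Sing|POS=NOUN"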
@@ -155,7 +147,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
     sent_starts = []
     for sent in sent_annots:
         gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
-        for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
+        for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]:
             gold[field].extend(sent[field])
         sent_starts.append(True)
         sent_starts.extend([False] * (len(sent["words"]) - 1))
@@ -168,7 +160,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
         doc = nlp.make_doc(text)
     gold.pop("spaces")
     gold["sent_starts"] = sent_starts
-    for i in range(len(gold.heads)):
+    for i in range(len(gold["heads"])):
         if random.random() < drop_deps:
             gold["heads"][i] = None
             gold["labels"][i] = None
@@ -185,7 +177,7 @@ def golds_to_gold_data(docs, golds):
     """Get out the training data format used by begin_training"""
     data = []
     for doc, gold in zip(docs, golds):
-        example = Example.from_dict(doc, gold)
+        example = Example.from_dict(doc, dict(gold))
         data.append(example)
     return data
@@ -354,8 +346,7 @@ def initialize_pipeline(nlp, examples, config, device):
     if config.multitask_sent:
         nlp.parser.add_multitask_objective("sent_start")
     for eg in examples:
-        gold = eg.gold
-        for tag in gold.tags:
+        for tag in eg.get_aligned("TAG", as_string=True):
             if tag is not None:
                 nlp.tagger.add_label(tag)
     if torch is not None and device != -1:
@@ -489,10 +480,6 @@ def main(
     Token.set_extension("begins_fused", default=False)
     Token.set_extension("inside_fused", default=False)
-    Token.set_extension("get_conllu_lines", method=get_token_conllu)
-    Token.set_extension("begins_fused", default=False)
-    Token.set_extension("inside_fused", default=False)
     spacy.util.fix_random_seed()
     lang.zh.Chinese.Defaults.use_jieba = False
     lang.ja.Japanese.Defaults.use_janome = False
@@ -535,10 +522,10 @@ def main(
         else:
             batches = minibatch(examples, size=batch_sizes)
         losses = {}
-        n_train_words = sum(len(eg.doc) for eg in examples)
+        n_train_words = sum(len(eg.predicted) for eg in examples)
         with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
             for batch in batches:
-                pbar.update(sum(len(ex.doc) for ex in batch))
+                pbar.update(sum(len(ex.predicted) for ex in batch))
                 nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                 nlp.update(
                     batch,

View File

@@ -283,7 +283,7 @@ def initialize_pipeline(nlp, examples, config):
     nlp.parser.moves.add_action(2, "subtok")
     nlp.add_pipe(nlp.create_pipe("tagger"))
     for eg in examples:
-        for tag in eg.gold.tags:
+        for tag in eg.get_aligned("TAG", as_string=True):
             if tag is not None:
                 nlp.tagger.add_label(tag)
     # Replace labels that didn't make the frequency cutoff
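Reviewer note: `eg.gold.tags` is gone now that GoldParse has been replaced by Example; token-level annotations are read back through the alignment instead. A hedged sketch of the replacement API, with the import path assumed from this branch's layout:

from spacy.lang.en import English
from spacy.gold import Example  # assumed module path on the develop branch

nlp = English()
doc = nlp.make_doc("I like trees")
eg = Example.from_dict(doc, {"words": ["I", "like", "trees"], "tags": ["PRP", "VBP", "NNS"]})
# TAG values from the reference, projected onto the predicted tokens:
print(eg.get_aligned("TAG", as_string=True))  # expected: ['PRP', 'VBP', 'NNS']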

View File

@@ -56,7 +56,7 @@ def main(model=None, output_dir=None, n_iter=100):
             print("Add label", ent[2])
             ner.add_label(ent[2])

-    with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
+    with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings():
         # show warnings for misaligned entity spans once
         warnings.filterwarnings("once", category=UserWarning, module="spacy")
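Reviewer note: independent of the `simple_ner` rename, chaining context managers with `and` looks like a latent bug in this example: `A and B` evaluates to `B` when `A` is truthy, so only the right-hand manager is entered and the pipe selection is never restored on exit. A corrected sketch of the same line, using the standard comma form (not part of this commit):

import warnings

# Hypothetical fix: enter BOTH context managers.
with nlp.select_pipes(enable="simple_ner"), warnings.catch_warnings():
    # show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module="spacy")
    ...  # training loop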

View File

@@ -102,9 +102,6 @@ def debug_data(
     corpus = Corpus(train_path, dev_path)
     try:
         train_dataset = list(corpus.train_dataset(nlp))
-        train_dataset_unpreprocessed = list(
-            corpus.train_dataset_without_preprocessing(nlp)
-        )
     except ValueError as e:
         loading_train_error_message = f"Training data cannot be loaded: {e}"
     try:
@@ -120,11 +117,9 @@ def debug_data(
     msg.good("Corpus is loadable")

     # Create all gold data here to avoid iterating over the train_dataset constantly
-    gold_train_data = _compile_gold(train_dataset, pipeline, nlp)
-    gold_train_unpreprocessed_data = _compile_gold(
-        train_dataset_unpreprocessed, pipeline
-    )
-    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp)
+    gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
+    gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False)
+    gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)

     train_texts = gold_train_data["texts"]
     dev_texts = gold_dev_data["texts"]
@@ -497,7 +492,7 @@ def _load_file(file_path: Path, msg: Printer) -> None:

 def _compile_gold(
-    examples: Sequence[Example], pipeline: List[str], nlp: Language
+    examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
 ) -> Dict[str, Any]:
     data = {
         "ner": Counter(),
@@ -517,9 +512,9 @@ def _compile_gold(
         "n_cats_multilabel": 0,
         "texts": set(),
     }
-    for example in examples:
-        gold = example.reference
-        doc = example.predicted
+    for eg in examples:
+        gold = eg.reference
+        doc = eg.predicted
         valid_words = [x for x in gold if x is not None]
         data["words"].update(valid_words)
         data["n_words"] += len(valid_words)
@@ -530,7 +525,7 @@ def _compile_gold(
             if nlp.vocab.strings[word] not in nlp.vocab.vectors:
                 data["words_missing_vectors"].update([word])
         if "ner" in pipeline:
-            for i, label in enumerate(gold.ner):
+            for i, label in enumerate(eg.get_aligned_ner()):
                 if label is None:
                     continue
                 if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
@@ -556,16 +551,18 @@ def _compile_gold(
             if list(gold.cats.values()).count(1.0) != 1:
                 data["n_cats_multilabel"] += 1
         if "tagger" in pipeline:
-            data["tags"].update([x for x in gold.tags if x is not None])
+            tags = eg.get_aligned("TAG", as_string=True)
+            data["tags"].update([x for x in tags if x is not None])
         if "parser" in pipeline:
-            data["deps"].update([x for x in gold.labels if x is not None])
-            for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
+            aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
+            data["deps"].update([x for x in aligned_deps if x is not None])
+            for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
                 if head == i:
                     data["roots"].update([dep])
                     data["n_sents"] += 1
-            if nonproj.is_nonproj_tree(gold.heads):
+            if nonproj.is_nonproj_tree(aligned_heads):
                 data["n_nonproj"] += 1
-            if nonproj.contains_cycle(gold.heads):
+            if nonproj.contains_cycle(aligned_heads):
                 data["n_cycles"] += 1
     return data
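Reviewer note: `make_proj` is forwarded to `Example.get_aligned_parse`, so debug_data can now count the same corpus in both projectivized and raw form instead of keeping a second unpreprocessed dataset around. A hedged sketch of the two calls on a toy non-projective tree (API names as in the diff, import path assumed):

from spacy.lang.en import English
from spacy.gold import Example  # assumed module path on the develop branch

nlp = English()
doc = nlp.make_doc("a b c d")
# Heads chosen so arcs 0->2 and 1->3 cross, i.e. the tree is non-projective.
annots = {"words": ["a", "b", "c", "d"], "heads": [2, 3, 2, 2], "deps": ["dep"] * 4}
eg = Example.from_dict(doc, annots)

raw_heads, raw_deps = eg.get_aligned_parse(projectivize=False)    # heads as annotated
proj_heads, proj_deps = eg.get_aligned_parse(projectivize=True)   # crossing arcs lifted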
@@ -581,7 +578,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
     for eg in data:
         labels = [
             label.split("-")[1]
-            for label in eg.gold.ner
+            for label in eg.get_aligned_ner()
             if label not in ("O", "-", None)
         ]
         if label not in labels:

View File

@@ -132,6 +132,7 @@ class Warnings(object):
             "are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")

     # TODO: fix numbering after merging develop into master
+    W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
     W093 = ("Could not find any data to train the {name} on. Is your "
             "input data correctly formatted ?")
     W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
@@ -154,7 +155,7 @@ class Warnings(object):
             "so a default configuration was used.")
     W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
             "but got '{type}' instead, so ignoring it.")
-    W100 = ("Skipping unsupported morphological feature(s): {feature}. "
+    W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
             "Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
             "string \"Field1=Value1,Value2|Field2=Value3\".")
@@ -182,18 +183,13 @@ class Errors(object):
             "`nlp.select_pipes()`, you should remove them explicitly with "
             "`nlp.remove_pipe()` before the pipeline is restored. Names of "
             "the new components: {names}")
-    E009 = ("The `update` method expects same number of docs and golds, but "
-            "got: {n_docs} docs, {n_golds} golds.")
     E010 = ("Word vectors set to length 0. This may be because you don't have "
             "a model installed or loaded, or because your model doesn't "
             "include word vectors. For more info, see the docs:\n"
             "https://spacy.io/usage/models")
     E011 = ("Unknown operator: '{op}'. Options: {opts}")
     E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
-    E013 = ("Error selecting action in matcher")
     E014 = ("Unknown tag ID: {tag}")
-    E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
-            "`force=True` to overwrite.")
     E016 = ("MultitaskObjective target should be function or one of: dep, "
             "tag, ent, dep_tag_offset, ent_tag.")
     E017 = ("Can only add unicode or bytes. Got type: {value_type}")
@@ -201,21 +197,8 @@ class Errors(object):
             "refers to an issue with the `Vocab` or `StringStore`.")
     E019 = ("Can't create transition with unknown action ID: {action}. Action "
             "IDs are enumerated in spacy/syntax/{src}.pyx.")
-    E020 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The tree is non-projective (i.e. it has "
-            "crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
-            "The ArcEager transition system only supports projective trees. "
-            "To learn non-projective representations, transform the data "
-            "before training and after parsing. Either pass "
-            "`make_projective=True` to the GoldParse class, or use "
-            "spacy.syntax.nonproj.preprocess_training_data.")
-    E021 = ("Could not find a gold-standard action to supervise the "
-            "dependency parser. The GoldParse was projective. The transition "
-            "system has {n_actions} actions. State at failure: {state}")
     E022 = ("Could not find a transition with the name '{name}' in the NER "
             "model.")
-    E023 = ("Error cleaning up beam: The same state occurred twice at "
-            "memory address {addr} and position {i}.")
     E024 = ("Could not find an optimal move to supervise the parser. Usually, "
             "this means that the model can't be updated in a way that's valid "
             "and satisfies the correct annotations specified in the GoldParse. "
@@ -259,7 +242,6 @@ class Errors(object):
             "offset {start}.")
     E037 = ("Error calculating span: Can't find a token ending at character "
             "offset {end}.")
-    E038 = ("Error finding sentence for span. Infinite loop detected.")
     E039 = ("Array bounds exceeded while searching for root word. This likely "
             "means the parse tree is in an invalid state. Please report this "
             "issue here: http://github.com/explosion/spaCy/issues")
@@ -290,8 +272,6 @@ class Errors(object):
     E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
     E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
             "({rows}, {cols}).")
-    E061 = ("Bad file name: {filename}. Example of a valid file name: "
-            "'vectors.128.f.bin'")
     E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
             "and 63 are occupied. You can replace one by specifying the "
             "`flag_id` explicitly, e.g. "
@@ -305,39 +285,17 @@ class Errors(object):
             "Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
     E065 = ("Only one of the vector table's width and shape can be specified. "
             "Got width {width} and shape {shape}.")
-    E066 = ("Error creating model helper for extracting columns. Can only "
-            "extract columns by positive integer. Got: {value}.")
     E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
             "an entity) without a preceding 'B' (beginning of an entity). "
             "Tag sequence:\n{tags}")
     E068 = ("Invalid BILUO tag: '{tag}'.")
-    E069 = ("Invalid gold-standard parse tree. Found cycle between word "
-            "IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
-            "with tokens: {doc_tokens}.")
-    E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
-            "does not align with number of annotations ({n_annots}).")
     E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
             "match the one in the vocab ({vocab_orth}).")
-    E072 = ("Error serializing lexeme: expected data length {length}, "
-            "got {bad_length}.")
     E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
             "are of length {length}. You can use `vocab.reset_vectors` to "
             "clear the existing vectors and resize the table.")
     E074 = ("Error interpreting compiled match pattern: patterns are expected "
             "to end with the attribute {attr}. Got: {bad_attr}.")
-    E075 = ("Error accepting match: length ({length}) > maximum length "
-            "(80,206).")
-    E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
-            "has {words} words.")
-    E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
-            "equal number of GoldParse objects ({n_golds}) in batch.")
-    E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
-            "not equal number of words in GoldParse ({words_gold}).")
-    E079 = ("Error computing states in beam: number of predicted beams "
-            "({pbeams}) does not equal number of gold beams ({gbeams}).")
-    E080 = ("Duplicate state found in beam: {key}.")
-    E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
-            "does not equal number of losses ({losses}).")
     E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
             "projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
             "match.")
@@ -345,8 +303,6 @@ class Errors(object):
             "`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
     E084 = ("Error assigning label ID {label} to span: not in StringStore.")
     E085 = ("Can't create lexeme for string '{string}'.")
-    E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
-            "not match hash {hash_id} in StringStore.")
     E087 = ("Unknown displaCy style: {style}.")
     E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
             "v2.x parser and NER models require roughly 1GB of temporary "
@@ -388,7 +344,6 @@ class Errors(object):
     E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
             "token can only be part of one entity, so make sure the entities "
             "you're setting don't overlap.")
-    E104 = ("Can't find JSON schema for '{name}'.")
     E105 = ("The Doc.print_tree() method is now deprecated. Please use "
             "Doc.to_json() instead or write your own function.")
     E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
@@ -411,8 +366,6 @@ class Errors(object):
             "practically no advantage over pickling the parent Doc directly. "
             "So instead of pickling the span, pickle the Doc it belongs to or "
             "use Span.as_doc to convert the span to a standalone Doc object.")
-    E113 = ("The newly split token can only have one root (head = 0).")
-    E114 = ("The newly split token needs to have a root (head = 0).")
     E115 = ("All subtokens must have associated heads.")
     E116 = ("Cannot currently add labels to pretrained text classifier. Add "
             "labels before training begins. This functionality was available "
@@ -435,12 +388,9 @@ class Errors(object):
             "equal to span length ({span_len}).")
     E122 = ("Cannot find token to be split. Did it get merged?")
     E123 = ("Cannot find head of token to be split. Did it get merged?")
-    E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
     E125 = ("Unexpected value: {value}")
     E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
             "This is likely a bug in spaCy, so feel free to open an issue.")
-    E127 = ("Cannot create phrase pattern representation for length 0. This "
-            "is likely a bug in spaCy.")
     E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
             "arguments to exclude fields from being serialized or deserialized "
             "is now deprecated. Please use the `exclude` argument instead. "
@@ -482,8 +432,6 @@ class Errors(object):
             "provided {found}.")
     E143 = ("Labels for component '{name}' not initialized. Did you forget to "
             "call add_label()?")
-    E144 = ("Could not find parameter `{param}` when building the entity "
-            "linker model.")
     E145 = ("Error reading `{param}` from input file.")
     E146 = ("Could not access `{path}`.")
     E147 = ("Unexpected error in the {method} functionality of the "
@@ -495,8 +443,6 @@ class Errors(object):
             "the component matches the model being loaded.")
     E150 = ("The language of the `nlp` object and the `vocab` should be the "
             "same, but found '{nlp}' and '{vocab}' respectively.")
-    E151 = ("Trying to call nlp.update without required annotation types. "
-            "Expected top-level keys: {exp}. Got: {unexp}.")
     E152 = ("The attribute {attr} is not supported for token patterns. "
             "Please use the option validate=True with Matcher, PhraseMatcher, "
             "or EntityRuler for more details.")
@@ -533,11 +479,6 @@ class Errors(object):
             "that case.")
     E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
             "Current DocBin: {current}\nOther DocBin: {other}")
-    E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
-            "happen if the tagger was trained with a different set of "
-            "morphological features. If you're using a pretrained model, make "
-            "sure that your models are up to date:\npython -m spacy validate")
-    E168 = ("Unknown field: {field}")
     E169 = ("Can't find module: {module}")
     E170 = ("Cannot apply transition {name}: invalid for the current state.")
     E171 = ("Matcher.add received invalid on_match callback argument: expected "
@@ -548,8 +489,6 @@ class Errors(object):
     E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
             "Lookups containing the lemmatization tables. See the docs for "
             "details: https://spacy.io/api/lemmatizer#init")
-    E174 = ("Architecture '{name}' not found in registry. Available "
-            "names: {names}")
     E175 = ("Can't remove rule for unknown match pattern ID: {key}")
     E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
     E177 = ("Ill-formed IOB input detected: {tag}")
@@ -597,10 +536,19 @@ class Errors(object):
     E198 = ("Unable to return {n} most similar vectors for the current vectors "
             "table, which contains {n_rows} vectors.")
     E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
-    E200 = ("Specifying a base model with a pretrained component '{component}' "
-            "can not be combined with adding a pretrained Tok2Vec layer.")

     # TODO: fix numbering after merging develop into master
+    E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
+            "array and {doc_length} for the Doc itself.")
+    E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
+    E973 = ("Unexpected type for NER data")
+    E974 = ("Unknown {obj} attribute: {key}")
+    E975 = ("The method Example.from_dict expects a Doc as first argument, "
+            "but got {type}")
+    E976 = ("The method Example.from_dict expects a dict as second argument, "
+            "but received None.")
+    E977 = ("Can not compare a MorphAnalysis with a string object. "
+            "This is likely a bug in spaCy, so feel free to open an issue.")
     E978 = ("The {method} method of component {name} takes a list of Example objects, "
             "but found {types} instead.")
     E979 = ("Cannot convert {type} to an Example object.")
@@ -648,13 +596,8 @@ class Errors(object):

 @add_codes
 class TempErrors(object):
     T003 = ("Resizing pretrained Tagger models is not currently supported.")
-    T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
     T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
             "issue tracker: http://github.com/explosion/spaCy/issues")
-    T008 = ("Bad configuration of Tagger. This is probably a bug within "
-            "spaCy. We changed the name of an internal attribute for loading "
-            "pretrained vectors, and the class has been passed the old name "
-            "(pretrained_dims) but not the new name (pretrained_vectors).")

 # fmt: on

View File

@@ -45,7 +45,7 @@ class Corpus:
     def make_examples(self, nlp, reference_docs, max_length=0):
         for reference in reference_docs:
-            if max_length >= 1 and len(reference) >= max_length:
+            if len(reference) >= max_length >= 1:
                 if reference.is_sentenced:
                     for ref_sent in reference.sents:
                         yield Example(
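Reviewer note: the rewritten condition is a chained comparison, equivalent to `len(reference) >= max_length and max_length >= 1`, so the default `max_length=0` still disables splitting. A quick sketch of the equivalence:

def over_limit(n_tokens, max_length):
    # Chained form as used in the diff: both comparisons must hold.
    return n_tokens >= max_length >= 1

assert over_limit(50, 10) is True    # long doc, limit active
assert over_limit(5, 10) is False    # under the limit
assert over_limit(50, 0) is False    # max_length=0 turns the limit off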

View File

@@ -2,7 +2,6 @@ import warnings

 import numpy

-from ..tokens import Token
 from ..tokens.doc cimport Doc
 from ..tokens.span cimport Span
 from ..tokens.span import Span
@@ -11,9 +10,8 @@ from .align cimport Alignment
 from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
 from .iob_utils import spans_from_biluo_tags
 from .align import Alignment
-from ..errors import Errors, AlignmentError
+from ..errors import Errors, Warnings
 from ..syntax import nonproj
-from ..util import get_words_and_spaces


 cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
@@ -32,11 +30,10 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):

 cdef class Example:
     def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
         """ Doc can either be text, or an actual Doc """
-        msg = "Example.__init__ got None for '{arg}'. Requires Doc."
         if predicted is None:
-            raise TypeError(msg.format(arg="predicted"))
+            raise TypeError(Errors.E972.format(arg="predicted"))
         if reference is None:
-            raise TypeError(msg.format(arg="reference"))
+            raise TypeError(Errors.E972.format(arg="reference"))
         self.x = predicted
         self.y = reference
         self._alignment = alignment
@@ -64,9 +61,9 @@ cdef class Example:
     @classmethod
     def from_dict(cls, Doc predicted, dict example_dict):
         if example_dict is None:
-            raise ValueError("Example.from_dict expected dict, received None")
+            raise ValueError(Errors.E976)
         if not isinstance(predicted, Doc):
-            raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
+            raise TypeError(Errors.E975.format(type=type(predicted)))
         example_dict = _fix_legacy_dict_data(example_dict)
         tok_dict, doc_dict = _parse_example_dict_data(example_dict)
         if "ORTH" not in tok_dict:
@@ -118,6 +115,7 @@ cdef class Example:
         aligned_deps = [None] * self.x.length
         heads = [token.head.i for token in self.y]
         deps = [token.dep_ for token in self.y]
-        heads, deps = nonproj.projectivize(heads, deps)
+        if projectivize:
+            heads, deps = nonproj.projectivize(heads, deps)
         for cand_i in range(self.x.length):
             gold_i = cand_to_gold[cand_i]
@@ -245,11 +243,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
         elif key == "cats":
             pass
         else:
-            raise ValueError(f"Unknown doc attribute: {key}")
+            raise ValueError(Errors.E974.format(obj="doc", key=key))

     for key, value in tok_annot.items():
         if key not in IDS:
-            raise ValueError(f"Unknown token attribute: {key}")
+            raise ValueError(Errors.E974.format(obj="token", key=key))
         elif key in ["ORTH", "SPACY"]:
             pass
         elif key == "HEAD":
@@ -289,7 +287,7 @@ def _add_entities_to_doc(doc, ner_data):
         doc.ents = ner_data
         doc.ents = [span for span in ner_data if span.label_]
     else:
-        raise ValueError("Unexpected type for NER data")
+        raise ValueError(Errors.E973)


 def _parse_example_dict_data(example_dict):
@@ -341,7 +339,7 @@ def _fix_legacy_dict_data(example_dict):
     if "HEAD" in token_dict and "SENT_START" in token_dict:
         # If heads are set, we don't also redundantly specify SENT_START.
         token_dict.pop("SENT_START")
-        warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set")
+        warnings.warn(Warnings.W092)
     return {
         "token_annotation": token_dict,
         "doc_annotation": doc_dict

View File

@@ -145,7 +145,7 @@ def json_to_annotations(doc):
         example["doc_annotation"] = dict(
             cats=cats,
             entities=ner_tags,
-            links=paragraph.get("links", []) # TODO: fix/test
+            links=paragraph.get("links", [])
         )
         yield example

View File

@@ -107,9 +107,9 @@ cdef class Morphology:
         Returns the hash of the new analysis.
         """
         cdef MorphAnalysisC* tag_ptr
-        if isinstance(features, str):
-            if features == self.EMPTY_MORPH:
-                features = ""
+        if features == self.EMPTY_MORPH:
+            features = ""
+        if isinstance(features, str):
             tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
             if tag_ptr != NULL:
                 return tag_ptr.key

View File

@@ -70,7 +70,7 @@ class SimpleNER(Pipe):
     def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
         if not any(_has_ner(eg) for eg in examples):
             return 0
-        docs = [eg.doc for eg in examples]
+        docs = [eg.predicted for eg in examples]
         set_dropout_rate(self.model, drop)
         scores, bp_scores = self.model.begin_update(docs)
         loss, d_scores = self.get_loss(examples, scores)
@@ -89,7 +89,8 @@ class SimpleNER(Pipe):
         d_scores = []
         truths = []
         for eg in examples:
-            gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner]
+            tags = eg.get_aligned("TAG", as_string=True)
+            gold_tags = [(tag if tag != "-" else None) for tag in tags]
             if not self.is_biluo:
                 gold_tags = biluo_to_iob(gold_tags)
             truths.append(gold_tags)
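Reviewer note: when the model is not configured for BILUO, the gold tags are down-converted with `biluo_to_iob` (U becomes B, L becomes I). A hedged sketch of that mapping, with the import path and None passthrough assumed:

from spacy.gold import biluo_to_iob  # assumed export on the develop branch

biluo = ["B-ORG", "L-ORG", "U-PER", "O", None]
print(biluo_to_iob(biluo))
# expected: ['B-ORG', 'I-ORG', 'B-PER', 'O', None]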
@@ -128,8 +129,8 @@ class SimpleNER(Pipe):
         pass


-def _has_ner(eg):
-    for ner_tag in eg.gold.ner:
+def _has_ner(example):
+    for ner_tag in example.get_aligned_ner():
         if ner_tag != "-" and ner_tag is not None:
             return True
     else:

View File

@@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
 from ..errors import Errors

-ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
+ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")


 class DocBin(object):
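Reviewer note: with ENT_KB_ID in the default attribute set, entity KB ids should now survive a DocBin round-trip without passing `attrs` explicitly. A hedged sketch:

from spacy.lang.en import English
from spacy.tokens import DocBin, Span

nlp = English()
doc = nlp.make_doc("She lives in Berlin")
doc.ents = [Span(doc, 3, 4, label="GPE", kb_id="Q64")]

doc_bin = DocBin()
doc_bin.add(doc)
data = doc_bin.to_bytes()

restored = list(DocBin().from_bytes(data).get_docs(nlp.vocab))[0]
print([(ent.text, ent.label_, ent.kb_id_) for ent in restored.ents])
# expected: [('Berlin', 'GPE', 'Q64')]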

View File

@@ -816,7 +816,7 @@ cdef class Doc:
         cdef TokenC* tokens = self.c
         cdef int length = len(array)
         if length != len(self):
-            raise ValueError("Cannot set array values longer than the document.")
+            raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
         # Get set up for fast loading
         cdef Pool mem = Pool()

View File

@@ -1,6 +1,7 @@
 from libc.string cimport memset
 cimport numpy as np

+from ..errors import Errors
 from ..vocab cimport Vocab
 from ..typedefs cimport hash_t, attr_t
 from ..morphology cimport list_features, check_feature, get_by_field
@@ -49,6 +50,8 @@ cdef class MorphAnalysis:
         return self.key

     def __eq__(self, other):
+        if isinstance(other, str):
+            raise ValueError(Errors.E977)
         return self.key == other.key

     def __ne__(self, other):
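Reviewer note: comparing a MorphAnalysis against a raw string now fails loudly with E977 instead of dying on a missing `.key` attribute. A hedged sketch of the intended pattern, comparing via the string form instead:

from spacy.lang.en import English

nlp = English()
doc = nlp("I like trees")
morph = doc[0].morph                  # a MorphAnalysis object

# morph == "PronType=Prs"             # would now raise ValueError (E977)
print(str(morph) == "PronType=Prs")   # compare the string form explicitly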