Mirror of https://github.com/explosion/spaCy.git (synced 2025-01-12 10:16:27 +03:00)
refactor fixes (#5664)
* fixes in ud_train, UX for morphs
* update pyproject with new version of thinc
* fixes in debug_data script
* cleanup of old unused error messages
* remove obsolete TempErrors
* move error messages to errors.py
* add ENT_KB_ID to default DocBin serialization
* few fixes to simple_ner
* fix tags
This commit is contained in:
parent
fc3cb1fa9e
commit
8d3c0306e1
|
@ -78,8 +78,7 @@ def read_data(
|
||||||
head = int(head) - 1 if head != "0" else id_
|
head = int(head) - 1 if head != "0" else id_
|
||||||
sent["words"].append(word)
|
sent["words"].append(word)
|
||||||
sent["tags"].append(tag)
|
sent["tags"].append(tag)
|
||||||
sent["morphology"].append(_parse_morph_string(morph))
|
sent["morphs"].append(_compile_morph_string(morph, pos))
|
||||||
sent["morphology"][-1].add("POS_%s" % pos)
|
|
||||||
sent["heads"].append(head)
|
sent["heads"].append(head)
|
||||||
sent["deps"].append("ROOT" if dep == "root" else dep)
|
sent["deps"].append("ROOT" if dep == "root" else dep)
|
||||||
sent["spaces"].append(space_after == "_")
|
sent["spaces"].append(space_after == "_")
|
||||||
|
@ -88,12 +87,12 @@ def read_data(
|
||||||
if oracle_segments:
|
if oracle_segments:
|
||||||
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
docs.append(Doc(nlp.vocab, words=sent["words"], spaces=sent["spaces"]))
|
||||||
golds.append(sent)
|
golds.append(sent)
|
||||||
assert golds[-1].morphology is not None
|
assert golds[-1]["morphs"] is not None
|
||||||
|
|
||||||
sent_annots.append(sent)
|
sent_annots.append(sent)
|
||||||
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
|
if raw_text and max_doc_length and len(sent_annots) >= max_doc_length:
|
||||||
doc, gold = _make_gold(nlp, None, sent_annots)
|
doc, gold = _make_gold(nlp, None, sent_annots)
|
||||||
assert gold.morphology is not None
|
assert gold["morphs"] is not None
|
||||||
sent_annots = []
|
sent_annots = []
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
golds.append(gold)
|
golds.append(gold)
|
||||||
|
@ -109,17 +108,10 @@ def read_data(
|
||||||
return golds_to_gold_data(docs, golds)
|
return golds_to_gold_data(docs, golds)
|
||||||
|
|
||||||
|
|
||||||
def _parse_morph_string(morph_string):
|
def _compile_morph_string(morph_string, pos):
|
||||||
if morph_string == '_':
|
if morph_string == '_':
|
||||||
return set()
|
return f"POS={pos}"
|
||||||
output = []
|
return morph_string + f"|POS={pos}"
|
||||||
replacements = {'1': 'one', '2': 'two', '3': 'three'}
|
|
||||||
for feature in morph_string.split('|'):
|
|
||||||
key, value = feature.split('=')
|
|
||||||
value = replacements.get(value, value)
|
|
||||||
value = value.split(',')[0]
|
|
||||||
output.append('%s_%s' % (key, value.lower()))
|
|
||||||
return set(output)
|
|
||||||
|
|
||||||
|
|
||||||
def read_conllu(file_):
|
def read_conllu(file_):
|
||||||
|
@ -155,7 +147,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
sent_starts = []
|
sent_starts = []
|
||||||
for sent in sent_annots:
|
for sent in sent_annots:
|
||||||
gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
|
gold["heads"].extend(len(gold["words"])+head for head in sent["heads"])
|
||||||
for field in ["words", "tags", "deps", "morphology", "entities", "spaces"]:
|
for field in ["words", "tags", "deps", "morphs", "entities", "spaces"]:
|
||||||
gold[field].extend(sent[field])
|
gold[field].extend(sent[field])
|
||||||
sent_starts.append(True)
|
sent_starts.append(True)
|
||||||
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
sent_starts.extend([False] * (len(sent["words"]) - 1))
|
||||||
|
@ -168,7 +160,7 @@ def _make_gold(nlp, text, sent_annots, drop_deps=0.0):
|
||||||
doc = nlp.make_doc(text)
|
doc = nlp.make_doc(text)
|
||||||
gold.pop("spaces")
|
gold.pop("spaces")
|
||||||
gold["sent_starts"] = sent_starts
|
gold["sent_starts"] = sent_starts
|
||||||
for i in range(len(gold.heads)):
|
for i in range(len(gold["heads"])):
|
||||||
if random.random() < drop_deps:
|
if random.random() < drop_deps:
|
||||||
gold["heads"][i] = None
|
gold["heads"][i] = None
|
||||||
gold["labels"][i] = None
|
gold["labels"][i] = None
|
||||||
|
@ -185,7 +177,7 @@ def golds_to_gold_data(docs, golds):
|
||||||
"""Get out the training data format used by begin_training"""
|
"""Get out the training data format used by begin_training"""
|
||||||
data = []
|
data = []
|
||||||
for doc, gold in zip(docs, golds):
|
for doc, gold in zip(docs, golds):
|
||||||
example = Example.from_dict(doc, gold)
|
example = Example.from_dict(doc, dict(gold))
|
||||||
data.append(example)
|
data.append(example)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@ -354,8 +346,7 @@ def initialize_pipeline(nlp, examples, config, device):
|
||||||
if config.multitask_sent:
|
if config.multitask_sent:
|
||||||
nlp.parser.add_multitask_objective("sent_start")
|
nlp.parser.add_multitask_objective("sent_start")
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
gold = eg.gold
|
for tag in eg.get_aligned("TAG", as_string=True):
|
||||||
for tag in gold.tags:
|
|
||||||
if tag is not None:
|
if tag is not None:
|
||||||
nlp.tagger.add_label(tag)
|
nlp.tagger.add_label(tag)
|
||||||
if torch is not None and device != -1:
|
if torch is not None and device != -1:
|
||||||
|
@ -489,10 +480,6 @@ def main(
|
||||||
Token.set_extension("begins_fused", default=False)
|
Token.set_extension("begins_fused", default=False)
|
||||||
Token.set_extension("inside_fused", default=False)
|
Token.set_extension("inside_fused", default=False)
|
||||||
|
|
||||||
Token.set_extension("get_conllu_lines", method=get_token_conllu)
|
|
||||||
Token.set_extension("begins_fused", default=False)
|
|
||||||
Token.set_extension("inside_fused", default=False)
|
|
||||||
|
|
||||||
spacy.util.fix_random_seed()
|
spacy.util.fix_random_seed()
|
||||||
lang.zh.Chinese.Defaults.use_jieba = False
|
lang.zh.Chinese.Defaults.use_jieba = False
|
||||||
lang.ja.Japanese.Defaults.use_janome = False
|
lang.ja.Japanese.Defaults.use_janome = False
|
||||||
|
@ -535,10 +522,10 @@ def main(
|
||||||
else:
|
else:
|
||||||
batches = minibatch(examples, size=batch_sizes)
|
batches = minibatch(examples, size=batch_sizes)
|
||||||
losses = {}
|
losses = {}
|
||||||
n_train_words = sum(len(eg.doc) for eg in examples)
|
n_train_words = sum(len(eg.predicted) for eg in examples)
|
||||||
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
|
||||||
for batch in batches:
|
for batch in batches:
|
||||||
pbar.update(sum(len(ex.doc) for ex in batch))
|
pbar.update(sum(len(ex.predicted) for ex in batch))
|
||||||
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
|
||||||
nlp.update(
|
nlp.update(
|
||||||
batch,
|
batch,
|
||||||
|
|
|
@ -283,7 +283,7 @@ def initialize_pipeline(nlp, examples, config):
|
||||||
nlp.parser.moves.add_action(2, "subtok")
|
nlp.parser.moves.add_action(2, "subtok")
|
||||||
nlp.add_pipe(nlp.create_pipe("tagger"))
|
nlp.add_pipe(nlp.create_pipe("tagger"))
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
for tag in eg.gold.tags:
|
for tag in eg.get_aligned("TAG", as_string=True):
|
||||||
if tag is not None:
|
if tag is not None:
|
||||||
nlp.tagger.add_label(tag)
|
nlp.tagger.add_label(tag)
|
||||||
# Replace labels that didn't make the frequency cutoff
|
# Replace labels that didn't make the frequency cutoff
|
||||||
|
|
|
@ -56,7 +56,7 @@ def main(model=None, output_dir=None, n_iter=100):
|
||||||
print("Add label", ent[2])
|
print("Add label", ent[2])
|
||||||
ner.add_label(ent[2])
|
ner.add_label(ent[2])
|
||||||
|
|
||||||
with nlp.select_pipes(enable="ner") and warnings.catch_warnings():
|
with nlp.select_pipes(enable="simple_ner") and warnings.catch_warnings():
|
||||||
# show warnings for misaligned entity spans once
|
# show warnings for misaligned entity spans once
|
||||||
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
warnings.filterwarnings("once", category=UserWarning, module="spacy")
|
||||||
|
|
||||||
|
|
|
@ -102,9 +102,6 @@ def debug_data(
|
||||||
corpus = Corpus(train_path, dev_path)
|
corpus = Corpus(train_path, dev_path)
|
||||||
try:
|
try:
|
||||||
train_dataset = list(corpus.train_dataset(nlp))
|
train_dataset = list(corpus.train_dataset(nlp))
|
||||||
train_dataset_unpreprocessed = list(
|
|
||||||
corpus.train_dataset_without_preprocessing(nlp)
|
|
||||||
)
|
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
loading_train_error_message = f"Training data cannot be loaded: {e}"
|
||||||
try:
|
try:
|
||||||
|
@ -120,11 +117,9 @@ def debug_data(
|
||||||
msg.good("Corpus is loadable")
|
msg.good("Corpus is loadable")
|
||||||
|
|
||||||
# Create all gold data here to avoid iterating over the train_dataset constantly
|
# Create all gold data here to avoid iterating over the train_dataset constantly
|
||||||
gold_train_data = _compile_gold(train_dataset, pipeline, nlp)
|
gold_train_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=True)
|
||||||
gold_train_unpreprocessed_data = _compile_gold(
|
gold_train_unpreprocessed_data = _compile_gold(train_dataset, pipeline, nlp, make_proj=False)
|
||||||
train_dataset_unpreprocessed, pipeline
|
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp, make_proj=True)
|
||||||
)
|
|
||||||
gold_dev_data = _compile_gold(dev_dataset, pipeline, nlp)
|
|
||||||
|
|
||||||
train_texts = gold_train_data["texts"]
|
train_texts = gold_train_data["texts"]
|
||||||
dev_texts = gold_dev_data["texts"]
|
dev_texts = gold_dev_data["texts"]
|
||||||
|
@ -497,7 +492,7 @@ def _load_file(file_path: Path, msg: Printer) -> None:
|
||||||
|
|
||||||
|
|
||||||
def _compile_gold(
|
def _compile_gold(
|
||||||
examples: Sequence[Example], pipeline: List[str], nlp: Language
|
examples: Sequence[Example], pipeline: List[str], nlp: Language, make_proj: bool
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
data = {
|
data = {
|
||||||
"ner": Counter(),
|
"ner": Counter(),
|
||||||
|
@ -517,9 +512,9 @@ def _compile_gold(
|
||||||
"n_cats_multilabel": 0,
|
"n_cats_multilabel": 0,
|
||||||
"texts": set(),
|
"texts": set(),
|
||||||
}
|
}
|
||||||
for example in examples:
|
for eg in examples:
|
||||||
gold = example.reference
|
gold = eg.reference
|
||||||
doc = example.predicted
|
doc = eg.predicted
|
||||||
valid_words = [x for x in gold if x is not None]
|
valid_words = [x for x in gold if x is not None]
|
||||||
data["words"].update(valid_words)
|
data["words"].update(valid_words)
|
||||||
data["n_words"] += len(valid_words)
|
data["n_words"] += len(valid_words)
|
||||||
|
@ -530,7 +525,7 @@ def _compile_gold(
|
||||||
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
if nlp.vocab.strings[word] not in nlp.vocab.vectors:
|
||||||
data["words_missing_vectors"].update([word])
|
data["words_missing_vectors"].update([word])
|
||||||
if "ner" in pipeline:
|
if "ner" in pipeline:
|
||||||
for i, label in enumerate(gold.ner):
|
for i, label in enumerate(eg.get_aligned_ner()):
|
||||||
if label is None:
|
if label is None:
|
||||||
continue
|
continue
|
||||||
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
if label.startswith(("B-", "U-", "L-")) and doc[i].is_space:
|
||||||
|
@ -556,16 +551,18 @@ def _compile_gold(
|
||||||
if list(gold.cats.values()).count(1.0) != 1:
|
if list(gold.cats.values()).count(1.0) != 1:
|
||||||
data["n_cats_multilabel"] += 1
|
data["n_cats_multilabel"] += 1
|
||||||
if "tagger" in pipeline:
|
if "tagger" in pipeline:
|
||||||
data["tags"].update([x for x in gold.tags if x is not None])
|
tags = eg.get_aligned("TAG", as_string=True)
|
||||||
|
data["tags"].update([x for x in tags if x is not None])
|
||||||
if "parser" in pipeline:
|
if "parser" in pipeline:
|
||||||
data["deps"].update([x for x in gold.labels if x is not None])
|
aligned_heads, aligned_deps = eg.get_aligned_parse(projectivize=make_proj)
|
||||||
for i, (dep, head) in enumerate(zip(gold.labels, gold.heads)):
|
data["deps"].update([x for x in aligned_deps if x is not None])
|
||||||
|
for i, (dep, head) in enumerate(zip(aligned_deps, aligned_heads)):
|
||||||
if head == i:
|
if head == i:
|
||||||
data["roots"].update([dep])
|
data["roots"].update([dep])
|
||||||
data["n_sents"] += 1
|
data["n_sents"] += 1
|
||||||
if nonproj.is_nonproj_tree(gold.heads):
|
if nonproj.is_nonproj_tree(aligned_heads):
|
||||||
data["n_nonproj"] += 1
|
data["n_nonproj"] += 1
|
||||||
if nonproj.contains_cycle(gold.heads):
|
if nonproj.contains_cycle(aligned_heads):
|
||||||
data["n_cycles"] += 1
|
data["n_cycles"] += 1
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@ -581,7 +578,7 @@ def _get_examples_without_label(data: Sequence[Example], label: str) -> int:
|
||||||
for eg in data:
|
for eg in data:
|
||||||
labels = [
|
labels = [
|
||||||
label.split("-")[1]
|
label.split("-")[1]
|
||||||
for label in eg.gold.ner
|
for label in eg.get_aligned_ner()
|
||||||
if label not in ("O", "-", None)
|
if label not in ("O", "-", None)
|
||||||
]
|
]
|
||||||
if label not in labels:
|
if label not in labels:
|
||||||
|
|
|
@ -132,6 +132,7 @@ class Warnings(object):
|
||||||
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
|
"are currently: da, de, el, en, id, lb, pt, ru, sr, ta, th.")
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
W092 = ("Ignoring annotations for sentence starts, as dependency heads are set.")
|
||||||
W093 = ("Could not find any data to train the {name} on. Is your "
|
W093 = ("Could not find any data to train the {name} on. Is your "
|
||||||
"input data correctly formatted ?")
|
"input data correctly formatted ?")
|
||||||
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
|
W094 = ("Model '{model}' ({model_version}) specifies an under-constrained "
|
||||||
|
@ -154,7 +155,7 @@ class Warnings(object):
|
||||||
"so a default configuration was used.")
|
"so a default configuration was used.")
|
||||||
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
|
W099 = ("Expected 'dict' type for the 'model' argument of pipe '{pipe}', "
|
||||||
"but got '{type}' instead, so ignoring it.")
|
"but got '{type}' instead, so ignoring it.")
|
||||||
W100 = ("Skipping unsupported morphological feature(s): {feature}. "
|
W100 = ("Skipping unsupported morphological feature(s): '{feature}'. "
|
||||||
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
"Provide features as a dict {{\"Field1\": \"Value1,Value2\"}} or "
|
||||||
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
"string \"Field1=Value1,Value2|Field2=Value3\".")
|
||||||
|
|
||||||
|
@ -182,18 +183,13 @@ class Errors(object):
|
||||||
"`nlp.select_pipes()`, you should remove them explicitly with "
|
"`nlp.select_pipes()`, you should remove them explicitly with "
|
||||||
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
"`nlp.remove_pipe()` before the pipeline is restored. Names of "
|
||||||
"the new components: {names}")
|
"the new components: {names}")
|
||||||
E009 = ("The `update` method expects same number of docs and golds, but "
|
|
||||||
"got: {n_docs} docs, {n_golds} golds.")
|
|
||||||
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
E010 = ("Word vectors set to length 0. This may be because you don't have "
|
||||||
"a model installed or loaded, or because your model doesn't "
|
"a model installed or loaded, or because your model doesn't "
|
||||||
"include word vectors. For more info, see the docs:\n"
|
"include word vectors. For more info, see the docs:\n"
|
||||||
"https://spacy.io/usage/models")
|
"https://spacy.io/usage/models")
|
||||||
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
E011 = ("Unknown operator: '{op}'. Options: {opts}")
|
||||||
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
E012 = ("Cannot add pattern for zero tokens to matcher.\nKey: {key}")
|
||||||
E013 = ("Error selecting action in matcher")
|
|
||||||
E014 = ("Unknown tag ID: {tag}")
|
E014 = ("Unknown tag ID: {tag}")
|
||||||
E015 = ("Conflicting morphology exception for ({tag}, {orth}). Use "
|
|
||||||
"`force=True` to overwrite.")
|
|
||||||
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
E016 = ("MultitaskObjective target should be function or one of: dep, "
|
||||||
"tag, ent, dep_tag_offset, ent_tag.")
|
"tag, ent, dep_tag_offset, ent_tag.")
|
||||||
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
|
E017 = ("Can only add unicode or bytes. Got type: {value_type}")
|
||||||
|
@ -201,21 +197,8 @@ class Errors(object):
|
||||||
"refers to an issue with the `Vocab` or `StringStore`.")
|
"refers to an issue with the `Vocab` or `StringStore`.")
|
||||||
E019 = ("Can't create transition with unknown action ID: {action}. Action "
|
E019 = ("Can't create transition with unknown action ID: {action}. Action "
|
||||||
"IDs are enumerated in spacy/syntax/{src}.pyx.")
|
"IDs are enumerated in spacy/syntax/{src}.pyx.")
|
||||||
E020 = ("Could not find a gold-standard action to supervise the "
|
|
||||||
"dependency parser. The tree is non-projective (i.e. it has "
|
|
||||||
"crossing arcs - see spacy/syntax/nonproj.pyx for definitions). "
|
|
||||||
"The ArcEager transition system only supports projective trees. "
|
|
||||||
"To learn non-projective representations, transform the data "
|
|
||||||
"before training and after parsing. Either pass "
|
|
||||||
"`make_projective=True` to the GoldParse class, or use "
|
|
||||||
"spacy.syntax.nonproj.preprocess_training_data.")
|
|
||||||
E021 = ("Could not find a gold-standard action to supervise the "
|
|
||||||
"dependency parser. The GoldParse was projective. The transition "
|
|
||||||
"system has {n_actions} actions. State at failure: {state}")
|
|
||||||
E022 = ("Could not find a transition with the name '{name}' in the NER "
|
E022 = ("Could not find a transition with the name '{name}' in the NER "
|
||||||
"model.")
|
"model.")
|
||||||
E023 = ("Error cleaning up beam: The same state occurred twice at "
|
|
||||||
"memory address {addr} and position {i}.")
|
|
||||||
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
|
E024 = ("Could not find an optimal move to supervise the parser. Usually, "
|
||||||
"this means that the model can't be updated in a way that's valid "
|
"this means that the model can't be updated in a way that's valid "
|
||||||
"and satisfies the correct annotations specified in the GoldParse. "
|
"and satisfies the correct annotations specified in the GoldParse. "
|
||||||
|
@ -259,7 +242,6 @@ class Errors(object):
|
||||||
"offset {start}.")
|
"offset {start}.")
|
||||||
E037 = ("Error calculating span: Can't find a token ending at character "
|
E037 = ("Error calculating span: Can't find a token ending at character "
|
||||||
"offset {end}.")
|
"offset {end}.")
|
||||||
E038 = ("Error finding sentence for span. Infinite loop detected.")
|
|
||||||
E039 = ("Array bounds exceeded while searching for root word. This likely "
|
E039 = ("Array bounds exceeded while searching for root word. This likely "
|
||||||
"means the parse tree is in an invalid state. Please report this "
|
"means the parse tree is in an invalid state. Please report this "
|
||||||
"issue here: http://github.com/explosion/spaCy/issues")
|
"issue here: http://github.com/explosion/spaCy/issues")
|
||||||
|
@ -290,8 +272,6 @@ class Errors(object):
|
||||||
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
E059 = ("One (and only one) keyword arg must be set. Got: {kwargs}")
|
||||||
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
E060 = ("Cannot add new key to vectors: the table is full. Current shape: "
|
||||||
"({rows}, {cols}).")
|
"({rows}, {cols}).")
|
||||||
E061 = ("Bad file name: {filename}. Example of a valid file name: "
|
|
||||||
"'vectors.128.f.bin'")
|
|
||||||
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
|
E062 = ("Cannot find empty bit for new lexical flag. All bits between 0 "
|
||||||
"and 63 are occupied. You can replace one by specifying the "
|
"and 63 are occupied. You can replace one by specifying the "
|
||||||
"`flag_id` explicitly, e.g. "
|
"`flag_id` explicitly, e.g. "
|
||||||
|
@ -305,39 +285,17 @@ class Errors(object):
|
||||||
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
|
"Query string: {string}\nOrth cached: {orth}\nOrth ID: {orth_id}")
|
||||||
E065 = ("Only one of the vector table's width and shape can be specified. "
|
E065 = ("Only one of the vector table's width and shape can be specified. "
|
||||||
"Got width {width} and shape {shape}.")
|
"Got width {width} and shape {shape}.")
|
||||||
E066 = ("Error creating model helper for extracting columns. Can only "
|
|
||||||
"extract columns by positive integer. Got: {value}.")
|
|
||||||
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
|
E067 = ("Invalid BILUO tag sequence: Got a tag starting with 'I' (inside "
|
||||||
"an entity) without a preceding 'B' (beginning of an entity). "
|
"an entity) without a preceding 'B' (beginning of an entity). "
|
||||||
"Tag sequence:\n{tags}")
|
"Tag sequence:\n{tags}")
|
||||||
E068 = ("Invalid BILUO tag: '{tag}'.")
|
E068 = ("Invalid BILUO tag: '{tag}'.")
|
||||||
E069 = ("Invalid gold-standard parse tree. Found cycle between word "
|
|
||||||
"IDs: {cycle} (tokens: {cycle_tokens}) in the document starting "
|
|
||||||
"with tokens: {doc_tokens}.")
|
|
||||||
E070 = ("Invalid gold-standard data. Number of documents ({n_docs}) "
|
|
||||||
"does not align with number of annotations ({n_annots}).")
|
|
||||||
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
|
E071 = ("Error creating lexeme: specified orth ID ({orth}) does not "
|
||||||
"match the one in the vocab ({vocab_orth}).")
|
"match the one in the vocab ({vocab_orth}).")
|
||||||
E072 = ("Error serializing lexeme: expected data length {length}, "
|
|
||||||
"got {bad_length}.")
|
|
||||||
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
|
E073 = ("Cannot assign vector of length {new_length}. Existing vectors "
|
||||||
"are of length {length}. You can use `vocab.reset_vectors` to "
|
"are of length {length}. You can use `vocab.reset_vectors` to "
|
||||||
"clear the existing vectors and resize the table.")
|
"clear the existing vectors and resize the table.")
|
||||||
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
E074 = ("Error interpreting compiled match pattern: patterns are expected "
|
||||||
"to end with the attribute {attr}. Got: {bad_attr}.")
|
"to end with the attribute {attr}. Got: {bad_attr}.")
|
||||||
E075 = ("Error accepting match: length ({length}) > maximum length "
|
|
||||||
"({max_len}).")
|
|
||||||
E076 = ("Error setting tensor on Doc: tensor has {rows} rows, while Doc "
|
|
||||||
"has {words} words.")
|
|
||||||
E077 = ("Error computing {value}: number of Docs ({n_docs}) does not "
|
|
||||||
"equal number of GoldParse objects ({n_golds}) in batch.")
|
|
||||||
E078 = ("Error computing score: number of words in Doc ({words_doc}) does "
|
|
||||||
"not equal number of words in GoldParse ({words_gold}).")
|
|
||||||
E079 = ("Error computing states in beam: number of predicted beams "
|
|
||||||
"({pbeams}) does not equal number of gold beams ({gbeams}).")
|
|
||||||
E080 = ("Duplicate state found in beam: {key}.")
|
|
||||||
E081 = ("Error getting gradient in beam: number of histories ({n_hist}) "
|
|
||||||
"does not equal number of losses ({losses}).")
|
|
||||||
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
E082 = ("Error deprojectivizing parse: number of heads ({n_heads}), "
|
||||||
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
"projective heads ({n_proj_heads}) and labels ({n_labels}) do not "
|
||||||
"match.")
|
"match.")
|
||||||
|
@ -345,8 +303,6 @@ class Errors(object):
|
||||||
"`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
|
"`getter` (plus optional `setter`) is allowed. Got: {nr_defined}")
|
||||||
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
|
E084 = ("Error assigning label ID {label} to span: not in StringStore.")
|
||||||
E085 = ("Can't create lexeme for string '{string}'.")
|
E085 = ("Can't create lexeme for string '{string}'.")
|
||||||
E086 = ("Error deserializing lexeme '{string}': orth ID {orth_id} does "
|
|
||||||
"not match hash {hash_id} in StringStore.")
|
|
||||||
E087 = ("Unknown displaCy style: {style}.")
|
E087 = ("Unknown displaCy style: {style}.")
|
||||||
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
E088 = ("Text of length {length} exceeds maximum of {max_length}. The "
|
||||||
"v2.x parser and NER models require roughly 1GB of temporary "
|
"v2.x parser and NER models require roughly 1GB of temporary "
|
||||||
|
@ -388,7 +344,6 @@ class Errors(object):
|
||||||
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
E103 = ("Trying to set conflicting doc.ents: '{span1}' and '{span2}'. A "
|
||||||
"token can only be part of one entity, so make sure the entities "
|
"token can only be part of one entity, so make sure the entities "
|
||||||
"you're setting don't overlap.")
|
"you're setting don't overlap.")
|
||||||
E104 = ("Can't find JSON schema for '{name}'.")
|
|
||||||
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
E105 = ("The Doc.print_tree() method is now deprecated. Please use "
|
||||||
"Doc.to_json() instead or write your own function.")
|
"Doc.to_json() instead or write your own function.")
|
||||||
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
E106 = ("Can't find doc._.{attr} attribute specified in the underscore "
|
||||||
|
@ -411,8 +366,6 @@ class Errors(object):
|
||||||
"practically no advantage over pickling the parent Doc directly. "
|
"practically no advantage over pickling the parent Doc directly. "
|
||||||
"So instead of pickling the span, pickle the Doc it belongs to or "
|
"So instead of pickling the span, pickle the Doc it belongs to or "
|
||||||
"use Span.as_doc to convert the span to a standalone Doc object.")
|
"use Span.as_doc to convert the span to a standalone Doc object.")
|
||||||
E113 = ("The newly split token can only have one root (head = 0).")
|
|
||||||
E114 = ("The newly split token needs to have a root (head = 0).")
|
|
||||||
E115 = ("All subtokens must have associated heads.")
|
E115 = ("All subtokens must have associated heads.")
|
||||||
E116 = ("Cannot currently add labels to pretrained text classifier. Add "
|
E116 = ("Cannot currently add labels to pretrained text classifier. Add "
|
||||||
"labels before training begins. This functionality was available "
|
"labels before training begins. This functionality was available "
|
||||||
|
@ -435,12 +388,9 @@ class Errors(object):
|
||||||
"equal to span length ({span_len}).")
|
"equal to span length ({span_len}).")
|
||||||
E122 = ("Cannot find token to be split. Did it get merged?")
|
E122 = ("Cannot find token to be split. Did it get merged?")
|
||||||
E123 = ("Cannot find head of token to be split. Did it get merged?")
|
E123 = ("Cannot find head of token to be split. Did it get merged?")
|
||||||
E124 = ("Cannot read from file: {path}. Supported formats: {formats}")
|
|
||||||
E125 = ("Unexpected value: {value}")
|
E125 = ("Unexpected value: {value}")
|
||||||
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
E126 = ("Unexpected matcher predicate: '{bad}'. Expected one of: {good}. "
|
||||||
"This is likely a bug in spaCy, so feel free to open an issue.")
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
E127 = ("Cannot create phrase pattern representation for length 0. This "
|
|
||||||
"is likely a bug in spaCy.")
|
|
||||||
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
E128 = ("Unsupported serialization argument: '{arg}'. The use of keyword "
|
||||||
"arguments to exclude fields from being serialized or deserialized "
|
"arguments to exclude fields from being serialized or deserialized "
|
||||||
"is now deprecated. Please use the `exclude` argument instead. "
|
"is now deprecated. Please use the `exclude` argument instead. "
|
||||||
|
@ -482,8 +432,6 @@ class Errors(object):
|
||||||
"provided {found}.")
|
"provided {found}.")
|
||||||
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
|
E143 = ("Labels for component '{name}' not initialized. Did you forget to "
|
||||||
"call add_label()?")
|
"call add_label()?")
|
||||||
E144 = ("Could not find parameter `{param}` when building the entity "
|
|
||||||
"linker model.")
|
|
||||||
E145 = ("Error reading `{param}` from input file.")
|
E145 = ("Error reading `{param}` from input file.")
|
||||||
E146 = ("Could not access `{path}`.")
|
E146 = ("Could not access `{path}`.")
|
||||||
E147 = ("Unexpected error in the {method} functionality of the "
|
E147 = ("Unexpected error in the {method} functionality of the "
|
||||||
|
@ -495,8 +443,6 @@ class Errors(object):
|
||||||
"the component matches the model being loaded.")
|
"the component matches the model being loaded.")
|
||||||
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
E150 = ("The language of the `nlp` object and the `vocab` should be the "
|
||||||
"same, but found '{nlp}' and '{vocab}' respectively.")
|
"same, but found '{nlp}' and '{vocab}' respectively.")
|
||||||
E151 = ("Trying to call nlp.update without required annotation types. "
|
|
||||||
"Expected top-level keys: {exp}. Got: {unexp}.")
|
|
||||||
E152 = ("The attribute {attr} is not supported for token patterns. "
|
E152 = ("The attribute {attr} is not supported for token patterns. "
|
||||||
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
"Please use the option validate=True with Matcher, PhraseMatcher, "
|
||||||
"or EntityRuler for more details.")
|
"or EntityRuler for more details.")
|
||||||
|
@ -533,11 +479,6 @@ class Errors(object):
|
||||||
"that case.")
|
"that case.")
|
||||||
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
E166 = ("Can only merge DocBins with the same pre-defined attributes.\n"
|
||||||
"Current DocBin: {current}\nOther DocBin: {other}")
|
"Current DocBin: {current}\nOther DocBin: {other}")
|
||||||
E167 = ("Unknown morphological feature: '{feat}' ({feat_id}). This can "
|
|
||||||
"happen if the tagger was trained with a different set of "
|
|
||||||
"morphological features. If you're using a pretrained model, make "
|
|
||||||
"sure that your models are up to date:\npython -m spacy validate")
|
|
||||||
E168 = ("Unknown field: {field}")
|
|
||||||
E169 = ("Can't find module: {module}")
|
E169 = ("Can't find module: {module}")
|
||||||
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
E170 = ("Cannot apply transition {name}: invalid for the current state.")
|
||||||
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
E171 = ("Matcher.add received invalid on_match callback argument: expected "
|
||||||
|
@ -548,8 +489,6 @@ class Errors(object):
|
||||||
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
E173 = ("As of v2.2, the Lemmatizer is initialized with an instance of "
|
||||||
"Lookups containing the lemmatization tables. See the docs for "
|
"Lookups containing the lemmatization tables. See the docs for "
|
||||||
"details: https://spacy.io/api/lemmatizer#init")
|
"details: https://spacy.io/api/lemmatizer#init")
|
||||||
E174 = ("Architecture '{name}' not found in registry. Available "
|
|
||||||
"names: {names}")
|
|
||||||
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
E175 = ("Can't remove rule for unknown match pattern ID: {key}")
|
||||||
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
E176 = ("Alias '{alias}' is not defined in the Knowledge Base.")
|
||||||
E177 = ("Ill-formed IOB input detected: {tag}")
|
E177 = ("Ill-formed IOB input detected: {tag}")
|
||||||
|
@ -597,10 +536,19 @@ class Errors(object):
|
||||||
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
E198 = ("Unable to return {n} most similar vectors for the current vectors "
|
||||||
"table, which contains {n_rows} vectors.")
|
"table, which contains {n_rows} vectors.")
|
||||||
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
E199 = ("Unable to merge 0-length span at doc[{start}:{end}].")
|
||||||
E200 = ("Specifying a base model with a pretrained component '{component}' "
|
|
||||||
"can not be combined with adding a pretrained Tok2Vec layer.")
|
|
||||||
|
|
||||||
# TODO: fix numbering after merging develop into master
|
# TODO: fix numbering after merging develop into master
|
||||||
|
E971 = ("Found incompatible lengths in Doc.from_array: {array_length} for the "
|
||||||
|
"array and {doc_length} for the Doc itself.")
|
||||||
|
E972 = ("Example.__init__ got None for '{arg}'. Requires Doc.")
|
||||||
|
E973 = ("Unexpected type for NER data")
|
||||||
|
E974 = ("Unknown {obj} attribute: {key}")
|
||||||
|
E975 = ("The method Example.from_dict expects a Doc as first argument, "
|
||||||
|
"but got {type}")
|
||||||
|
E976 = ("The method Example.from_dict expects a dict as second argument, "
|
||||||
|
"but received None.")
|
||||||
|
E977 = ("Can not compare a MorphAnalysis with a string object. "
|
||||||
|
"This is likely a bug in spaCy, so feel free to open an issue.")
|
||||||
E978 = ("The {method} method of component {name} takes a list of Example objects, "
|
E978 = ("The {method} method of component {name} takes a list of Example objects, "
|
||||||
"but found {types} instead.")
|
"but found {types} instead.")
|
||||||
E979 = ("Cannot convert {type} to an Example object.")
|
E979 = ("Cannot convert {type} to an Example object.")
|
||||||
|
@ -648,13 +596,8 @@ class Errors(object):
|
||||||
@add_codes
|
@add_codes
|
||||||
class TempErrors(object):
|
class TempErrors(object):
|
||||||
T003 = ("Resizing pretrained Tagger models is not currently supported.")
|
T003 = ("Resizing pretrained Tagger models is not currently supported.")
|
||||||
T004 = ("Currently parser depth is hard-coded to 1. Received: {value}.")
|
|
||||||
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
T007 = ("Can't yet set {attr} from Span. Vote for this feature on the "
|
||||||
"issue tracker: http://github.com/explosion/spaCy/issues")
|
"issue tracker: http://github.com/explosion/spaCy/issues")
|
||||||
T008 = ("Bad configuration of Tagger. This is probably a bug within "
|
|
||||||
"spaCy. We changed the name of an internal attribute for loading "
|
|
||||||
"pretrained vectors, and the class has been passed the old name "
|
|
||||||
"(pretrained_dims) but not the new name (pretrained_vectors).")
|
|
||||||
|
|
||||||
|
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
|
@ -45,7 +45,7 @@ class Corpus:
|
||||||
|
|
||||||
def make_examples(self, nlp, reference_docs, max_length=0):
|
def make_examples(self, nlp, reference_docs, max_length=0):
|
||||||
for reference in reference_docs:
|
for reference in reference_docs:
|
||||||
if max_length >= 1 and len(reference) >= max_length:
|
if len(reference) >= max_length >= 1:
|
||||||
if reference.is_sentenced:
|
if reference.is_sentenced:
|
||||||
for ref_sent in reference.sents:
|
for ref_sent in reference.sents:
|
||||||
yield Example(
|
yield Example(
|
||||||
|
|
|
@ -2,7 +2,6 @@ import warnings
|
||||||
|
|
||||||
import numpy
|
import numpy
|
||||||
|
|
||||||
from ..tokens import Token
|
|
||||||
from ..tokens.doc cimport Doc
|
from ..tokens.doc cimport Doc
|
||||||
from ..tokens.span cimport Span
|
from ..tokens.span cimport Span
|
||||||
from ..tokens.span import Span
|
from ..tokens.span import Span
|
||||||
|
@ -11,9 +10,8 @@ from .align cimport Alignment
|
||||||
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
from .iob_utils import biluo_to_iob, biluo_tags_from_offsets, biluo_tags_from_doc
|
||||||
from .iob_utils import spans_from_biluo_tags
|
from .iob_utils import spans_from_biluo_tags
|
||||||
from .align import Alignment
|
from .align import Alignment
|
||||||
from ..errors import Errors, AlignmentError
|
from ..errors import Errors, Warnings
|
||||||
from ..syntax import nonproj
|
from ..syntax import nonproj
|
||||||
from ..util import get_words_and_spaces
|
|
||||||
|
|
||||||
|
|
||||||
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||||
|
@ -32,11 +30,10 @@ cpdef Doc annotations2doc(vocab, tok_annot, doc_annot):
|
||||||
cdef class Example:
|
cdef class Example:
|
||||||
def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
|
def __init__(self, Doc predicted, Doc reference, *, Alignment alignment=None):
|
||||||
""" Doc can either be text, or an actual Doc """
|
""" Doc can either be text, or an actual Doc """
|
||||||
msg = "Example.__init__ got None for '{arg}'. Requires Doc."
|
|
||||||
if predicted is None:
|
if predicted is None:
|
||||||
raise TypeError(msg.format(arg="predicted"))
|
raise TypeError(Errors.E972.format(arg="predicted"))
|
||||||
if reference is None:
|
if reference is None:
|
||||||
raise TypeError(msg.format(arg="reference"))
|
raise TypeError(Errors.E972.format(arg="reference"))
|
||||||
self.x = predicted
|
self.x = predicted
|
||||||
self.y = reference
|
self.y = reference
|
||||||
self._alignment = alignment
|
self._alignment = alignment
|
||||||
|
@ -64,9 +61,9 @@ cdef class Example:
|
||||||
@classmethod
|
@classmethod
|
||||||
def from_dict(cls, Doc predicted, dict example_dict):
|
def from_dict(cls, Doc predicted, dict example_dict):
|
||||||
if example_dict is None:
|
if example_dict is None:
|
||||||
raise ValueError("Example.from_dict expected dict, received None")
|
raise ValueError(Errors.E976)
|
||||||
if not isinstance(predicted, Doc):
|
if not isinstance(predicted, Doc):
|
||||||
raise TypeError(f"Argument 1 should be Doc. Got {type(predicted)}")
|
raise TypeError(Errors.E975.format(type=type(predicted)))
|
||||||
example_dict = _fix_legacy_dict_data(example_dict)
|
example_dict = _fix_legacy_dict_data(example_dict)
|
||||||
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
tok_dict, doc_dict = _parse_example_dict_data(example_dict)
|
||||||
if "ORTH" not in tok_dict:
|
if "ORTH" not in tok_dict:
|
||||||
|
@ -118,6 +115,7 @@ cdef class Example:
|
||||||
aligned_deps = [None] * self.x.length
|
aligned_deps = [None] * self.x.length
|
||||||
heads = [token.head.i for token in self.y]
|
heads = [token.head.i for token in self.y]
|
||||||
deps = [token.dep_ for token in self.y]
|
deps = [token.dep_ for token in self.y]
|
||||||
|
if projectivize:
|
||||||
heads, deps = nonproj.projectivize(heads, deps)
|
heads, deps = nonproj.projectivize(heads, deps)
|
||||||
for cand_i in range(self.x.length):
|
for cand_i in range(self.x.length):
|
||||||
gold_i = cand_to_gold[cand_i]
|
gold_i = cand_to_gold[cand_i]
|
||||||
|
@ -245,11 +243,11 @@ def _annot2array(vocab, tok_annot, doc_annot):
|
||||||
elif key == "cats":
|
elif key == "cats":
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unknown doc attribute: {key}")
|
raise ValueError(Errors.E974.format(obj="doc", key=key))
|
||||||
|
|
||||||
for key, value in tok_annot.items():
|
for key, value in tok_annot.items():
|
||||||
if key not in IDS:
|
if key not in IDS:
|
||||||
raise ValueError(f"Unknown token attribute: {key}")
|
raise ValueError(Errors.E974.format(obj="token", key=key))
|
||||||
elif key in ["ORTH", "SPACY"]:
|
elif key in ["ORTH", "SPACY"]:
|
||||||
pass
|
pass
|
||||||
elif key == "HEAD":
|
elif key == "HEAD":
|
||||||
|
@ -289,7 +287,7 @@ def _add_entities_to_doc(doc, ner_data):
|
||||||
doc.ents = ner_data
|
doc.ents = ner_data
|
||||||
doc.ents = [span for span in ner_data if span.label_]
|
doc.ents = [span for span in ner_data if span.label_]
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unexpected type for NER data")
|
raise ValueError(Errors.E973)
|
||||||
|
|
||||||
|
|
||||||
def _parse_example_dict_data(example_dict):
|
def _parse_example_dict_data(example_dict):
|
||||||
|
@ -341,7 +339,7 @@ def _fix_legacy_dict_data(example_dict):
|
||||||
if "HEAD" in token_dict and "SENT_START" in token_dict:
|
if "HEAD" in token_dict and "SENT_START" in token_dict:
|
||||||
# If heads are set, we don't also redundantly specify SENT_START.
|
# If heads are set, we don't also redundantly specify SENT_START.
|
||||||
token_dict.pop("SENT_START")
|
token_dict.pop("SENT_START")
|
||||||
warnings.warn("Ignoring annotations for sentence starts, as dependency heads are set")
|
warnings.warn(Warnings.W092)
|
||||||
return {
|
return {
|
||||||
"token_annotation": token_dict,
|
"token_annotation": token_dict,
|
||||||
"doc_annotation": doc_dict
|
"doc_annotation": doc_dict
|
||||||
|
|
|
@ -145,7 +145,7 @@ def json_to_annotations(doc):
|
||||||
example["doc_annotation"] = dict(
|
example["doc_annotation"] = dict(
|
||||||
cats=cats,
|
cats=cats,
|
||||||
entities=ner_tags,
|
entities=ner_tags,
|
||||||
links=paragraph.get("links", []) # TODO: fix/test
|
links=paragraph.get("links", [])
|
||||||
)
|
)
|
||||||
yield example
|
yield example
|
||||||
|
|
||||||
|
|
|
@ -107,9 +107,9 @@ cdef class Morphology:
|
||||||
Returns the hash of the new analysis.
|
Returns the hash of the new analysis.
|
||||||
"""
|
"""
|
||||||
cdef MorphAnalysisC* tag_ptr
|
cdef MorphAnalysisC* tag_ptr
|
||||||
|
if isinstance(features, str):
|
||||||
if features == self.EMPTY_MORPH:
|
if features == self.EMPTY_MORPH:
|
||||||
features = ""
|
features = ""
|
||||||
if isinstance(features, str):
|
|
||||||
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
|
tag_ptr = <MorphAnalysisC*>self.tags.get(<hash_t>self.strings[features])
|
||||||
if tag_ptr != NULL:
|
if tag_ptr != NULL:
|
||||||
return tag_ptr.key
|
return tag_ptr.key
|
||||||
|
|
|
@ -70,7 +70,7 @@ class SimpleNER(Pipe):
|
||||||
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
def update(self, examples, set_annotations=False, drop=0.0, sgd=None, losses=None):
|
||||||
if not any(_has_ner(eg) for eg in examples):
|
if not any(_has_ner(eg) for eg in examples):
|
||||||
return 0
|
return 0
|
||||||
docs = [eg.doc for eg in examples]
|
docs = [eg.predicted for eg in examples]
|
||||||
set_dropout_rate(self.model, drop)
|
set_dropout_rate(self.model, drop)
|
||||||
scores, bp_scores = self.model.begin_update(docs)
|
scores, bp_scores = self.model.begin_update(docs)
|
||||||
loss, d_scores = self.get_loss(examples, scores)
|
loss, d_scores = self.get_loss(examples, scores)
|
||||||
|
@ -89,7 +89,8 @@ class SimpleNER(Pipe):
|
||||||
d_scores = []
|
d_scores = []
|
||||||
truths = []
|
truths = []
|
||||||
for eg in examples:
|
for eg in examples:
|
||||||
gold_tags = [(tag if tag != "-" else None) for tag in eg.gold.ner]
|
tags = eg.get_aligned("TAG", as_string=True)
|
||||||
|
gold_tags = [(tag if tag != "-" else None) for tag in tags]
|
||||||
if not self.is_biluo:
|
if not self.is_biluo:
|
||||||
gold_tags = biluo_to_iob(gold_tags)
|
gold_tags = biluo_to_iob(gold_tags)
|
||||||
truths.append(gold_tags)
|
truths.append(gold_tags)
|
||||||
|
@ -128,8 +129,8 @@ class SimpleNER(Pipe):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def _has_ner(eg):
|
def _has_ner(example):
|
||||||
for ner_tag in eg.gold.ner:
|
for ner_tag in example.get_aligned_ner():
|
||||||
if ner_tag != "-" and ner_tag is not None:
|
if ner_tag != "-" and ner_tag is not None:
|
||||||
return True
|
return True
|
||||||
else:
|
else:
|
||||||
|
|
|
@ -9,7 +9,7 @@ from ..attrs import SPACY, ORTH, intify_attr
|
||||||
from ..errors import Errors
|
from ..errors import Errors
|
||||||
|
|
||||||
|
|
||||||
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "LEMMA", "MORPH")
|
ALL_ATTRS = ("ORTH", "TAG", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE", "ENT_KB_ID", "LEMMA", "MORPH")
|
||||||
|
|
||||||
|
|
||||||
class DocBin(object):
|
class DocBin(object):
|
||||||
|
|
|
@ -816,7 +816,7 @@ cdef class Doc:
|
||||||
cdef TokenC* tokens = self.c
|
cdef TokenC* tokens = self.c
|
||||||
cdef int length = len(array)
|
cdef int length = len(array)
|
||||||
if length != len(self):
|
if length != len(self):
|
||||||
raise ValueError("Cannot set array values longer than the document.")
|
raise ValueError(Errors.E971.format(array_length=length, doc_length=len(self)))
|
||||||
|
|
||||||
# Get set up for fast loading
|
# Get set up for fast loading
|
||||||
cdef Pool mem = Pool()
|
cdef Pool mem = Pool()
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
from libc.string cimport memset
|
from libc.string cimport memset
|
||||||
cimport numpy as np
|
cimport numpy as np
|
||||||
|
|
||||||
|
from ..errors import Errors
|
||||||
from ..vocab cimport Vocab
|
from ..vocab cimport Vocab
|
||||||
from ..typedefs cimport hash_t, attr_t
|
from ..typedefs cimport hash_t, attr_t
|
||||||
from ..morphology cimport list_features, check_feature, get_by_field
|
from ..morphology cimport list_features, check_feature, get_by_field
|
||||||
|
@ -49,6 +50,8 @@ cdef class MorphAnalysis:
|
||||||
return self.key
|
return self.key
|
||||||
|
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
|
if isinstance(other, str):
|
||||||
|
raise ValueError(Errors.E977)
|
||||||
return self.key == other.key
|
return self.key == other.key
|
||||||
|
|
||||||
def __ne__(self, other):
|
def __ne__(self, other):
|
||||||
|
|
Loading…
Reference in New Issue
Block a user