Fix training/ partially.

Raphael Mitsch 2023-07-03 11:32:31 +02:00
parent 77e9e4ddad
commit 50dac51dc8
2 changed files with 25 additions and 8 deletions

View File

@@ -589,6 +589,7 @@ def _fix_legacy_dict_data(example_dict):
         "doc_annotation": doc_dict
     }
 
+
 def _has_field(annot, field):
     if field not in annot:
         return False
@@ -625,6 +626,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
                 ent_types.append("")
     return ent_iobs, ent_types
 
+
 def _parse_links(vocab, words, spaces, links):
     reference = Doc(vocab, words=words, spaces=spaces)
     starts = {token.idx: token.i for token in reference}
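Note: `_parse_links` resolves character-offset links against a reference tokenization. A minimal sketch of the offset-to-token-index mapping it builds (the sample words are assumed, not from this diff):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# Rebuild the tokenization the link offsets were annotated against.
words = ["Byron", "lived", "here"]
spaces = [True, True, False]
reference = Doc(Vocab(), words=words, spaces=spaces)

# Map each token's start character offset to its token index, so a link
# keyed on (start_char, end_char) can be aligned to token boundaries.
starts = {token.idx: token.i for token in reference}
assert starts == {0: 0, 6: 1, 12: 2}
```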

View File

@@ -1,4 +1,3 @@
-import json
 import warnings
 
 import srsly
@@ -6,7 +5,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags
 
 
 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
@@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
     json_doc = {"id": doc_id, "paragraphs": []}
     for i, doc in enumerate(docs):
         raw = None if doc.has_unknown_spaces else doc.text
-        json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
+        json_para = {
+            'raw': raw,
+            "sentences": [],
+            "cats": [],
+            "entities": [],
+            "links": []
+        }
         for cat, val in doc.cats.items():
             json_cat = {"label": cat, "value": val}
             json_para["cats"].append(json_cat)
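For reference, a sketch of how one populated paragraph entry comes out of `docs_to_json` (assuming the function is exposed as `spacy.training.docs_to_json`, as in spaCy v3; the sample text is made up):

```python
import spacy
from spacy.training import docs_to_json

nlp = spacy.blank("en")
doc = nlp("Berlin is nice.")
doc[0].is_sent_start = True  # blank pipelines set no sentence boundaries
doc.cats = {"POSITIVE": 1.0}

json_doc = docs_to_json([doc], doc_id=0)
para = json_doc["paragraphs"][0]
# Each paragraph carries the raw text plus parallel annotation lists.
assert para["raw"] == "Berlin is nice."
assert para["cats"] == [{"label": "POSITIVE", "value": 1.0}]
assert para["entities"] == [] and para["links"] == []
```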
@@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
             if ent.kb_id_:
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
-        biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
+        biluo_tags = offsets_to_biluo_tags(
+            doc, json_para["entities"], missing=ner_missing_tag
+        )
         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):
             json_sent = {"tokens": [], "brackets": []}
             for token in sent:
-                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
+                json_token = {
+                    "id": token.i, "orth": token.text, "space": token.whitespace_
+                }
                 if include_annotation["TAG"]:
                     json_token["tag"] = token.tag_
                 if include_annotation["POS"]:
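The reformatted `offsets_to_biluo_tags` call above converts the character-offset entities collected earlier into one BILUO tag per token; a small usage sketch (sample text assumed):

```python
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I like London")

# Character-offset entities -> per-token BILUO tags; "London" spans
# exactly one token, so it becomes a U- (unit) tag.
tags = offsets_to_biluo_tags(doc, [(7, 13, "LOC")], missing="O")
assert tags == ["O", "O", "U-LOC"]
```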
@@ -125,9 +134,14 @@ def json_to_annotations(doc):
                 else:
                     sent_starts.append(-1)
             if "brackets" in sent:
-                brackets.extend((b["first"] + sent_start_i,
-                                 b["last"] + sent_start_i, b["label"])
-                                for b in sent["brackets"])
+                brackets.extend(
+                    (
+                        b["first"] + sent_start_i,
+                        b["last"] + sent_start_i,
+                        b["label"]
+                    )
+                    for b in sent["brackets"]
+                )
 
     example["token_annotation"] = dict(
         ids=ids,
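The rewritten `brackets.extend(...)` above shifts sentence-local bracket indices to document-level token indices; the behavior, sketched with made-up data:

```python
# One sentence entry in the legacy JSON format: brackets are
# token-indexed relative to the sentence, not the document.
sent = {"brackets": [{"first": 0, "last": 1, "label": "NP"}]}
sent_start_i = 5  # document-level index of the sentence's first token

brackets = []
brackets.extend(
    (b["first"] + sent_start_i, b["last"] + sent_start_i, b["label"])
    for b in sent["brackets"]
)
assert brackets == [(5, 6, "NP")]
```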
@@ -160,6 +174,7 @@ def json_to_annotations(doc):
         )
         yield example
 
+
 def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
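The comment explains the design constraint: the corpus is one large JSON array rather than JSONL, so the parser walks it incrementally instead of loading everything at once. A simplified sketch of that idea (the real Cython implementation also handles braces inside strings, which this toy version ignores):

```python
import json

def iter_docs(utf8_str: str):
    # Scan for top-level {...} objects by brace depth and parse each
    # one separately, so only one doc is materialized at a time.
    depth = 0
    start = 0
    for i, ch in enumerate(utf8_str):
        if ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                yield json.loads(utf8_str[start : i + 1])

assert list(iter_docs('[{"id": 0}, {"id": 1}]')) == [{"id": 0}, {"id": 1}]
```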