Fix training/ partially.

Raphael Mitsch 2023-07-03 11:32:31 +02:00
parent 77e9e4ddad
commit 50dac51dc8
2 changed files with 25 additions and 8 deletions

View File

@@ -589,6 +589,7 @@ def _fix_legacy_dict_data(example_dict):
         "doc_annotation": doc_dict
     }
 
+
 def _has_field(annot, field):
     if field not in annot:
         return False
@@ -625,6 +626,7 @@ def _parse_ner_tags(biluo_or_offsets, vocab, words, spaces):
                 ent_types.append("")
     return ent_iobs, ent_types
 
+
 def _parse_links(vocab, words, spaces, links):
     reference = Doc(vocab, words=words, spaces=spaces)
     starts = {token.idx: token.i for token in reference}
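Note: `_parse_links` resolves character-offset links against a reference tokenization. A minimal sketch of the offset-to-token-index mapping it builds (the sample words are assumed, not from this diff):

```python
from spacy.tokens import Doc
from spacy.vocab import Vocab

# Rebuild the tokenization the link offsets were annotated against.
words = ["Byron", "lived", "here"]
spaces = [True, True, False]
reference = Doc(Vocab(), words=words, spaces=spaces)

# Map each token's start character offset to its token index, so a link
# keyed on (start_char, end_char) can be aligned to token boundaries.
starts = {token.idx: token.i for token in reference}
assert starts == {0: 0, 6: 1, 12: 2}
```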

View File

@@ -1,4 +1,3 @@
-import json
 import warnings
 
 import srsly
@@ -6,7 +5,7 @@ import srsly
 from .. import util
 from ..errors import Warnings
 from ..tokens import Doc
-from .iob_utils import offsets_to_biluo_tags, tags_to_entities
+from .iob_utils import offsets_to_biluo_tags
 
 
 def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
@@ -23,7 +22,13 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
     json_doc = {"id": doc_id, "paragraphs": []}
     for i, doc in enumerate(docs):
         raw = None if doc.has_unknown_spaces else doc.text
-        json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
+        json_para = {
+            'raw': raw,
+            "sentences": [],
+            "cats": [],
+            "entities": [],
+            "links": []
+        }
         for cat, val in doc.cats.items():
             json_cat = {"label": cat, "value": val}
             json_para["cats"].append(json_cat)
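For reference, a sketch of how one populated paragraph entry comes out of `docs_to_json` (assuming the function is exposed as `spacy.training.docs_to_json`, as in spaCy v3; the sample text is made up):

```python
import spacy
from spacy.training import docs_to_json

nlp = spacy.blank("en")
doc = nlp("Berlin is nice.")
doc[0].is_sent_start = True  # blank pipelines set no sentence boundaries
doc.cats = {"POSITIVE": 1.0}

json_doc = docs_to_json([doc], doc_id=0)
para = json_doc["paragraphs"][0]
# Each paragraph carries the raw text plus parallel annotation lists.
assert para["raw"] == "Berlin is nice."
assert para["cats"] == [{"label": "POSITIVE", "value": 1.0}]
assert para["entities"] == [] and para["links"] == []
```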
@@ -35,13 +40,17 @@ def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
             if ent.kb_id_:
                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                 json_para["links"].append(link_dict)
-        biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
+        biluo_tags = offsets_to_biluo_tags(
+            doc, json_para["entities"], missing=ner_missing_tag
+        )
         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
         for j, sent in enumerate(doc.sents):
             json_sent = {"tokens": [], "brackets": []}
             for token in sent:
-                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
+                json_token = {
+                    "id": token.i, "orth": token.text, "space": token.whitespace_
+                }
                 if include_annotation["TAG"]:
                     json_token["tag"] = token.tag_
                 if include_annotation["POS"]:
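The reformatted `offsets_to_biluo_tags` call above converts the character-offset entities collected earlier into one BILUO tag per token; a small usage sketch (sample text assumed):

```python
import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp("I like London")

# Character-offset entities -> per-token BILUO tags; "London" spans
# exactly one token, so it becomes a U- (unit) tag.
tags = offsets_to_biluo_tags(doc, [(7, 13, "LOC")], missing="O")
assert tags == ["O", "O", "U-LOC"]
```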
@@ -125,9 +134,14 @@ def json_to_annotations(doc):
                 else:
                     sent_starts.append(-1)
             if "brackets" in sent:
-                brackets.extend((b["first"] + sent_start_i,
-                                 b["last"] + sent_start_i, b["label"])
-                                for b in sent["brackets"])
+                brackets.extend(
+                    (
+                        b["first"] + sent_start_i,
+                        b["last"] + sent_start_i,
+                        b["label"]
+                    )
+                    for b in sent["brackets"]
+                )
 
     example["token_annotation"] = dict(
         ids=ids,
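The rewritten `brackets.extend(...)` above shifts sentence-local bracket indices to document-level token indices; the behavior, sketched with made-up data:

```python
# One sentence entry in the legacy JSON format: brackets are
# token-indexed relative to the sentence, not the document.
sent = {"brackets": [{"first": 0, "last": 1, "label": "NP"}]}
sent_start_i = 5  # document-level index of the sentence's first token

brackets = []
brackets.extend(
    (b["first"] + sent_start_i, b["last"] + sent_start_i, b["label"])
    for b in sent["brackets"]
)
assert brackets == [(5, 6, "NP")]
```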
@@ -160,6 +174,7 @@ def json_to_annotations(doc):
         )
         yield example
 
+
 def json_iterate(bytes utf8_str):
     # We should've made these files jsonl...But since we didn't, parse out
     # the docs one-by-one to reduce memory usage.
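The comment explains the design constraint: the corpus is one large JSON array rather than JSONL, so the parser walks it incrementally instead of loading everything at once. A simplified sketch of that idea (the real Cython implementation also handles braces inside strings, which this toy version ignores):

```python
import json

def iter_docs(utf8_str: str):
    # Scan for top-level {...} objects by brace depth and parse each
    # one separately, so only one doc is materialized at a time.
    depth = 0
    start = 0
    for i, ch in enumerate(utf8_str):
        if ch == "{":
            if depth == 0:
                start = i
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                yield json.loads(utf8_str[start : i + 1])

assert list(iter_docs('[{"id": 0}, {"id": 1}]')) == [{"id": 0}, {"id": 1}]
```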