Add gold_io.pyx

2025-10-30 23:47:31 +03:00 · 2020-06-06 14:41:49 +02:00 · 2020-06-06 14:41:49 +02:00 · 32c8fb1372
commit 32c8fb1372
parent 156466ca69
1 changed files with 202 additions and 0 deletions
--- a/spacy/_gold/gold_io.pyx
+++ b/spacy/_gold/gold_io.pyx
@ -0,0 +1,202 @@
+import warnings
+import srsly
+from .. import util
+from ..errors import Warnings
+from ..tokens import Token, Doc
+from .example import Example
+from .iob_utils import biluo_tags_from_offsets
+
+
+def merge_sents(sents):
+    """Merge several sentence-level annotation entries into a single entry.
+
+    sents (iterable): Items shaped like
+        ((ids, words, tags, heads, labels, ner), (cats, brackets)), where each
+        bracket is a mapping with "first", "last" and "label" keys.
+    RETURNS (list): A one-element list holding
+        ([ids, words, tags, heads, labels, ner], (cats, brackets)). Ids, heads
+        and bracket boundaries are shifted by the number of ids merged before
+        their sentence; cats from later sentences overwrite earlier ones that
+        share a key.
+    """
+    # One accumulator per annotation column, in the order
+    # ids, words, tags, heads, labels, ner.
+    columns = [[] for _ in range(6)]
+    merged_cats = {}
+    merged_brackets = []
+    offset = 0
+    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
+        # Only ids and heads are positional, so only they get the offset.
+        shifted = (
+            [id_ + offset for id_ in ids],
+            words,
+            tags,
+            [head + offset for head in heads],
+            labels,
+            ner,
+        )
+        for column, values in zip(columns, shifted):
+            column.extend(values)
+        for bracket in brackets:
+            merged_brackets.append(
+                (bracket["first"] + offset, bracket["last"] + offset, bracket["label"])
+            )
+        merged_cats.update(cats)
+        offset += len(ids)
+    return [(columns, (merged_cats, merged_brackets))]
+
+
+def docs_to_json(docs, id=0, ner_missing_tag="O"):
+    """Convert a list of Doc objects into the JSON-serializable format used by
+    the spacy train command.
+
+    docs (iterable / Doc): The Doc object(s) to convert. A single Doc is
+        wrapped in a list first.
+    id (int): Id for the JSON.
+    ner_missing_tag (str): Forwarded as `missing` to biluo_tags_from_offsets
+        when the per-token "ner" values are computed.
+    RETURNS (dict): The data in spaCy's JSON format
+        - each input doc will be treated as a paragraph in the output doc
+    """
+    if isinstance(docs, Doc):
+        docs = [docs]
+    paragraphs = []
+    result = {"id": id, "paragraphs": paragraphs}
+    for doc in docs:
+        paragraph = {
+            "raw": doc.text,
+            "sentences": [],
+            "cats": [
+                {"label": label, "value": value}
+                for label, value in doc.cats.items()
+            ],
+        }
+        offsets = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
+        ner_tags = biluo_tags_from_offsets(doc, offsets, missing=ner_missing_tag)
+        for sent in doc.sents:
+            tokens = []
+            for token in sent:
+                entry = {"id": token.i, "orth": token.text}
+                if doc.is_tagged:
+                    entry.update(
+                        tag=token.tag_,
+                        pos=token.pos_,
+                        morph=token.morph_,
+                        lemma=token.lemma_,
+                    )
+                if doc.is_parsed:
+                    # The head is stored relative to the token's own index.
+                    entry.update(head=token.head.i - token.i, dep=token.dep_)
+                entry["ner"] = ner_tags[token.i]
+                tokens.append(entry)
+            paragraph["sentences"].append({"tokens": tokens, "brackets": []})
+        paragraphs.append(paragraph)
+    return result
+
+
+def json_to_examples(doc):
+    """Turn one JSON-formatted training document into Example objects.
+
+    doc (dict): One entry in the training data; its "paragraphs" list is read.
+    YIELDS (Example): The reformatted data - one training example per paragraph
+    """
+    for paragraph in doc["paragraphs"]:
+        example = Example(doc=paragraph.get("raw", None))
+        ids, words, tags, pos = [], [], [], []
+        morphs, lemmas, heads, deps = [], [], [], []
+        entities, sent_starts, brackets = [], [], []
+        for sent in paragraph["sentences"]:
+            # Number of tokens collected so far in this paragraph.
+            offset = len(words)
+            for index, token in enumerate(sent["tokens"]):
+                position = offset + index
+                words.append(token["orth"])
+                ids.append(token.get("id", position))
+                tags.append(token.get("tag", "-"))
+                pos.append(token.get("pos", ""))
+                morphs.append(token.get("morph", ""))
+                lemmas.append(token.get("lemma", ""))
+                # Stored heads are relative to the token; make them
+                # paragraph-level indices.
+                heads.append(token.get("head", 0) + position)
+                dep = token.get("dep", "")
+                # Ensure ROOT label is case-insensitive
+                deps.append("ROOT" if dep.lower() == "root" else dep)
+                entities.append(token.get("ner", "-"))
+                sent_starts.append(1 if index == 0 else 0)
+            if "brackets" in sent:
+                for bracket in sent["brackets"]:
+                    brackets.append(
+                        (
+                            bracket["first"] + offset,
+                            bracket["last"] + offset,
+                            bracket["label"],
+                        )
+                    )
+        cats = {cat["label"]: cat["value"] for cat in paragraph.get("cats", {})}
+        example.set_token_annotation(
+            ids=ids,
+            words=words,
+            tags=tags,
+            pos=pos,
+            morphs=morphs,
+            lemmas=lemmas,
+            heads=heads,
+            deps=deps,
+            entities=entities,
+            sent_starts=sent_starts,
+            brackets=brackets,
+        )
+        example.set_doc_annotation(cats=cats)
+        yield example
+
+
+def read_json_file(loc, docs_filter=None, limit=None):
+    """Read training examples from a JSON file, or from every entry of a
+    directory (recursively).
+
+    loc: Location of a file or directory; passed through util.ensure_path.
+    docs_filter (callable): Optional predicate called with each parsed JSON
+        doc. Docs for which it returns a falsy value are skipped. It is also
+        applied to files found inside directories.
+    limit: Not applied by this function; it is only forwarded to the
+        recursive calls.
+    YIELDS (Example): The examples produced by json_to_examples for each doc
+        that passes the filter.
+    """
+    loc = util.ensure_path(loc)
+    if loc.is_dir():
+        # Path.iterdir() yields entries that already include `loc`, so they
+        # are passed on unchanged; joining them onto `loc` again would
+        # duplicate the directory part of a relative path.
+        for entry in loc.iterdir():
+            yield from read_json_file(entry, docs_filter=docs_filter, limit=limit)
+    else:
+        for doc in json_iterate(loc):
+            if docs_filter is not None and not docs_filter(doc):
+                continue
+            yield from json_to_examples(doc)
+
+
+def read_json_object(json_corpus_section):
+    """Convert already-loaded JSON training documents into examples.
+
+    json_corpus_section (list): The JSON-formatted documents, e.g. taken from
+        a training data file that was loaded earlier.
+    YIELDS (Example): The reformatted data - one training example per paragraph
+    """
+    for json_doc in json_corpus_section:
+        yield from json_to_examples(json_doc)
+
+
+def json_iterate(loc):
+    """Yield the documents of a JSON training file one at a time.
+
+    The file is read fully as bytes and then scanned byte by byte. Every
+    `{...}` object that opens while the scanner is inside exactly one level
+    of square brackets and no enclosing curly braces is decoded as UTF-8 and
+    parsed with srsly.json_loads.
+
+    loc: Location of the file; passed through util.ensure_path.
+    YIELDS: The parsed value of each such object.
+    """
+    # We should've made these files jsonl...But since we didn't, parse out
+    # the docs one-by-one to reduce memory usage.
+    # It's okay to read in the whole file -- just don't parse it into JSON.
+    cdef bytes py_raw
+    loc = util.ensure_path(loc)
+    with loc.open("rb") as file_:
+        py_raw = file_.read()
+    cdef long file_length = len(py_raw)
+    # Warn (W027) when the file is larger than 2 ** 30 bytes.
+    if file_length > 2 ** 30:
+        warnings.warn(Warnings.W027.format(size=file_length))
+
+    # NOTE(review): `raw` has no cdef declaration in this function, so it is
+    # presumably held as a Python object rather than a C pointer -- confirm.
+    raw = <char*>py_raw
+    # Scanner state: bracket/brace nesting depth, whether the current byte is
+    # inside a double-quoted string, and whether the previous byte was a
+    # backslash.
+    cdef int square_depth = 0
+    cdef int curly_depth = 0
+    cdef int inside_string = 0
+    cdef int escape = 0
+    # Byte offset where the object currently being collected opened; -1 when
+    # no object is open.
+    cdef long start = -1
+    cdef char c
+    cdef char quote = ord('"')
+    cdef char backslash = ord("\\")
+    cdef char open_square = ord("[")
+    cdef char close_square = ord("]")
+    cdef char open_curly = ord("{")
+    cdef char close_curly = ord("}")
+    for i in range(file_length):
+        c = raw[i]
+        # The byte following a backslash is skipped without being examined,
+        # so an escaped quote does not toggle the string state.
+        if escape:
+            escape = False
+            continue
+        if c == backslash:
+            escape = True
+            continue
+        if c == quote:
+            inside_string = not inside_string
+            continue
+        # Brackets and braces inside strings must not affect the depths.
+        if inside_string:
+            continue
+        if c == open_square:
+            square_depth += 1
+        elif c == close_square:
+            square_depth -= 1
+        elif c == open_curly:
+            # Remember where an object opens when it is not nested in another
+            # object and sits one square-bracket level deep.
+            if square_depth == 1 and curly_depth == 0:
+                start = i
+            curly_depth += 1
+        elif c == close_curly:
+            curly_depth -= 1
+            # The object opened at `start` is complete: parse just that slice.
+            if square_depth == 1 and curly_depth == 0:
+                py_str = py_raw[start : i + 1].decode("utf8")
+                try:
+                    yield srsly.json_loads(py_str)
+                except Exception:
+                    # Print the offending text before re-raising.
+                    print(py_str)
+                    raise
+                start = -1