spaCy/spacy/gold/gold_io.pyx

import warnings
import srsly
from .. import util
from ..errors import Warnings
from ..tokens import Token, Doc
from .iob_utils import biluo_tags_from_offsets


def merge_sents(sents):
    m_deps = [[], [], [], [], [], []]
    m_cats = {}
    m_brackets = []
    i = 0
    for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:
        m_deps[0].extend(id_ + i for id_ in ids)
        m_deps[1].extend(words)
        m_deps[2].extend(tags)
        m_deps[3].extend(head + i for head in heads)
        m_deps[4].extend(labels)
        m_deps[5].extend(ner)
        m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])
                          for b in brackets)
        m_cats.update(cats)
        i += len(ids)
    return [(m_deps, (m_cats, m_brackets))]


def docs_to_json(docs, id=0, ner_missing_tag="O"):
    """Convert a list of Doc objects into the JSON-serializable format used by
    the spacy train command.

    docs (iterable / Doc): The Doc object(s) to convert.
    id (int): Id for the JSON.
    RETURNS (dict): The data in spaCy's JSON format
        - each input doc will be treated as a paragraph in the output doc
    """
    if isinstance(docs, Doc):
        docs = [docs]
    json_doc = {"id": id, "paragraphs": []}
    for i, doc in enumerate(docs):
        json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}
        for cat, val in doc.cats.items():
            json_cat = {"label": cat, "value": val}
            json_para["cats"].append(json_cat)
        for ent in doc.ents:
            ent_tuple = (ent.start_char, ent.end_char, ent.label_)
            json_para["entities"].append(ent_tuple)
            if ent.kb_id_:
                link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
                json_para["links"].append(link_dict)
        ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]
        biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)
        for j, sent in enumerate(doc.sents):
            json_sent = {"tokens": [], "brackets": []}
            for token in sent:
                json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
                if doc.is_tagged:
                    json_token["tag"] = token.tag_
                    json_token["pos"] = token.pos_
                    json_token["morph"] = token.morph_
                    json_token["lemma"] = token.lemma_
                if doc.is_parsed:
                    json_token["head"] = token.head.i-token.i
                    json_token["dep"] = token.dep_
                json_sent["tokens"].append(json_token)
            json_para["sentences"].append(json_sent)
        json_doc["paragraphs"].append(json_para)
    return json_doc


def read_json_file(loc, docs_filter=None, limit=None):
    """Read Example dictionaries from a json file or directory."""
    loc = util.ensure_path(loc)
    if loc.is_dir():
        for filename in loc.iterdir():
            yield from read_json_file(loc / filename, limit=limit)
    else:
        for doc in json_iterate(loc):
            if docs_filter is not None and not docs_filter(doc):
                continue
            for json_data in json_to_annotations(doc):
                yield json_data


def json_to_annotations(doc):
    """Convert an item in the JSON-formatted training data to the format
    used by Example.

    doc (dict): One entry in the training data.
    YIELDS (tuple): The reformatted data - one training example per paragraph
    """
    for paragraph in doc["paragraphs"]:
        example = {"text": paragraph.get("raw", None)}
        words = []
        spaces = []
        ids = []
        tags = []
        pos = []
        morphs = []
        lemmas = []
        heads = []
        labels = []
        sent_starts = []
        brackets = []
        for sent in paragraph["sentences"]:
            sent_start_i = len(words)
            for i, token in enumerate(sent["tokens"]):
                words.append(token["orth"])
                spaces.append(token.get("space", True))
                ids.append(token.get('id', sent_start_i + i))
                if "tag" in token:
                    tags.append(token["tag"])
                if "pos" in token:
                    pos.append(token["pos"])
                if "morph" in token:
                    morphs.append(token["morph"])
                if "lemma" in token:
                    lemmas.append(token["lemma"])
                if "head" in token:
                    heads.append(token["head"])
                if "dep" in token:
                    labels.append(token["dep"])
                    # Ensure ROOT label is case-insensitive
                    if labels[-1].lower() == "root":
                        labels[-1] = "ROOT"
                if i == 0:
                    sent_starts.append(1)
                else:
                    sent_starts.append(0)
            if "brackets" in sent:
                brackets.extend((b["first"] + sent_start_i,
                                 b["last"] + sent_start_i, b["label"])
                                 for b in sent["brackets"])

        example["token_annotation"] = dict(
            ids=ids,
            words=words,
            spaces=spaces,
            sent_starts=sent_starts,
            brackets=brackets
        )
        # avoid including dummy values that looks like gold info was present
        if tags:
            example["token_annotation"]["tags"] = tags
        if pos:
            example["token_annotation"]["pos"] = pos
        if morphs:
            example["token_annotation"]["morphs"] = morphs
        if lemmas:
            example["token_annotation"]["lemmas"] = lemmas
        if heads:
            example["token_annotation"]["heads"] = heads
        if labels:
            example["token_annotation"]["deps"] = labels
        if pos:
            example["token_annotation"]["pos"] = pos

        cats = {}
        for cat in paragraph.get("cats", {}):
            cats[cat["label"]] = cat["value"]
        entities = []
        for start, end, label in paragraph.get("entities", {}):
            ent_tuple = (start, end, label)
            entities.append(ent_tuple)
        example["doc_annotation"] = dict(
            cats=cats,
            entities=entities,
            links=paragraph.get("links", [])   # TODO: fix/test
        )
        yield example

def json_iterate(loc):
    # We should've made these files jsonl...But since we didn't, parse out
    # the docs one-by-one to reduce memory usage.
    # It's okay to read in the whole file -- just don't parse it into JSON.
    cdef bytes py_raw
    loc = util.ensure_path(loc)
    with loc.open("rb") as file_:
        py_raw = file_.read()
    cdef long file_length = len(py_raw)
    if file_length > 2 ** 30:
        warnings.warn(Warnings.W027.format(size=file_length))

    raw = <char*>py_raw
    cdef int square_depth = 0
    cdef int curly_depth = 0
    cdef int inside_string = 0
    cdef int escape = 0
    cdef long start = -1
    cdef char c
    cdef char quote = ord('"')
    cdef char backslash = ord("\\")
    cdef char open_square = ord("[")
    cdef char close_square = ord("]")
    cdef char open_curly = ord("{")
    cdef char close_curly = ord("}")
    for i in range(file_length):
        c = raw[i]
        if escape:
            escape = False
            continue
        if c == backslash:
            escape = True
            continue
        if c == quote:
            inside_string = not inside_string
            continue
        if inside_string:
            continue
        if c == open_square:
            square_depth += 1
        elif c == close_square:
            square_depth -= 1
        elif c == open_curly:
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif c == close_curly:
            curly_depth -= 1
            if square_depth == 1 and curly_depth == 0:
                py_str = py_raw[start : i + 1].decode("utf8")
                try:
                    yield srsly.json_loads(py_str)
                except Exception:
                    print(py_str)
                    raise
                start = -1
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`import warnings`
			`import srsly`
			`from .. import util`
			`from ..errors import Warnings`
			`from ..tokens import Token, Doc`
			`from .iob_utils import biluo_tags_from_offsets`


			`def merge_sents(sents):`
			`m_deps = [[], [], [], [], [], []]`
			`m_cats = {}`
			`m_brackets = []`
			`i = 0`
			`for (ids, words, tags, heads, labels, ner), (cats, brackets) in sents:`
			`m_deps[0].extend(id_ + i for id_ in ids)`
			`m_deps[1].extend(words)`
			`m_deps[2].extend(tags)`
			`m_deps[3].extend(head + i for head in heads)`
			`m_deps[4].extend(labels)`
			`m_deps[5].extend(ner)`
			`m_brackets.extend((b["first"] + i, b["last"] + i, b["label"])`
			`for b in brackets)`
			`m_cats.update(cats)`
			`i += len(ids)`
			`return [(m_deps, (m_cats, m_brackets))]`


			`def docs_to_json(docs, id=0, ner_missing_tag="O"):`
			`"""Convert a list of Doc objects into the JSON-serializable format used by`
			`the spacy train command.`

			`docs (iterable / Doc): The Doc object(s) to convert.`
			`id (int): Id for the JSON.`
			`RETURNS (dict): The data in spaCy's JSON format`
			`- each input doc will be treated as a paragraph in the output doc`
			`"""`
			`if isinstance(docs, Doc):`
			`docs = [docs]`
			`json_doc = {"id": id, "paragraphs": []}`
			`for i, doc in enumerate(docs):`
fixed some tests + WIP roundtrip unit test 2020-06-16 19:26:50 +03:00			`json_para = {'raw': doc.text, "sentences": [], "cats": [], "entities": [], "links": []}`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`for cat, val in doc.cats.items():`
			`json_cat = {"label": cat, "value": val}`
			`json_para["cats"].append(json_cat)`
fixed some tests + WIP roundtrip unit test 2020-06-16 19:26:50 +03:00			`for ent in doc.ents:`
			`ent_tuple = (ent.start_char, ent.end_char, ent.label_)`
			`json_para["entities"].append(ent_tuple)`
			`if ent.kb_id_:`
			`link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}`
			`json_para["links"].append(link_dict)`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`ent_offsets = [(e.start_char, e.end_char, e.label_) for e in doc.ents]`
			`biluo_tags = biluo_tags_from_offsets(doc, ent_offsets, missing=ner_missing_tag)`
			`for j, sent in enumerate(doc.sents):`
			`json_sent = {"tokens": [], "brackets": []}`
			`for token in sent:`
add spaces to json output format 2020-06-16 20:30:03 +03:00			`json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`if doc.is_tagged:`
			`json_token["tag"] = token.tag_`
			`json_token["pos"] = token.pos_`
			`json_token["morph"] = token.morph_`
			`json_token["lemma"] = token.lemma_`
			`if doc.is_parsed:`
			`json_token["head"] = token.head.i-token.i`
			`json_token["dep"] = token.dep_`
			`json_sent["tokens"].append(json_token)`
			`json_para["sentences"].append(json_sent)`
			`json_doc["paragraphs"].append(json_para)`
			`return json_doc`


Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import 2020-06-08 23:09:57 +03:00			`def read_json_file(loc, docs_filter=None, limit=None):`
various fixes in scripts - needs to be further tested 2020-06-17 13:05:58 +03:00			`"""Read Example dictionaries from a json file or directory."""`
Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import 2020-06-08 23:09:57 +03:00			`loc = util.ensure_path(loc)`
			`if loc.is_dir():`
			`for filename in loc.iterdir():`
			`yield from read_json_file(loc / filename, limit=limit)`
			`else:`
			`for doc in json_iterate(loc):`
			`if docs_filter is not None and not docs_filter(doc):`
			`continue`
Fix gold_io 2020-06-09 13:43:53 +03:00			`for json_data in json_to_annotations(doc):`
Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import 2020-06-08 23:09:57 +03:00			`yield json_data`


* Make GoldCorpus return dict, not Example * Make Example require a Doc object (previously optional) Clarify methods in GoldCorpus WIP refactor Example Refactor Example.split_sents Fix test Fix augment Update test Update test Fix import Update test_scorer Update Example 2020-06-08 23:28:50 +03:00			`def json_to_annotations(doc):`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`"""Convert an item in the JSON-formatted training data to the format`
fixed some tests + WIP roundtrip unit test 2020-06-16 19:26:50 +03:00			`used by Example.`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00
			`doc (dict): One entry in the training data.`
* Make GoldCorpus return dict, not Example * Make Example require a Doc object (previously optional) Clarify methods in GoldCorpus WIP refactor Example Refactor Example.split_sents Fix test Fix augment Update test Update test Fix import Update test_scorer Update Example 2020-06-08 23:28:50 +03:00			`YIELDS (tuple): The reformatted data - one training example per paragraph`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`"""`
			`for paragraph in doc["paragraphs"]:`
Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import 2020-06-08 23:09:57 +03:00			`example = {"text": paragraph.get("raw", None)}`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`words = []`
add spaces to json output format 2020-06-16 20:30:03 +03:00			`spaces = []`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`ids = []`
			`tags = []`
			`pos = []`
			`morphs = []`
			`lemmas = []`
			`heads = []`
			`labels = []`
			`sent_starts = []`
			`brackets = []`
			`for sent in paragraph["sentences"]:`
			`sent_start_i = len(words)`
			`for i, token in enumerate(sent["tokens"]):`
			`words.append(token["orth"])`
various fixes in scripts - needs to be further tested 2020-06-17 13:05:58 +03:00			`spaces.append(token.get("space", True))`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`ids.append(token.get('id', sent_start_i + i))`
prevent writing dummy values like deps because that could interfer with sent_start values 2020-06-18 18:47:59 +03:00			`if "tag" in token:`
			`tags.append(token["tag"])`
			`if "pos" in token:`
			`pos.append(token["pos"])`
			`if "morph" in token:`
			`morphs.append(token["morph"])`
			`if "lemma" in token:`
			`lemmas.append(token["lemma"])`
			`if "head" in token:`
			`heads.append(token["head"])`
			`if "dep" in token:`
			`labels.append(token["dep"])`
			`# Ensure ROOT label is case-insensitive`
			`if labels[-1].lower() == "root":`
			`labels[-1] = "ROOT"`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00			`if i == 0:`
			`sent_starts.append(1)`
			`else:`
			`sent_starts.append(0)`
			`if "brackets" in sent:`
			`brackets.extend((b["first"] + sent_start_i,`
			`b["last"] + sent_start_i, b["label"])`
			`for b in sent["brackets"])`
fixed some tests + WIP roundtrip unit test 2020-06-16 19:26:50 +03:00
Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import 2020-06-08 23:09:57 +03:00			`example["token_annotation"] = dict(`
			`ids=ids,`
			`words=words,`
add spaces to json output format 2020-06-16 20:30:03 +03:00			`spaces=spaces,`
Remove GoldParse from public API * Move get_parses_from_example to spacy.syntax * Get GoldParse out of Example * Avoid expecting GoldParse input in parser * Add Alignment to spacy.gold.align * Update Example object * Add comment * Update pipeline * Fix imports * Simplify gold_io * WIP on GoldCorpus * Update test * Xfail some gold tests * Remove ignore_misaligned option from GoldCorpus * Fix Example constructor * Update test * Fix usage of Example * Add deprecated_get_gold method on Example * Patch scorer * Fix test * Fix test * Update tests * Xfail a test * Fix passing of make_projective * Pass make_projective by default * Hack data format in Example.from_dict * Update tests * Fix example.from_dict * Update morphologizer * Fix entity linker * Add get_field to TokenAnnotation * Fix Example.get_aligned * Update test * Fix alignment * Fix corpus * Fix GoldCorpus * Handle misaligned * Format * Fix missing import 2020-06-08 23:09:57 +03:00			`sent_starts=sent_starts,`
			`brackets=brackets`
			`)`
prevent writing dummy values like deps because that could interfer with sent_start values 2020-06-18 18:47:59 +03:00			`# avoid including dummy values that looks like gold info was present`
			`if tags:`
			`example["token_annotation"]["tags"] = tags`
			`if pos:`
			`example["token_annotation"]["pos"] = pos`
			`if morphs:`
			`example["token_annotation"]["morphs"] = morphs`
			`if lemmas:`
			`example["token_annotation"]["lemmas"] = lemmas`
			`if heads:`
			`example["token_annotation"]["heads"] = heads`
			`if labels:`
			`example["token_annotation"]["deps"] = labels`
			`if pos:`
			`example["token_annotation"]["pos"] = pos`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00
fixed some tests + WIP roundtrip unit test 2020-06-16 19:26:50 +03:00			`cats = {}`
			`for cat in paragraph.get("cats", {}):`
			`cats[cat["label"]] = cat["value"]`
			`entities = []`
			`for start, end, label in paragraph.get("entities", {}):`
			`ent_tuple = (start, end, label)`
			`entities.append(ent_tuple)`
			`example["doc_annotation"] = dict(`
			`cats=cats,`
			`entities=entities,`
			`links=paragraph.get("links", []) # TODO: fix/test`
			`)`
			`yield example`
Add gold_io.pyx 2020-06-06 15:41:49 +03:00
			`def json_iterate(loc):`
			`# We should've made these files jsonl...But since we didn't, parse out`
			`# the docs one-by-one to reduce memory usage.`
			`# It's okay to read in the whole file -- just don't parse it into JSON.`
			`cdef bytes py_raw`
			`loc = util.ensure_path(loc)`
			`with loc.open("rb") as file_:`
			`py_raw = file_.read()`
			`cdef long file_length = len(py_raw)`
			`if file_length > 2 ** 30:`
			`warnings.warn(Warnings.W027.format(size=file_length))`

			`raw = <char*>py_raw`
			`cdef int square_depth = 0`
			`cdef int curly_depth = 0`
			`cdef int inside_string = 0`
			`cdef int escape = 0`
			`cdef long start = -1`
			`cdef char c`
			`cdef char quote = ord('"')`
			`cdef char backslash = ord("\\")`
			`cdef char open_square = ord("[")`
			`cdef char close_square = ord("]")`
			`cdef char open_curly = ord("{")`
			`cdef char close_curly = ord("}")`
			`for i in range(file_length):`
			`c = raw[i]`
			`if escape:`
			`escape = False`
			`continue`
			`if c == backslash:`
			`escape = True`
			`continue`
			`if c == quote:`
			`inside_string = not inside_string`
			`continue`
			`if inside_string:`
			`continue`
			`if c == open_square:`
			`square_depth += 1`
			`elif c == close_square:`
			`square_depth -= 1`
			`elif c == open_curly:`
			`if square_depth == 1 and curly_depth == 0:`
			`start = i`
			`curly_depth += 1`
			`elif c == close_curly:`
			`curly_depth -= 1`
			`if square_depth == 1 and curly_depth == 0:`
			`py_str = py_raw[start : i + 1].decode("utf8")`
			`try:`
			`yield srsly.json_loads(py_str)`
			`except Exception:`
			`print(py_str)`
			`raise`
			`start = -1`