mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-25 21:21:10 +03:00 
			
		
		
		
	Data in the JSON format is split into sentences, and each sentence is saved with is_sent_start flags. Currently the flags are 1 for the first token and 0 for the others. When deserialized this results in a pattern of True, None, None, None... which makes single-sentence documents look as though they haven't had sentence boundaries set. Since items saved in JSON format have been split into sentences already, the is_sent_start values should all be True or False.
		
			
				
	
	
		
			209 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
			
		
		
	
	
			209 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Cython
		
	
	
	
	
	
| import warnings
 | |
| import srsly
 | |
| from .. import util
 | |
| from ..errors import Warnings
 | |
| from ..tokens import Doc
 | |
| from .iob_utils import offsets_to_biluo_tags, tags_to_entities
 | |
| import json
 | |
| 
 | |
| 
 | |
| def docs_to_json(docs, doc_id=0, ner_missing_tag="O"):
 | |
|     """Convert a list of Doc objects into the JSON-serializable format used by
 | |
|     the spacy train command.
 | |
| 
 | |
|     docs (iterable / Doc): The Doc object(s) to convert.
 | |
|     doc_id (int): Id for the JSON.
 | |
|     RETURNS (dict): The data in spaCy's JSON format
 | |
|         - each input doc will be treated as a paragraph in the output doc
 | |
|     """
 | |
|     if isinstance(docs, Doc):
 | |
|         docs = [docs]
 | |
|     json_doc = {"id": doc_id, "paragraphs": []}
 | |
|     for i, doc in enumerate(docs):
 | |
|         raw = None if doc.has_unknown_spaces else doc.text
 | |
|         json_para = {'raw': raw, "sentences": [], "cats": [], "entities": [], "links": []}
 | |
|         for cat, val in doc.cats.items():
 | |
|             json_cat = {"label": cat, "value": val}
 | |
|             json_para["cats"].append(json_cat)
 | |
|         # warning: entities information is currently duplicated as
 | |
|         # doc-level "entities" and token-level "ner"
 | |
|         for ent in doc.ents:
 | |
|             ent_tuple = (ent.start_char, ent.end_char, ent.label_)
 | |
|             json_para["entities"].append(ent_tuple)
 | |
|             if ent.kb_id_:
 | |
|                 link_dict = {(ent.start_char, ent.end_char): {ent.kb_id_: 1.0}}
 | |
|                 json_para["links"].append(link_dict)
 | |
|         biluo_tags = offsets_to_biluo_tags(doc, json_para["entities"], missing=ner_missing_tag)
 | |
|         attrs = ("TAG", "POS", "MORPH", "LEMMA", "DEP", "ENT_IOB")
 | |
|         include_annotation = {attr: doc.has_annotation(attr) for attr in attrs}
 | |
|         for j, sent in enumerate(doc.sents):
 | |
|             json_sent = {"tokens": [], "brackets": []}
 | |
|             for token in sent:
 | |
|                 json_token = {"id": token.i, "orth": token.text, "space": token.whitespace_}
 | |
|                 if include_annotation["TAG"]:
 | |
|                     json_token["tag"] = token.tag_
 | |
|                 if include_annotation["POS"]:
 | |
|                     json_token["pos"] = token.pos_
 | |
|                 if include_annotation["MORPH"]:
 | |
|                     json_token["morph"] = str(token.morph)
 | |
|                 if include_annotation["LEMMA"]:
 | |
|                     json_token["lemma"] = token.lemma_
 | |
|                 if include_annotation["DEP"]:
 | |
|                     json_token["head"] = token.head.i-token.i
 | |
|                     json_token["dep"] = token.dep_
 | |
|                 if include_annotation["ENT_IOB"]:
 | |
|                     json_token["ner"] = biluo_tags[token.i]
 | |
|                 json_sent["tokens"].append(json_token)
 | |
|             json_para["sentences"].append(json_sent)
 | |
|         json_doc["paragraphs"].append(json_para)
 | |
|     return json_doc
 | |
| 
 | |
| 
 | |
| def read_json_file(loc, docs_filter=None, limit=None):
 | |
|     """Read Example dictionaries from a json file or directory."""
 | |
|     loc = util.ensure_path(loc)
 | |
|     if loc.is_dir():
 | |
|         for filename in sorted(loc.iterdir()):
 | |
|             yield from read_json_file(loc / filename, limit=limit)
 | |
|     else:
 | |
|         with loc.open("rb") as file_:
 | |
|             utf8_str = file_.read()
 | |
|         for json_doc in json_iterate(utf8_str):
 | |
|             if docs_filter is not None and not docs_filter(json_doc):
 | |
|                 continue
 | |
|             for json_paragraph in json_to_annotations(json_doc):
 | |
|                 yield json_paragraph
 | |
| 
 | |
| 
 | |
| def json_to_annotations(doc):
 | |
|     """Convert an item in the JSON-formatted training data to the format
 | |
|     used by Example.
 | |
| 
 | |
|     doc (dict): One entry in the training data.
 | |
|     YIELDS (tuple): The reformatted data - one training example per paragraph
 | |
|     """
 | |
|     for paragraph in doc["paragraphs"]:
 | |
|         example = {"text": paragraph.get("raw", None)}
 | |
|         words = []
 | |
|         spaces = []
 | |
|         ids = []
 | |
|         tags = []
 | |
|         ner_tags = []
 | |
|         pos = []
 | |
|         morphs = []
 | |
|         lemmas = []
 | |
|         heads = []
 | |
|         labels = []
 | |
|         sent_starts = []
 | |
|         brackets = []
 | |
|         for sent in paragraph["sentences"]:
 | |
|             sent_start_i = len(words)
 | |
|             for i, token in enumerate(sent["tokens"]):
 | |
|                 words.append(token["orth"])
 | |
|                 spaces.append(token.get("space", None))
 | |
|                 ids.append(token.get('id', sent_start_i + i))
 | |
|                 tags.append(token.get("tag", None))
 | |
|                 pos.append(token.get("pos", None))
 | |
|                 morphs.append(token.get("morph", None))
 | |
|                 lemmas.append(token.get("lemma", None))
 | |
|                 if "head" in token:
 | |
|                     heads.append(token["head"] + sent_start_i + i)
 | |
|                 else:
 | |
|                     heads.append(None)
 | |
|                 if "dep" in token:
 | |
|                     labels.append(token["dep"])
 | |
|                     # Ensure ROOT label is case-insensitive
 | |
|                     if labels[-1].lower() == "root":
 | |
|                         labels[-1] = "ROOT"
 | |
|                 else:
 | |
|                     labels.append(None)
 | |
|                 ner_tags.append(token.get("ner", None))
 | |
|                 if i == 0:
 | |
|                     sent_starts.append(1)
 | |
|                 else:
 | |
|                     sent_starts.append(-1)
 | |
|             if "brackets" in sent:
 | |
|                 brackets.extend((b["first"] + sent_start_i,
 | |
|                                  b["last"] + sent_start_i, b["label"])
 | |
|                                  for b in sent["brackets"])
 | |
| 
 | |
|         example["token_annotation"] = dict(
 | |
|             ids=ids,
 | |
|             words=words,
 | |
|             spaces=spaces,
 | |
|             sent_starts=sent_starts,
 | |
|             brackets=brackets
 | |
|         )
 | |
|         # avoid including dummy values that looks like gold info was present
 | |
|         if any(tags):
 | |
|             example["token_annotation"]["tags"] = tags
 | |
|         if any(pos):
 | |
|             example["token_annotation"]["pos"] = pos
 | |
|         if any(morphs):
 | |
|             example["token_annotation"]["morphs"] = morphs
 | |
|         if any(lemmas):
 | |
|             example["token_annotation"]["lemmas"] = lemmas
 | |
|         if any(head is not None for head in heads):
 | |
|             example["token_annotation"]["heads"] = heads
 | |
|         if any(labels):
 | |
|             example["token_annotation"]["deps"] = labels
 | |
| 
 | |
|         cats = {}
 | |
|         for cat in paragraph.get("cats", {}):
 | |
|             cats[cat["label"]] = cat["value"]
 | |
|         example["doc_annotation"] = dict(
 | |
|             cats=cats,
 | |
|             entities=ner_tags,
 | |
|             links=paragraph.get("links", [])
 | |
|         )
 | |
|         yield example
 | |
| 
 | |
| def json_iterate(bytes utf8_str):
 | |
|     # We should've made these files jsonl...But since we didn't, parse out
 | |
|     # the docs one-by-one to reduce memory usage.
 | |
|     # It's okay to read in the whole file -- just don't parse it into JSON.
 | |
|     cdef long file_length = len(utf8_str)
 | |
|     if file_length > 2 ** 30:
 | |
|         warnings.warn(Warnings.W027.format(size=file_length))
 | |
| 
 | |
|     raw = <char*>utf8_str
 | |
|     cdef int square_depth = 0
 | |
|     cdef int curly_depth = 0
 | |
|     cdef int inside_string = 0
 | |
|     cdef int escape = 0
 | |
|     cdef long start = -1
 | |
|     cdef char c
 | |
|     cdef char quote = ord('"')
 | |
|     cdef char backslash = ord("\\")
 | |
|     cdef char open_square = ord("[")
 | |
|     cdef char close_square = ord("]")
 | |
|     cdef char open_curly = ord("{")
 | |
|     cdef char close_curly = ord("}")
 | |
|     for i in range(file_length):
 | |
|         c = raw[i]
 | |
|         if escape:
 | |
|             escape = False
 | |
|             continue
 | |
|         if c == backslash:
 | |
|             escape = True
 | |
|             continue
 | |
|         if c == quote:
 | |
|             inside_string = not inside_string
 | |
|             continue
 | |
|         if inside_string:
 | |
|             continue
 | |
|         if c == open_square:
 | |
|             square_depth += 1
 | |
|         elif c == close_square:
 | |
|             square_depth -= 1
 | |
|         elif c == open_curly:
 | |
|             if square_depth == 1 and curly_depth == 0:
 | |
|                 start = i
 | |
|             curly_depth += 1
 | |
|         elif c == close_curly:
 | |
|             curly_depth -= 1
 | |
|             if square_depth == 1 and curly_depth == 0:
 | |
|                 substr = utf8_str[start : i + 1].decode("utf8")
 | |
|                 yield srsly.json_loads(substr)
 | |
|                 start = -1
 |