mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Add gold_io.pyx
This commit is contained in:
		
							parent
							
								
									156466ca69
								
							
						
					
					
						commit
						32c8fb1372
					
				
							
								
								
									
										202
									
								
								spacy/_gold/gold_io.pyx
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										202
									
								
								spacy/_gold/gold_io.pyx
									
									
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,202 @@ | |||
| import warnings | ||||
| import srsly | ||||
| from .. import util | ||||
| from ..errors import Warnings | ||||
| from ..tokens import Token, Doc | ||||
| from .example import Example | ||||
| from .iob_utils import biluo_tags_from_offsets | ||||
| 
 | ||||
| 
 | ||||
def merge_sents(sents):
    """Collapse several sentence-level annotation tuples into a single one.

    sents (iterable): Items shaped like
        ((ids, words, tags, heads, labels, ner), (cats, brackets)), where each
        bracket is a dict with "first", "last" and "label" keys.
    RETURNS (list): A one-element list holding the merged
        ([ids, words, tags, heads, labels, ner], (cats, brackets)) tuple.
        Token ids, heads and bracket boundaries are shifted by the number of
        tokens seen in the preceding sentences; brackets become
        (first, last, label) tuples.
    """
    merged_ids, merged_words, merged_tags = [], [], []
    merged_heads, merged_labels, merged_ner = [], [], []
    merged_cats = {}
    merged_brackets = []
    # Number of tokens consumed so far, i.e. the shift for the next sentence.
    offset = 0
    for annotations, (cats, brackets) in sents:
        ids, words, tags, heads, labels, ner = annotations
        merged_ids += [token_id + offset for token_id in ids]
        merged_words += list(words)
        merged_tags += list(tags)
        merged_heads += [head + offset for head in heads]
        merged_labels += list(labels)
        merged_ner += list(ner)
        for bracket in brackets:
            shifted = (
                bracket["first"] + offset,
                bracket["last"] + offset,
                bracket["label"],
            )
            merged_brackets.append(shifted)
        # Later sentences overwrite categories with the same key.
        merged_cats.update(cats)
        offset += len(ids)
    merged_deps = [
        merged_ids,
        merged_words,
        merged_tags,
        merged_heads,
        merged_labels,
        merged_ner,
    ]
    return [(merged_deps, (merged_cats, merged_brackets))]
| 
 | ||||
| 
 | ||||
def docs_to_json(docs, id=0, ner_missing_tag="O"):
    """Build the JSON-serializable training structure from Doc objects.

    Every input doc becomes one entry in the output's "paragraphs" list.

    docs (iterable / Doc): The Doc object(s) to convert. A single Doc is
        wrapped in a list.
    id (int): Value stored under the "id" key of the result.
    ner_missing_tag (str): Forwarded to biluo_tags_from_offsets as its
        `missing` argument.
    RETURNS (dict): A dict with "id" and "paragraphs" keys.
    """
    if isinstance(docs, Doc):
        docs = [docs]
    paragraphs = []
    for doc in docs:
        raw_text = doc.text
        cats = [{"label": label, "value": value} for label, value in doc.cats.items()]
        entity_spans = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
        ner_tags = biluo_tags_from_offsets(doc, entity_spans, missing=ner_missing_tag)
        sentences = []
        for sent in doc.sents:
            tokens = []
            for token in sent:
                entry = {"id": token.i, "orth": token.text}
                if doc.is_tagged:
                    entry["tag"] = token.tag_
                    entry["pos"] = token.pos_
                    entry["morph"] = token.morph_
                    entry["lemma"] = token.lemma_
                if doc.is_parsed:
                    # Heads are stored as an offset relative to the token.
                    entry["head"] = token.head.i - token.i
                    entry["dep"] = token.dep_
                entry["ner"] = ner_tags[token.i]
                tokens.append(entry)
            sentences.append({"tokens": tokens, "brackets": []})
        paragraphs.append({"raw": raw_text, "sentences": sentences, "cats": cats})
    return {"id": id, "paragraphs": paragraphs}
| 
 | ||||
| 
 | ||||
def json_to_examples(doc):
    """Turn one JSON-formatted training document into Example objects.

    doc (dict): One entry in the training data; must have a "paragraphs" key.
    YIELDS (Example): One training example per paragraph, with token-level
        and doc-level annotations set.
    """
    for para in doc["paragraphs"]:
        example = Example(doc=para.get("raw", None))
        words, ids, tags, pos = [], [], [], []
        morphs, lemmas, heads, labels = [], [], [], []
        ner, sent_starts, brackets = [], [], []
        for sent in para["sentences"]:
            # Index of this sentence's first token within the paragraph.
            sent_start_i = len(words)
            for i, token in enumerate(sent["tokens"]):
                words.append(token["orth"])
                ids.append(token.get("id", sent_start_i + i))
                tags.append(token.get("tag", "-"))
                pos.append(token.get("pos", ""))
                morphs.append(token.get("morph", ""))
                lemmas.append(token.get("lemma", ""))
                # Stored heads are relative offsets; make them absolute.
                heads.append(token.get("head", 0) + sent_start_i + i)
                dep = token.get("dep", "")
                # Normalize any casing of the root label to "ROOT".
                if dep.lower() == "root":
                    dep = "ROOT"
                labels.append(dep)
                ner.append(token.get("ner", "-"))
                sent_starts.append(1 if i == 0 else 0)
            if "brackets" in sent:
                for bracket in sent["brackets"]:
                    brackets.append((
                        bracket["first"] + sent_start_i,
                        bracket["last"] + sent_start_i,
                        bracket["label"],
                    ))
        cats = {cat["label"]: cat["value"] for cat in para.get("cats", {})}
        example.set_token_annotation(
            ids=ids,
            words=words,
            tags=tags,
            pos=pos,
            morphs=morphs,
            lemmas=lemmas,
            heads=heads,
            deps=labels,
            entities=ner,
            sent_starts=sent_starts,
            brackets=brackets,
        )
        example.set_doc_annotation(cats=cats)
        yield example
| 
 | ||||
| 
 | ||||
def read_json_file(loc, docs_filter=None, limit=None):
    """Yield training examples from a JSON file, or from each entry of a directory.

    loc: Location of a JSON training file or of a directory; it is passed
        through util.ensure_path before use. Directories are walked
        recursively.
    docs_filter (callable): Optional predicate called with each JSON doc;
        docs for which it returns a falsy value are skipped. It is applied
        to files found inside directories as well.
    limit: Accepted and forwarded on recursion for interface compatibility,
        but not applied to the number of yielded examples.
    YIELDS (Example): One training example per paragraph.
    """
    loc = util.ensure_path(loc)
    if loc.is_dir():
        # iterdir() yields paths that already include `loc`, so they are used
        # as-is: joining them onto `loc` again would duplicate the prefix
        # whenever `loc` is a relative path.
        for path in loc.iterdir():
            yield from read_json_file(path, docs_filter=docs_filter, limit=limit)
    else:
        for doc in json_iterate(loc):
            if docs_filter is not None and not docs_filter(doc):
                continue
            yield from json_to_examples(doc)
| 
 | ||||
| 
 | ||||
def read_json_object(json_corpus_section):
    """Yield training examples from already-loaded JSON training documents.

    json_corpus_section (list): JSON-formatted documents, e.g. the parsed
        contents of a training data file.
    YIELDS (Example): The reformatted data - one training example per paragraph
    """
    for entry in json_corpus_section:
        yield from json_to_examples(entry)
| 
 | ||||
| 
 | ||||
def json_iterate(loc):
    """Yield the JSON objects found directly inside a file's outermost array.

    The whole file is read into memory as raw bytes, but each object is only
    decoded and parsed once the byte scan reaches its closing brace.

    loc: Location of the JSON file; passed through util.ensure_path and
        opened in binary mode.
    YIELDS: The result of srsly.json_loads for each object that sits at
        square-bracket depth 1 and curly-brace depth 0.
    """
    # We should've made these files jsonl...But since we didn't, parse out
    # the docs one-by-one to reduce memory usage.
    # It's okay to read in the whole file -- just don't parse it into JSON.
    cdef bytes py_raw
    loc = util.ensure_path(loc)
    with loc.open("rb") as file_:
        py_raw = file_.read()
    cdef long file_length = len(py_raw)
    # Warn when the raw file is larger than 2 ** 30 bytes.
    if file_length > 2 ** 30:
        warnings.warn(Warnings.W027.format(size=file_length))

    # Raw char view over the bytes object, used for the per-byte scan below.
    raw = <char*>py_raw
    cdef int square_depth = 0
    cdef int curly_depth = 0
    cdef int inside_string = 0
    cdef int escape = 0
    # Byte offset where the current top-level object starts; -1 when none is open.
    cdef long start = -1
    cdef char c
    cdef char quote = ord('"')
    cdef char backslash = ord("\\")
    cdef char open_square = ord("[")
    cdef char close_square = ord("]")
    cdef char open_curly = ord("{")
    cdef char close_curly = ord("}")
    for i in range(file_length):
        c = raw[i]
        # The byte following a backslash is skipped without being inspected,
        # so an escaped quote does not toggle the string state.
        if escape:
            escape = False
            continue
        if c == backslash:
            escape = True
            continue
        if c == quote:
            inside_string = not inside_string
            continue
        # Brackets and braces inside string literals are not structural.
        if inside_string:
            continue
        if c == open_square:
            square_depth += 1
        elif c == close_square:
            square_depth -= 1
        elif c == open_curly:
            # An object opening directly inside the outermost array marks
            # the start of a new document.
            if square_depth == 1 and curly_depth == 0:
                start = i
            curly_depth += 1
        elif c == close_curly:
            curly_depth -= 1
            # Back at brace depth 0 inside the outermost array: the document
            # is complete, so decode and parse just that slice.
            if square_depth == 1 and curly_depth == 0:
                py_str = py_raw[start : i + 1].decode("utf8")
                try:
                    yield srsly.json_loads(py_str)
                except Exception:
                    # NOTE(review): dumps the raw document text to stdout
                    # before re-raising.
                    print(py_str)
                    raise
                start = -1
		Loading…
	
		Reference in New Issue
	
	Block a user