mirror of
				https://github.com/explosion/spaCy.git
				synced 2025-10-30 23:47:31 +03:00 
			
		
		
		
	Fix conversion of NER data
This commit is contained in:
		
							parent
							
								
									b82431207d
								
							
						
					
					
						commit
						78e9e15e9e
					
				|  | @ -3,7 +3,8 @@ import srsly | ||||||
| from .. import util | from .. import util | ||||||
| from ..errors import Warnings | from ..errors import Warnings | ||||||
| from ..tokens import Doc | from ..tokens import Doc | ||||||
| from .iob_utils import biluo_tags_from_offsets | from .iob_utils import biluo_tags_from_offsets, tags_to_entities | ||||||
|  | import json | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def merge_sents(sents): | def merge_sents(sents): | ||||||
|  | @ -97,6 +98,7 @@ def json_to_annotations(doc): | ||||||
|         spaces = [] |         spaces = [] | ||||||
|         ids = [] |         ids = [] | ||||||
|         tags = [] |         tags = [] | ||||||
|  |         ner_tags = [] | ||||||
|         pos = [] |         pos = [] | ||||||
|         morphs = [] |         morphs = [] | ||||||
|         lemmas = [] |         lemmas = [] | ||||||
|  | @ -110,21 +112,22 @@ def json_to_annotations(doc): | ||||||
|                 words.append(token["orth"]) |                 words.append(token["orth"]) | ||||||
|                 spaces.append(token.get("space", True)) |                 spaces.append(token.get("space", True)) | ||||||
|                 ids.append(token.get('id', sent_start_i + i)) |                 ids.append(token.get('id', sent_start_i + i)) | ||||||
|                 if "tag" in token: |                 tags.append(token.get("tag", None)) | ||||||
|                     tags.append(token["tag"]) |                 pos.append(token.get("pos", None)) | ||||||
|                 if "pos" in token: |                 morphs.append(token.get("morph", None)) | ||||||
|                     pos.append(token["pos"]) |                 lemmas.append(token.get("lemma", None)) | ||||||
|                 if "morph" in token: |  | ||||||
|                     morphs.append(token["morph"]) |  | ||||||
|                 if "lemma" in token: |  | ||||||
|                     lemmas.append(token["lemma"]) |  | ||||||
|                 if "head" in token: |                 if "head" in token: | ||||||
|                     heads.append(token["head"] + sent_start_i + i) |                     heads.append(token["head"] + sent_start_i + i) | ||||||
|  |                 else: | ||||||
|  |                     heads.append(None) | ||||||
|                 if "dep" in token: |                 if "dep" in token: | ||||||
|                     labels.append(token["dep"]) |                     labels.append(token["dep"]) | ||||||
|                     # Ensure ROOT label is case-insensitive |                     # Ensure ROOT label is case-insensitive | ||||||
|                     if labels[-1].lower() == "root": |                     if labels[-1].lower() == "root": | ||||||
|                         labels[-1] = "ROOT" |                         labels[-1] = "ROOT" | ||||||
|  |                 else: | ||||||
|  |                     labels.append(None) | ||||||
|  |                 ner_tags.append(token.get("ner", None)) | ||||||
|                 if i == 0: |                 if i == 0: | ||||||
|                     sent_starts.append(1) |                     sent_starts.append(1) | ||||||
|                 else: |                 else: | ||||||
|  | @ -142,31 +145,25 @@ def json_to_annotations(doc): | ||||||
|             brackets=brackets |             brackets=brackets | ||||||
|         ) |         ) | ||||||
|         # avoid including dummy values that look like gold info was present |         # avoid including dummy values that look like gold info was present | ||||||
|         if tags: |         if any(tags): | ||||||
|             example["token_annotation"]["tags"] = tags |             example["token_annotation"]["tags"] = tags | ||||||
|         if pos: |         if any(pos): | ||||||
|             example["token_annotation"]["pos"] = pos |             example["token_annotation"]["pos"] = pos | ||||||
|         if morphs: |         if any(morphs): | ||||||
|             example["token_annotation"]["morphs"] = morphs |             example["token_annotation"]["morphs"] = morphs | ||||||
|         if lemmas: |         if any(lemmas): | ||||||
|             example["token_annotation"]["lemmas"] = lemmas |             example["token_annotation"]["lemmas"] = lemmas | ||||||
|         if heads: |         if any(head is not None for head in heads): | ||||||
|             example["token_annotation"]["heads"] = heads |             example["token_annotation"]["heads"] = heads | ||||||
|         if labels: |         if any(labels): | ||||||
|             example["token_annotation"]["deps"] = labels |             example["token_annotation"]["deps"] = labels | ||||||
|         if pos: |  | ||||||
|             example["token_annotation"]["pos"] = pos |  | ||||||
| 
 | 
 | ||||||
|         cats = {} |         cats = {} | ||||||
|         for cat in paragraph.get("cats", {}): |         for cat in paragraph.get("cats", {}): | ||||||
|             cats[cat["label"]] = cat["value"] |             cats[cat["label"]] = cat["value"] | ||||||
|         entities = [] |  | ||||||
|         for start, end, label in paragraph.get("entities", {}): |  | ||||||
|             ent_tuple = (start, end, label) |  | ||||||
|             entities.append(ent_tuple) |  | ||||||
|         example["doc_annotation"] = dict( |         example["doc_annotation"] = dict( | ||||||
|             cats=cats, |             cats=cats, | ||||||
|             entities=entities, |             entities=ner_tags, | ||||||
|             links=paragraph.get("links", [])   # TODO: fix/test |             links=paragraph.get("links", [])   # TODO: fix/test | ||||||
|         ) |         ) | ||||||
|         yield example |         yield example | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user