mirror of
https://github.com/explosion/spaCy.git
synced 2025-03-22 02:44:15 +03:00
Fix conversion of NER data
This commit is contained in:
parent
b82431207d
commit
78e9e15e9e
|
@ -3,7 +3,8 @@ import srsly
|
||||||
from .. import util
|
from .. import util
|
||||||
from ..errors import Warnings
|
from ..errors import Warnings
|
||||||
from ..tokens import Doc
|
from ..tokens import Doc
|
||||||
from .iob_utils import biluo_tags_from_offsets
|
from .iob_utils import biluo_tags_from_offsets, tags_to_entities
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
def merge_sents(sents):
|
def merge_sents(sents):
|
||||||
|
@ -97,6 +98,7 @@ def json_to_annotations(doc):
|
||||||
spaces = []
|
spaces = []
|
||||||
ids = []
|
ids = []
|
||||||
tags = []
|
tags = []
|
||||||
|
ner_tags = []
|
||||||
pos = []
|
pos = []
|
||||||
morphs = []
|
morphs = []
|
||||||
lemmas = []
|
lemmas = []
|
||||||
|
@ -110,21 +112,22 @@ def json_to_annotations(doc):
|
||||||
words.append(token["orth"])
|
words.append(token["orth"])
|
||||||
spaces.append(token.get("space", True))
|
spaces.append(token.get("space", True))
|
||||||
ids.append(token.get('id', sent_start_i + i))
|
ids.append(token.get('id', sent_start_i + i))
|
||||||
if "tag" in token:
|
tags.append(token.get("tag", None))
|
||||||
tags.append(token["tag"])
|
pos.append(token.get("pos", None))
|
||||||
if "pos" in token:
|
morphs.append(token.get("morph", None))
|
||||||
pos.append(token["pos"])
|
lemmas.append(token.get("lemma", None))
|
||||||
if "morph" in token:
|
|
||||||
morphs.append(token["morph"])
|
|
||||||
if "lemma" in token:
|
|
||||||
lemmas.append(token["lemma"])
|
|
||||||
if "head" in token:
|
if "head" in token:
|
||||||
heads.append(token["head"] + sent_start_i + i)
|
heads.append(token["head"] + sent_start_i + i)
|
||||||
|
else:
|
||||||
|
heads.append(None)
|
||||||
if "dep" in token:
|
if "dep" in token:
|
||||||
labels.append(token["dep"])
|
labels.append(token["dep"])
|
||||||
# Ensure ROOT label is case-insensitive
|
# Ensure ROOT label is case-insensitive
|
||||||
if labels[-1].lower() == "root":
|
if labels[-1].lower() == "root":
|
||||||
labels[-1] = "ROOT"
|
labels[-1] = "ROOT"
|
||||||
|
else:
|
||||||
|
labels.append(None)
|
||||||
|
ner_tags.append(token.get("ner", None))
|
||||||
if i == 0:
|
if i == 0:
|
||||||
sent_starts.append(1)
|
sent_starts.append(1)
|
||||||
else:
|
else:
|
||||||
|
@ -142,31 +145,25 @@ def json_to_annotations(doc):
|
||||||
brackets=brackets
|
brackets=brackets
|
||||||
)
|
)
|
||||||
# avoid including dummy values that make it look like gold info was present
|
# avoid including dummy values that make it look like gold info was present
|
||||||
if tags:
|
if any(tags):
|
||||||
example["token_annotation"]["tags"] = tags
|
example["token_annotation"]["tags"] = tags
|
||||||
if pos:
|
if any(pos):
|
||||||
example["token_annotation"]["pos"] = pos
|
example["token_annotation"]["pos"] = pos
|
||||||
if morphs:
|
if any(morphs):
|
||||||
example["token_annotation"]["morphs"] = morphs
|
example["token_annotation"]["morphs"] = morphs
|
||||||
if lemmas:
|
if any(lemmas):
|
||||||
example["token_annotation"]["lemmas"] = lemmas
|
example["token_annotation"]["lemmas"] = lemmas
|
||||||
if heads:
|
if any(head is not None for head in heads):
|
||||||
example["token_annotation"]["heads"] = heads
|
example["token_annotation"]["heads"] = heads
|
||||||
if labels:
|
if any(labels):
|
||||||
example["token_annotation"]["deps"] = labels
|
example["token_annotation"]["deps"] = labels
|
||||||
if pos:
|
|
||||||
example["token_annotation"]["pos"] = pos
|
|
||||||
|
|
||||||
cats = {}
|
cats = {}
|
||||||
for cat in paragraph.get("cats", {}):
|
for cat in paragraph.get("cats", {}):
|
||||||
cats[cat["label"]] = cat["value"]
|
cats[cat["label"]] = cat["value"]
|
||||||
entities = []
|
|
||||||
for start, end, label in paragraph.get("entities", {}):
|
|
||||||
ent_tuple = (start, end, label)
|
|
||||||
entities.append(ent_tuple)
|
|
||||||
example["doc_annotation"] = dict(
|
example["doc_annotation"] = dict(
|
||||||
cats=cats,
|
cats=cats,
|
||||||
entities=entities,
|
entities=ner_tags,
|
||||||
links=paragraph.get("links", []) # TODO: fix/test
|
links=paragraph.get("links", []) # TODO: fix/test
|
||||||
)
|
)
|
||||||
yield example
|
yield example
|
||||||
|
|
Loading…
Reference in New Issue
Block a user