Prevent writing dummy values (e.g. for deps), because they could interfere with the sent_start values.

This commit is contained in:
svlandeg 2020-06-18 17:47:59 +02:00
parent 0b6d45eae1
commit e822367cf7
2 changed files with 31 additions and 16 deletions

View File

@ -72,7 +72,7 @@ class GoldCorpus(object):
@staticmethod @staticmethod
def read_annotations(locs, limit=0): def read_annotations(locs, limit=0):
""" Yield training examples """ """ Yield training examples as example dicts """
i = 0 i = 0
for loc in locs: for loc in locs:
loc = util.ensure_path(loc) loc = util.ensure_path(loc)

View File

@ -108,15 +108,21 @@ def json_to_annotations(doc):
words.append(token["orth"]) words.append(token["orth"])
spaces.append(token.get("space", True)) spaces.append(token.get("space", True))
ids.append(token.get('id', sent_start_i + i)) ids.append(token.get('id', sent_start_i + i))
tags.append(token.get('tag', "-")) if "tag" in token:
pos.append(token.get("pos", "")) tags.append(token["tag"])
morphs.append(token.get("morph", "")) if "pos" in token:
lemmas.append(token.get("lemma", "")) pos.append(token["pos"])
heads.append(token.get("head", 0) + sent_start_i + i) if "morph" in token:
labels.append(token.get("dep", "")) morphs.append(token["morph"])
# Ensure ROOT label is case-insensitive if "lemma" in token:
if labels[-1].lower() == "root": lemmas.append(token["lemma"])
labels[-1] = "ROOT" if "head" in token:
heads.append(token["head"])
if "dep" in token:
labels.append(token["dep"])
# Ensure ROOT label is case-insensitive
if labels[-1].lower() == "root":
labels[-1] = "ROOT"
if i == 0: if i == 0:
sent_starts.append(1) sent_starts.append(1)
else: else:
@ -130,15 +136,24 @@ def json_to_annotations(doc):
ids=ids, ids=ids,
words=words, words=words,
spaces=spaces, spaces=spaces,
tags=tags,
pos=pos,
morphs=morphs,
lemmas=lemmas,
heads=heads,
deps=labels,
sent_starts=sent_starts, sent_starts=sent_starts,
brackets=brackets brackets=brackets
) )
# avoid including dummy values that look like gold info was present
if tags:
example["token_annotation"]["tags"] = tags
if pos:
example["token_annotation"]["pos"] = pos
if morphs:
example["token_annotation"]["morphs"] = morphs
if lemmas:
example["token_annotation"]["lemmas"] = lemmas
if heads:
example["token_annotation"]["heads"] = heads
if labels:
example["token_annotation"]["deps"] = labels
if pos:
example["token_annotation"]["pos"] = pos
cats = {} cats = {}
for cat in paragraph.get("cats", {}): for cat in paragraph.get("cats", {}):