Handle IOB input with no POS tag in converter

This commit is contained in:
Matthew Honnibal 2017-05-28 08:11:39 -05:00
parent 49235017bf
commit 5cf47b847b

View File

@@ -12,7 +12,7 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
""" """
# TODO: This isn't complete yet -- need to map from IOB to # TODO: This isn't complete yet -- need to map from IOB to
# BILUO # BILUO
with input_path.open() as file_: with input_path.open('r', encoding='utf8') as file_:
docs = read_iob(file_) docs = read_iob(file_)
output_filename = input_path.parts[-1].replace(".iob", ".json") output_filename = input_path.parts[-1].replace(".iob", ".json")
@@ -28,8 +28,12 @@ def read_iob(file_):
for line in file_: for line in file_:
if not line.strip(): if not line.strip():
continue continue
tokens = [t.rsplit('|', 2) for t in line.split()] tokens = [t.split('|') for t in line.split()]
if len(tokens[0]) == 3:
words, pos, iob = zip(*tokens) words, pos, iob = zip(*tokens)
else:
words, iob = zip(*tokens)
pos = ['-'] * len(words)
biluo = iob_to_biluo(iob) biluo = iob_to_biluo(iob)
sentences.append([ sentences.append([
{'orth': w, 'tag': p, 'ner': ent} {'orth': w, 'tag': p, 'ner': ent}