From 5cf47b847ba50dc04253d77b65cf63a9b7347890 Mon Sep 17 00:00:00 2001 From: Matthew Honnibal Date: Sun, 28 May 2017 08:11:39 -0500 Subject: [PATCH] Handle iob with no tag in converter --- spacy/cli/converters/iob2json.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index c2e944c0a..4849345e9 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -12,7 +12,7 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): """ # TODO: This isn't complete yet -- need to map from IOB to # BILUO - with input_path.open() as file_: + with input_path.open('r', encoding='utf8') as file_: docs = read_iob(file_) output_filename = input_path.parts[-1].replace(".iob", ".json") @@ -28,8 +28,12 @@ def read_iob(file_): for line in file_: if not line.strip(): continue - tokens = [t.rsplit('|', 2) for t in line.split()] - words, pos, iob = zip(*tokens) + tokens = [t.split('|') for t in line.split()] + if len(tokens[0]) == 3: + words, pos, iob = zip(*tokens) + else: + words, iob = zip(*tokens) + pos = ['-'] * len(words) biluo = iob_to_biluo(iob) sentences.append([ {'orth': w, 'tag': p, 'ner': ent}