From 9c8c4287bf09836a2c9b7ff552337fac559bfe06 Mon Sep 17 00:00:00 2001 From: Gavriel Loria Date: Thu, 6 Dec 2018 09:50:25 -0500 Subject: [PATCH] Accept iob2 and allow generic whitespace (#2999) * accept non-pipe whitespace as delimiter; allow iob2 filename * added small documentation note for IOB2 allowance * added contributor agreement --- spacy/cli/converters/iob2json.py | 8 ++++++-- website/api/cli.jade | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index 5a0e9e046..3f38a6e25 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -7,6 +7,8 @@ from ...compat import json_dumps, path2str from ...util import prints from ...gold import iob_to_biluo +import re + def iob2json(input_path, output_path, n_sents=10, *a, **k): """ @@ -15,7 +17,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k): with input_path.open('r', encoding='utf8') as file_: sentences = read_iob(file_) docs = merge_sentences(sentences, n_sents) - output_filename = input_path.parts[-1].replace(".iob", ".json") + output_filename = (input_path.parts[-1] + .replace(".iob2", ".json") + .replace(".iob", ".json")) output_file = output_path / output_filename with output_file.open('w', encoding='utf-8') as f: f.write(json_dumps(docs)) @@ -28,7 +32,7 @@ def read_iob(raw_sents): for line in raw_sents: if not line.strip(): continue - tokens = [t.split('|') for t in line.split()] + tokens = [re.split('[^\w\-]', line.strip())] if len(tokens[0]) == 3: words, pos, iob = zip(*tokens) else: diff --git a/website/api/cli.jade b/website/api/cli.jade index 28c76c09c..fc6dd86a1 100644 --- a/website/api/cli.jade +++ b/website/api/cli.jade @@ -245,7 +245,7 @@ p The following file format converters are available: +row +cell #[code iob] - +cell IOB named entity recognition format. + +cell IOB or IOB2 named entity recognition format. +h(3, "train") Train