From 9c8c4287bf09836a2c9b7ff552337fac559bfe06 Mon Sep 17 00:00:00 2001
From: Gavriel Loria <gtloria@protonmail.com>
Date: Thu, 6 Dec 2018 09:50:25 -0500
Subject: [PATCH] Accept iob2 and allow generic whitespace (#2999)

* accept any non-word, non-hyphen character (e.g. whitespace or a pipe) as the field delimiter; allow the .iob2 filename extension

* added a short documentation note that IOB2 input is accepted

* added contributor agreement
---
 spacy/cli/converters/iob2json.py | 8 ++++++--
 website/api/cli.jade             | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py
index 5a0e9e046..3f38a6e25 100644
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -7,6 +7,8 @@ from ...compat import json_dumps, path2str
 from ...util import prints
 from ...gold import iob_to_biluo
 
+import re
+
 
 def iob2json(input_path, output_path, n_sents=10, *a, **k):
     """
@@ -15,7 +17,9 @@ def iob2json(input_path, output_path, n_sents=10, *a, **k):
     with input_path.open('r', encoding='utf8') as file_:
         sentences = read_iob(file_)
     docs = merge_sentences(sentences, n_sents)
-    output_filename = input_path.parts[-1].replace(".iob", ".json")
+    output_filename = (input_path.parts[-1]
+                       .replace(".iob2", ".json")
+                       .replace(".iob", ".json"))
     output_file = output_path / output_filename
     with output_file.open('w', encoding='utf-8') as f:
         f.write(json_dumps(docs))
@@ -28,7 +32,7 @@ def read_iob(raw_sents):
     for line in raw_sents:
         if not line.strip():
             continue
-        tokens = [t.split('|') for t in line.split()]
+        tokens = [re.split('[^\w\-]', line.strip())]
         if len(tokens[0]) == 3:
             words, pos, iob = zip(*tokens)
         else:
diff --git a/website/api/cli.jade b/website/api/cli.jade
index 28c76c09c..fc6dd86a1 100644
--- a/website/api/cli.jade
+++ b/website/api/cli.jade
@@ -245,7 +245,7 @@ p The following file format converters are available:
 
     +row
         +cell #[code iob]
-        +cell IOB named entity recognition format.
+        +cell IOB or IOB2 named entity recognition format.
 
 +h(3, "train") Train