Add incomplete iob converter

2025-11-14 23:06:01 +03:00 · 2017-05-19 13:27:51 -05:00 · 2017-05-19 13:27:51 -05:00 · 08766240c3
commit 08766240c3
parent c12ab47a56
1 changed files with 39 additions and 0 deletions
--- a/spacy/cli/converters/iob2json.py
+++ b/spacy/cli/converters/iob2json.py
@@ -0,0 +1,39 @@
+# coding: utf8
+from __future__ import unicode_literals
+
+from ...compat import json_dumps, path2str
+from ...util import prints
+
+
+def iob2json(input_path, output_path, n_sents=10, *a, **k):
+    """
+    Convert IOB files into JSON format for use with train cli.
+
+    input_path: path object of the IOB file to read; it must provide
+        ``open()`` and ``parts``.
+    output_path: directory path object. The output file is created inside
+        it, named after the input file with ".iob" replaced by ".json".
+    n_sents: accepted but currently unused by this converter.
+    *a, **k: extra arguments are accepted and ignored.
+
+    Returns nothing; the JSON is written to disk and a summary is printed.
+    """
+    # TODO: This isn't complete yet -- need to map from IOB to
+    # BILUO
+    # Decode explicitly as UTF-8 so reading does not depend on the
+    # platform's default encoding and agrees with the UTF-8 output below.
+    with input_path.open('r', encoding='utf-8') as file_:
+        docs = read_iob(file_)
+
+    output_filename = input_path.parts[-1].replace(".iob", ".json")
+    output_file = output_path / output_filename
+    with output_file.open('w', encoding='utf-8') as f:
+        f.write(json_dumps(docs))
+    prints("Created %d documents" % len(docs),
+           title="Generated output file %s" % path2str(output_file))
+
+
+def read_iob(file_):
+    """
+    Parse IOB-annotated lines into a list of document dicts.
+
+    file_: iterable of text lines. Each non-blank line is treated as one
+        sentence made of whitespace-separated tokens, each written as
+        ``word|tag|ner``. Blank lines are skipped.
+
+    Returns a list with one document per sentence, shaped as
+    ``{'id': 0, 'paragraphs': [{'sentences': [{'tokens': [...]}]}]}``,
+    where every token is ``{'orth': word, 'tag': tag, 'ner': ner}``.
+    Every document is given the id 0.
+
+    Raises ValueError if a token does not have all three fields.
+    """
+    sentences = []
+    for line_no, line in enumerate(file_, 1):
+        if not line.strip():
+            continue
+        tokens = []
+        for token in line.split():
+            # Split from the right so a '|' inside the word itself is kept
+            # as part of the word rather than treated as a separator.
+            fields = token.rsplit('|', 2)
+            if len(fields) != 3:
+                raise ValueError(
+                    "Malformed IOB token %r on line %d: expected "
+                    "word|tag|ner" % (token, line_no))
+            word, tag, ent = fields
+            tokens.append({'orth': word, 'tag': tag, 'ner': ent})
+        sentences.append({'tokens': tokens})
+    # Each sentence is wrapped in its own paragraph and its own document.
+    return [{'id': 0, 'paragraphs': [{'sentences': [sent]}]}
+            for sent in sentences]