diff --git a/spacy/cli/converters/__init__.py b/spacy/cli/converters/__init__.py
index 9dcbf5b13..e44ad407d 100644
--- a/spacy/cli/converters/__init__.py
+++ b/spacy/cli/converters/__init__.py
@@ -1,4 +1,4 @@
 from .conllu2json import conllu2json  # noqa: F401
-from .iob2json import iob2json  # noqa: F401
+from .iob2json import iob2docs  # noqa: F401
-from .conll_ner2json import conll_ner2json  # noqa: F401
+from .conll_ner2json import conll_ner2doc  # noqa: F401
-from .jsonl2json import ner_jsonl2json  # noqa: F401
+from .jsonl2json import ner_jsonl2docs  # noqa: F401
diff --git a/spacy/cli/converters/conll_ner2json.py b/spacy/cli/converters/conll_ner2json.py
index b607d5913..8d4139bde 100644
--- a/spacy/cli/converters/conll_ner2json.py
+++ b/spacy/cli/converters/conll_ner2json.py
@@ -3,15 +3,16 @@
 from wasabi import Printer
 
-from ...gold import iob_to_biluo
+from ...gold import iob_to_biluo, tags_to_entities
 from ...lang.xx import MultiLanguage
 from ...tokens.doc import Doc
+from ...tokens import Span
 from ...util import load_model
 
 
-def conll_ner2json(
+def conll_ner2doc(
     input_data, n_sents=10, seg_sents=False, model=None, no_print=False, **kwargs
 ):
     """
     Convert files in the CoNLL-2003 NER format and similar
-    whitespace-separated columns into JSON format for use with train cli.
+    whitespace-separated columns into Doc objects.
 
     The first column is the tokens, the final column is the IOB tags. If an
     additional second column is present, the second column is the tags.
@@ -81,17 +82,25 @@ def conll_ner2json(
             "No document delimiters found. Use `-n` to automatically group "
             "sentences into documents."
         )
+
+    if model:
+        nlp = load_model(model)
+    else:
+        nlp = MultiLanguage()
     output_docs = []
-    for doc in input_data.strip().split(doc_delimiter):
-        doc = doc.strip()
-        if not doc:
+    for conll_doc in input_data.strip().split(doc_delimiter):
+        conll_doc = conll_doc.strip()
+        if not conll_doc:
             continue
-        output_doc = []
-        for sent in doc.split("\n\n"):
-            sent = sent.strip()
-            if not sent:
+        words = []
+        sent_starts = []
+        pos_tags = []
+        biluo_tags = []
+        for conll_sent in conll_doc.split("\n\n"):
+            conll_sent = conll_sent.strip()
+            if not conll_sent:
                 continue
-            lines = [line.strip() for line in sent.split("\n") if line.strip()]
+            lines = [line.strip() for line in conll_sent.split("\n") if line.strip()]
             cols = list(zip(*[line.split() for line in lines]))
             if len(cols) < 2:
                 raise ValueError(
@@ -99,25 +108,19 @@ def conll_ner2json(
                     "Try checking whitespace and delimiters. See "
                     "https://spacy.io/api/cli#convert"
                 )
-            words = cols[0]
-            iob_ents = cols[-1]
-            if len(cols) > 2:
-                tags = cols[1]
-            else:
-                tags = ["-"] * len(words)
-            biluo_ents = iob_to_biluo(iob_ents)
-            output_doc.append(
-                {
-                    "tokens": [
-                        {"orth": w, "tag": tag, "ner": ent}
-                        for (w, tag, ent) in zip(words, tags, biluo_ents)
-                    ]
-                }
-            )
-        output_docs.append(
-            {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]}
-        )
-        output_doc = []
+            length = len(cols[0])
+            words.extend(cols[0])
+            sent_starts.extend([True] + [False] * (length - 1))
+            biluo_tags.extend(iob_to_biluo(cols[-1]))
+            pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length)
+
+        doc = Doc(nlp.vocab, words=words)
+        for i, token in enumerate(doc):
+            token.tag_ = pos_tags[i]
+            token.is_sent_start = sent_starts[i]
+        entities = tags_to_entities(biluo_tags)
+        doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
+        output_docs.append(doc)
     return output_docs
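Note: both converters in this patch build entities the same way. `tags_to_entities` returns `(label, start, end)` triples whose end index is inclusive, while `Span` takes an exclusive end, which is why the new code passes `e + 1`. A minimal sketch of the conversion, with an invented sample sentence; the imports mirror this branch:

    from spacy.gold import iob_to_biluo, tags_to_entities
    from spacy.tokens import Doc, Span
    from spacy.vocab import Vocab

    words = ["Alex", "visited", "New", "York"]
    iob = ["B-PER", "O", "B-LOC", "I-LOC"]

    doc = Doc(Vocab(), words=words)
    biluo = iob_to_biluo(iob)           # ["U-PER", "O", "B-LOC", "L-LOC"]
    entities = tags_to_entities(biluo)  # [("PER", 0, 0), ("LOC", 2, 3)]
    # tags_to_entities yields inclusive token offsets; Span ends are
    # exclusive, hence e + 1.
    doc.ents = [Span(doc, start=s, end=e + 1, label=L) for L, s, e in entities]
    assert [(ent.text, ent.label_) for ent in doc.ents] == [("Alex", "PER"), ("New York", "LOC")]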
See " "https://spacy.io/api/cli#convert" ) - words = cols[0] - iob_ents = cols[-1] - if len(cols) > 2: - tags = cols[1] - else: - tags = ["-"] * len(words) - biluo_ents = iob_to_biluo(iob_ents) - output_doc.append( - { - "tokens": [ - {"orth": w, "tag": tag, "ner": ent} - for (w, tag, ent) in zip(words, tags, biluo_ents) - ] - } - ) - output_docs.append( - {"id": len(output_docs), "paragraphs": [{"sentences": output_doc}]} - ) - output_doc = [] + length = len(cols[0]) + words.extend(cols[0]) + sent_stats.extend([True] + [False] * (length - 1)) + biluo_tags.extend(iob_to_biluo(cols[-1])) + pos_tags.extend(cols[1] if len(cols) > 2 else ["-"] * length) + + doc = Doc(nlp.vocab, words=words) + for i, token in enumerate(doc): + token.tag_ = pos_tags[i] + token.is_sent_start = sent_starts[i] + entities = tags_to_entities(biluo_tags) + doc.ents = [Span(doc, start=s, end=e+1, label=L) for L, s, e in entities] + output_docs.append(doc) return output_docs diff --git a/spacy/cli/converters/iob2json.py b/spacy/cli/converters/iob2json.py index b6ac234fc..2addc1af4 100644 --- a/spacy/cli/converters/iob2json.py +++ b/spacy/cli/converters/iob2json.py @@ -1,14 +1,15 @@ from wasabi import Printer -from ...gold import iob_to_biluo +from ...gold import iob_to_biluo, tags_to_entities from ...util import minibatch +from .util import merge_sentences from .conll_ner2json import n_sents_info -def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): +def iob2docs(input_data, n_sents=10, no_print=False, *args, **kwargs): """ Convert IOB files with one sentence per line and tags separated with '|' - into JSON format for use with train cli. IOB and IOB2 are accepted. + into Doc objects so they can be saved. IOB and IOB2 are accepted. Sample formats: @@ -26,40 +27,25 @@ def iob2json(input_data, n_sents=10, no_print=False, *args, **kwargs): def read_iob(raw_sents): - sentences = [] + docs = [] for line in raw_sents: if not line.strip(): continue tokens = [t.split("|") for t in line.split()] if len(tokens[0]) == 3: - words, pos, iob = zip(*tokens) + words, tags, iob = zip(*tokens) elif len(tokens[0]) == 2: words, iob = zip(*tokens) - pos = ["-"] * len(words) + tags = ["-"] * len(words) else: raise ValueError( "The sentence-per-line IOB/IOB2 file is not formatted correctly. Try checking whitespace and delimiters. 
diff --git a/spacy/cli/converters/jsonl2json.py b/spacy/cli/converters/jsonl2json.py
index 525063b22..8639a11b9 100644
--- a/spacy/cli/converters/jsonl2json.py
+++ b/spacy/cli/converters/jsonl2json.py
@@ -4,15 +4,16 @@
 from ...gold import docs_to_json
 from ...util import get_lang_class, minibatch
 
 
-def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_):
+def ner_jsonl2docs(input_data, lang=None, n_sents=10, use_morphology=False, **_):
     if lang is None:
         raise ValueError("No --lang specified, but tokenization required")
-    json_docs = []
+    docs = []
     input_examples = [srsly.json_loads(line) for line in input_data.strip().split("\n")]
     nlp = get_lang_class(lang)()
     sentencizer = nlp.create_pipe("sentencizer")
     for i, batch in enumerate(minibatch(input_examples, size=n_sents)):
-        docs = []
+        # TODO: Should we be merging these? We're disrespecting the n_sents
+        # currently.
         for record in batch:
             raw_text = record["text"]
             if "entities" in record:
@@ -25,8 +26,7 @@ def ner_jsonl2json(input_data, lang=None, n_sents=10, use_morphology=False, **_)
             spans = [doc.char_span(s, e, label=L) for s, e, L in ents]
             doc.ents = _cleanup_spans(spans)
             docs.append(doc)
-        json_docs.append(docs_to_json(docs, id=i))
-    return json_docs
+    return docs
 
 
 def _cleanup_spans(spans):
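The converters now return `Doc` objects "so they can be saved"; one plausible way to do that is spaCy's `DocBin` (available since v2.2). A sketch, assuming the `conll_ner2doc` introduced above and an invented output filename:

    from spacy.tokens import DocBin

    docs = conll_ner2doc(input_data)  # list of Doc objects, as above
    doc_bin = DocBin(attrs=["TAG", "SENT_START", "ENT_IOB", "ENT_TYPE"])
    for doc in docs:
        doc_bin.add(doc)
    with open("ner_corpus.spacy", "wb") as f:  # hypothetical path
        f.write(doc_bin.to_bytes())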