spaCy/examples/training/textcat_example_data/textcatjsonl_to_trainjson.py
Sofie Van Landeghem 0d94737857
Feature toggle_pipes (#5378)
* make disable_pipes deprecated in favour of the new toggle_pipes

* rewrite disable_pipes statements

* update documentation

* remove bin/wiki_entity_linking folder

* one more fix

* remove deprecated link to documentation

* few more doc fixes

* add note about name change to the docs

* restore original disable_pipes

* small fixes

* fix typo

* fix error number to W096

* rename to select_pipes

* also make changes to the documentation

Co-authored-by: Matthew Honnibal <honnibal+gh@gmail.com>
2020-05-18 22:27:10 +02:00

56 lines
1.5 KiB
Python

from pathlib import Path
import plac
import spacy
from spacy.gold import docs_to_json
import srsly
import sys
@plac.annotations(
model=("Model name. Defaults to 'en'.", "option", "m", str),
input_file=("Input file (jsonl)", "positional", None, Path),
output_dir=("Output directory", "positional", None, Path),
n_texts=("Number of texts to convert", "option", "t", int),
)
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
# Load model with tokenizer + sentencizer only
nlp = spacy.load(model)
nlp.select_pipes(disable=nlp.pipe_names)
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer, first=True)
texts = []
cats = []
count = 0
if not input_file.exists():
print("Input file not found:", input_file)
sys.exit(1)
else:
with open(input_file) as fileh:
for line in fileh:
data = srsly.json_loads(line)
texts.append(data["text"])
cats.append(data["cats"])
if output_dir is not None:
output_dir = Path(output_dir)
if not output_dir.exists():
output_dir.mkdir()
else:
output_dir = Path(".")
docs = []
for i, doc in enumerate(nlp.pipe(texts)):
doc.cats = cats[i]
docs.append(doc)
if n_texts > 0 and count == n_texts:
break
count += 1
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
if __name__ == "__main__":
plac.call(convert)