from pathlib import Path
import plac
import spacy
from spacy.tokens import DocBin
import srsly
import sys


@plac.annotations(
    model=("Model name. Defaults to 'en'.", "option", "m", str),
    input_file=("Input file (jsonl)", "positional", None, Path),
    output_dir=("Output directory", "positional", None, Path),
    n_texts=("Number of texts to convert", "option", "t", int),
)
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
    """Convert a JSONL file of {"text": ..., "cats": ...} records into a
    spaCy DocBin (.spacy) file for text-classification training.

    model: name of the spaCy model used for tokenization (only the
        sentencizer component is kept).
    input_file: path to the input JSONL file; each line is one record.
    output_dir: directory for the .spacy output. Created if missing;
        defaults to the current directory.
    n_texts: if > 0, convert at most this many records.

    Exits with status 1 when the model or the input file is missing.
    """
    # Guard before loading the model: the original dereferenced a None
    # input_file (`input_file.exists()`) and would raise AttributeError
    # instead of printing its error message.
    if input_file is None or not input_file.exists():
        print("Input file not found:", input_file)
        sys.exit(1)

    # Load the model, keeping only tokenizer + sentencizer: the other
    # pipeline components are not needed for conversion and slow it down.
    try:
        nlp = spacy.load(model)
    except OSError:
        print(f"Model '{model}' not found. Please download the model using: python -m spacy download {model}")
        sys.exit(1)

    nlp.select_pipes(disable=[pipe for pipe in nlp.pipe_names if pipe != "sentencizer"])
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer", first=True)

    texts = []
    cats = []
    with open(input_file, encoding="utf-8") as fileh:
        for line in fileh:
            # Tolerate blank lines, which would crash the JSON parser.
            if not line.strip():
                continue
            data = srsly.json_loads(line)
            texts.append(data.get("text", ""))
            cats.append(data.get("cats", {}))
            # Enforce the limit while reading. The original checked the
            # counter *after* adding the doc (and before incrementing),
            # converting n_texts + 1 records instead of n_texts, and still
            # tokenized every text in the file.
            if n_texts > 0 and len(texts) >= n_texts:
                break

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    else:
        output_dir = Path(".")

    # DocBin stores docs compactly; store_user_data preserves doc.cats.
    doc_bin = DocBin(store_user_data=True)
    for doc, doc_cats in zip(nlp.pipe(texts, batch_size=100), cats):
        doc.cats = doc_cats
        doc_bin.add(doc)

    # Join only the file *name*: the original joined the full input path
    # (output_dir / "data/train.spacy"), pointing the output into a
    # possibly nonexistent subdirectory of output_dir.
    output_path = output_dir / input_file.with_suffix(".spacy").name
    doc_bin.to_disk(output_path)

    print(f"Conversion complete. Output saved to {output_path}")


if __name__ == "__main__":
    plac.call(convert)