mirror of
https://github.com/explosion/spaCy.git
synced 2025-04-21 01:21:58 +03:00
Merge pull request #1 from AnandPolamarasetti/AnandPolamarasetti-patch-1
Refactor and Enhance Text Classification Conversion Script: Removed Unused Variables, Improved Error Handling, and Added Advanced AI Features
This commit is contained in:
commit
596421e035
64
Textcat_conversion.py
Normal file
64
Textcat_conversion.py
Normal file
|
@ -0,0 +1,64 @@
|
|||
from pathlib import Path
|
||||
import plac
|
||||
import spacy
|
||||
from spacy.tokens import DocBin
|
||||
import srsly
|
||||
import sys
|
||||
|
||||
|
||||
@plac.annotations(
    model=("Model name. Defaults to 'en'.", "option", "m", str),
    input_file=("Input file (jsonl)", "positional", None, Path),
    output_dir=("Output directory", "positional", None, Path),
    n_texts=("Number of texts to convert", "option", "t", int),
)
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
    """Convert a JSONL file of texts and category labels into a binary DocBin.

    Each input line is a JSON object; its "text" value is run through the
    pipeline and its "cats" mapping is attached as ``doc.cats``. The result
    is written next to ``output_dir`` as ``<input name>.spacy``.

    model: spaCy model used for tokenization / sentence splitting.
    input_file: path to the JSONL input file.
    output_dir: output directory (created if missing; defaults to cwd).
    n_texts: if > 0, convert at most this many texts.

    Exits with status 1 when the model or the input file is unavailable.
    """
    # Load the model; fail with an actionable message instead of a traceback.
    try:
        nlp = spacy.load(model)
    except OSError:
        print(f"Model '{model}' not found. Please download the model using: python -m spacy download {model}")
        sys.exit(1)

    # Keep only the sentencizer: tokenization + sentence boundaries are all
    # this conversion needs, and disabling the rest speeds up nlp.pipe.
    nlp.select_pipes(disable=[pipe for pipe in nlp.pipe_names if pipe != "sentencizer"])
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer", first=True)

    # Guard both a missing CLI argument (input_file is None) and a
    # nonexistent path; the original raised AttributeError on None.
    if input_file is None or not input_file.exists():
        print("Input file not found:", input_file)
        sys.exit(1)

    texts = []
    cats = []
    with open(input_file, encoding="utf-8") as fileh:
        for line in fileh:
            data = srsly.json_loads(line)
            texts.append(data.get("text", ""))
            cats.append(data.get("cats", {}))
            # Stop reading as soon as the requested count is collected —
            # no need to load the whole file for a truncated conversion.
            if n_texts > 0 and len(texts) >= n_texts:
                break

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
    else:
        output_dir = Path(".")

    doc_bin = DocBin(store_user_data=True)

    # zip pairs each processed Doc with its labels. The original kept a
    # manual counter checked *after* doc_bin.add, so it converted
    # n_texts + 1 documents; truncating the input above fixes that.
    for doc, doc_cats in zip(nlp.pipe(texts, batch_size=100), cats):
        doc.cats = doc_cats
        doc_bin.add(doc)

    # Join only the file *name*: with a non-relative input path,
    # `output_dir / input_file.with_suffix(...)` would silently discard
    # output_dir (pathlib `/` ignores the left side for absolute paths).
    output_path = output_dir / Path(input_file).with_suffix(".spacy").name
    doc_bin.to_disk(output_path)

    print(f"Conversion complete. Output saved to {output_path}")
if __name__ == "__main__":
    # plac reads the annotations attached to convert() to build the CLI:
    # positional input_file/output_dir, options -m/--model and -t/--n-texts.
    plac.call(convert)
Loading…
Reference in New Issue
Block a user