mirror of
https://github.com/explosion/spaCy.git
synced 2024-11-11 12:18:04 +03:00
56 lines
1.5 KiB
Python
56 lines
1.5 KiB
Python
from pathlib import Path
|
|
import plac
|
|
import spacy
|
|
from spacy.gold import docs_to_json
|
|
import srsly
|
|
import sys
|
|
|
|
|
|
@plac.annotations(
|
|
model=("Model name. Defaults to 'en'.", "option", "m", str),
|
|
input_file=("Input file (jsonl)", "positional", None, Path),
|
|
output_dir=("Output directory", "positional", None, Path),
|
|
n_texts=("Number of texts to convert", "option", "t", int),
|
|
)
|
|
def convert(model="en", input_file=None, output_dir=None, n_texts=0):
|
|
# Load model with tokenizer + sentencizer only
|
|
nlp = spacy.load(model)
|
|
nlp.select_pipes(disable=nlp.pipe_names)
|
|
sentencizer = nlp.create_pipe("sentencizer")
|
|
nlp.add_pipe(sentencizer, first=True)
|
|
|
|
texts = []
|
|
cats = []
|
|
count = 0
|
|
|
|
if not input_file.exists():
|
|
print("Input file not found:", input_file)
|
|
sys.exit(1)
|
|
else:
|
|
with open(input_file) as fileh:
|
|
for line in fileh:
|
|
data = srsly.json_loads(line)
|
|
texts.append(data["text"])
|
|
cats.append(data["cats"])
|
|
|
|
if output_dir is not None:
|
|
output_dir = Path(output_dir)
|
|
if not output_dir.exists():
|
|
output_dir.mkdir()
|
|
else:
|
|
output_dir = Path(".")
|
|
|
|
docs = []
|
|
for i, doc in enumerate(nlp.pipe(texts)):
|
|
doc.cats = cats[i]
|
|
docs.append(doc)
|
|
if n_texts > 0 and count == n_texts:
|
|
break
|
|
count += 1
|
|
|
|
srsly.write_json(output_dir / input_file.with_suffix(".json"), [docs_to_json(docs)])
|
|
|
|
|
|
if __name__ == "__main__":
|
|
plac.call(convert)
|