# backend/dock_checker/processor/tasks.py

import os

import shutil
from io import BytesIO
from time import sleep

import fitz
from celery import shared_task
from django.core.files import File
from pdf2image import convert_from_path
from django.core.cache import cache
from pypdf import PdfReader

from dock_checker.processor.models import File as FileModel, FileImage
from ml.main import (
    extract_test_features,
    inference_models,
    create_test_features,
    get_matches,
)


@shared_task
def process_pdf(pk: str):
    """Initialise the cached progress state for a file and start feature extraction.

    Looks up the ``FileModel`` row identified by ``pk``, counts the pages of
    its PDF and stores three cache entries keyed by ``pk``: the page total,
    a "features loaded" flag (initially ``False``) and the processed-page
    counter (initially ``1``). It then enqueues ``extract_pdf_features``.

    Returns:
        The ``pk`` that was passed in.
    """
    document = FileModel.objects.get(pk=pk)
    page_count = len(PdfReader(document.file.path).pages)

    # Seed the progress entries in a fixed order: total, flag, counter.
    initial_state = (
        ("total", page_count),
        ("features_loaded", False),
        ("processed", 1),
    )
    for suffix, value in initial_state:
        cache.set(f"{pk}-{suffix}", value)

    extract_pdf_features.apply_async(kwargs={"pk": pk})
    return pk


@shared_task
def extract_pdf_features(pk: str):
    """Run the ML pipeline on a stored PDF and highlight the matches in place.

    On extraction failure the error flag and description are written to the
    cache under ``{pk}-error`` / ``{pk}-error_description``. On success the
    predicted title and the matched text locations are stored on the model,
    every match is highlighted in the PDF, and the annotated document
    replaces the original file on disk.

    Regardless of the outcome the ``{pk}-features_loaded`` cache flag is set
    to ``True`` and the ``split_pdf_into_images`` and ``load_pdf`` tasks are
    enqueued.

    Returns:
        The ``pk`` that was passed in.
    """
    file = FileModel.objects.get(pk=pk)
    data, status = extract_test_features(file.file.path)
    if not status:
        # On failure ``data`` carries the error description.
        print(data)
        cache.set(f"{pk}-error", True)
        cache.set(f"{pk}-error_description", data)
    else:
        # TODO: create new file for download
        data = create_test_features(data)
        _, target = inference_models("ml/checkpoints/models.pkl", data)
        text_locations = get_matches(file.file.path, target)
        file.ideal_title = target
        file.text_locations = text_locations

        # Annotate into an in-memory buffer first: the document is opened
        # from the very path we are about to overwrite, so it has to be
        # fully serialised and closed before the file is rewritten.
        output_buffer = BytesIO()
        pdf_doc = fitz.open(file.file.path)
        try:
            for loc in text_locations:
                # ``loc["page"]`` is used as a 1-based page number.
                page = pdf_doc[loc["page"] - 1]
                for rect in page.search_for(loc["raw_text"]):
                    page.add_highlight_annot(rect)
            # Previously the buffer was never filled, so the original PDF
            # was replaced with an empty file.
            pdf_doc.save(output_buffer)
        finally:
            pdf_doc.close()

        with open(file.file.path, mode="wb") as f:
            f.write(output_buffer.getbuffer())

        file.save()
    cache.set(f"{pk}-features_loaded", True)
    split_pdf_into_images.apply_async(kwargs={"pk": pk})
    load_pdf.apply_async(kwargs={"pk": pk})
    # create_processed_pdf.apply_async(kwargs={"pk": pk})
    return pk


@shared_task
def update_pdf_features(pk: str, target: str):
    """Recompute the text matches of a file for a caller-supplied title.

    The ``{pk}-features_loaded`` cache flag is cleared while the task runs
    and set again when it finishes. If feature extraction reports failure,
    the error is printed and recorded in the cache; otherwise ``target`` is
    stored as the ideal title together with the locations returned by
    ``get_matches`` and the model is saved.

    Returns:
        The ``pk`` that was passed in.
    """
    document = FileModel.objects.get(pk=pk)
    loaded_key = f"{pk}-features_loaded"
    cache.set(loaded_key, False)

    # Only the status matters on success; the payload is used as the error
    # description on failure.
    payload, ok = extract_test_features(document.file.path)
    if ok:
        # TODO: create new file for download
        locations = get_matches(document.file.path, target)
        document.ideal_title = target
        document.text_locations = locations
        document.save()
    else:
        print(payload)
        cache.set(f"{pk}-error", True)
        cache.set(f"{pk}-error_description", payload)

    cache.set(loaded_key, True)
    return pk


# @shared_task
# def create_processed_pdf(pk: str):
#     file = FileModel.objects.get(pk=pk)
#     f_path = "processed_" + file.file.path.split("/")[-1]
#     shutil.copy(file.file.path, f_path)
#
#     for loc in file.text_locations:
#         highlight_pdf(f_path, loc["raw_text"], page=loc["page"] - 1)
#
#     os.remove(f_path)


@shared_task
def split_pdf_into_images(pk: str):
    """Convert the PDF of file ``pk`` into PNG images in a folder named ``pk``.

    The folder is created relative to the current working directory and
    must not exist yet (``os.mkdir`` raises otherwise). The paths returned
    by ``convert_from_path`` are discarded; the images are picked up from
    the folder by other code.

    Returns:
        The ``pk`` that was passed in.
    """
    document = FileModel.objects.get(pk=pk)
    target_dir = str(pk)
    os.mkdir(target_dir)
    convert_from_path(
        document.file.path,
        fmt="png",
        paths_only=True,
        output_folder=target_dir,
    )
    return pk


def get_file(pk: str, number: int):
    """Return the image file name for page ``number`` once it looks complete.

    The folder ``str(pk)`` is scanned once and every entry is indexed by the
    integer found between the last ``-`` and the following ``.`` of its name.

    A page is handed out in two situations:

    * ``number`` equals the number of entries in the folder; the function
      then waits one second before returning (presumably to let a file that
      is still being written finish -- NOTE(review): confirm intent);
    * an entry for page ``number + 1`` already exists.

    Args:
        pk: Identifier whose string form is the folder to scan.
        number: Page number to look up.

    Returns:
        The file name (without the folder) or ``False`` when the page is
        not available yet.

    Raises:
        KeyError: If the availability condition holds but no entry for
            ``number`` exists.
        ValueError: If an entry name does not contain a page number in the
            expected position.
    """
    # Take a single snapshot of the folder. It may be filled concurrently,
    # and listing it twice could report a count that is out of sync with
    # the index built from the first listing.
    entries = os.listdir(str(pk))
    pages = {int(name.split("-")[-1].split(".")[0]): name for name in entries}

    if number == len(entries):
        sleep(1)
        return pages[number]
    if number + 1 in pages:
        return pages[number]

    return False


@shared_task
def load_pdf(pk: str):
    """Store the rendered page images of file ``pk`` as ``FileImage`` rows.

    The task polls: if the image folder ``str(pk)`` does not exist yet, or
    ``get_file`` reports that the next page is not available, it re-enqueues
    itself with a one-second countdown and returns ``None``. Progress is
    kept in the cache under ``{pk}-processed`` so that a re-run resumes at
    the page it stopped on. Once every page up to ``{pk}-total`` has been
    stored, the image folder is removed.

    Returns:
        ``pk`` when all pages were stored, ``None`` when the task
        rescheduled itself.
    """
    document = FileModel.objects.get(pk=pk)
    folder = str(pk)
    retry_options = {"kwargs": {"pk": pk}, "countdown": 1}

    if not os.path.isdir(folder):
        # Nothing has been rendered yet; try again shortly.
        load_pdf.apply_async(**retry_options)
        return

    first_page = cache.get(f"{pk}-processed")
    last_page = cache.get(f"{pk}-total")
    for page_no in range(first_page, last_page + 1):
        # Record the page before handling it so a retry restarts here.
        cache.set(f"{pk}-processed", page_no)
        image_name = get_file(pk, page_no)
        if not image_name:
            load_pdf.apply_async(**retry_options)
            return

        with open("/".join((folder, image_name)), "rb") as handle:
            FileImage.objects.create(
                image=File(handle, name=f"{pk}-{page_no}.png"),
                file=document,
                order=page_no,
            )
            print(page_no)

    shutil.rmtree(folder)
    return pk