backend/dock_checker/processor/services.py

172 lines
5.6 KiB
Python

from typing import Tuple
from io import BytesIO
import re
import fitz
from django.core.cache import cache
from rest_framework.exceptions import NotFound
def get_task_status(pk: str) -> dict:
if cache.get(f"{pk}-processed") is None:
raise NotFound("given task does not exist")
created = cache.get_or_set(f"{pk}-processed", 0)
total = cache.get_or_set(f"{pk}-total", 0)
features_loaded = cache.get_or_set(f"{pk}-features_loaded", False)
error = cache.get_or_set(f"{pk}-error", False)
error_description = cache.get_or_set(f"{pk}-error_description", "")
return {
"processed": created,
"total": total,
"features_loaded": features_loaded,
"error": error,
"error_description": error_description,
}
def extract_info(input_file: str):
"""
Extracts file info
"""
# Open the PDF
pdfDoc = fitz.open(input_file)
output = {
"File": input_file,
"Encrypted": ("True" if pdfDoc.isEncrypted else "False"),
}
# If PDF is encrypted the file metadata cannot be extracted
if not pdfDoc.isEncrypted:
for key, value in pdfDoc.metadata.items():
output[key] = value
# To Display File Info
print("## File Information ##################################################")
print("\n".join("{}:{}".format(i, j) for i, j in output.items()))
print("######################################################################")
return True, output
def search_for_text(lines, search_str):
"""
Search for the search string within the document lines
"""
if search_str in lines:
return search_str
def redact_matching_data(page, matched_values):
"""
Redacts matching values
"""
matches_found = 0
# Loop throughout matching values
for val in matched_values:
matches_found += 1
matching_val_area = page.search_for(val)
# Redact matching values
[
page.addRedactAnnot(area, text=" ", fill=(0, 0, 0))
for area in matching_val_area
]
# Apply the redaction
page.apply_redactions()
return matches_found
def frame_matching_data(page, matched_values):
"""
frames matching values
"""
matches_found = 0
# Loop throughout matching values
for val in matched_values:
matches_found += 1
matching_val_area = page.search_for(val)
for area in matching_val_area:
if isinstance(area, fitz.fitz.Rect):
# Draw a rectangle around matched values
annot = page.addRectAnnot(area)
# , fill = fitz.utils.getColor('black')
annot.setColors(stroke=fitz.utils.getColor("red"))
# If you want to remove matched data
# page.addFreetextAnnot(area, ' ')
annot.update()
return matches_found
def highlight_matching_data(page, matched_values, type):
"""
Highlight matching values
"""
matches_found = 0
# Loop throughout matching values
for val in matched_values:
matches_found += 1
matching_val_area = page.search_for(val)
# print("matching_val_area",matching_val_area)
highlight = None
if type == "Highlight":
highlight = page.add_highlight_annot(matching_val_area)
elif type == "Squiggly":
highlight = page.add_squiggly_annot(matching_val_area)
elif type == "Underline":
highlight = page.add_underline_annot(matching_val_area)
elif type == "Strikeout":
highlight = page.add_strikeout_annot(matching_val_area)
else:
highlight = page.add_highlight_annot(matching_val_area)
# To change the highlight colar
# highlight.setColors({"stroke":(0,0,1),"fill":(0.75,0.8,0.95) })
# highlight.setColors(stroke = fitz.utils.getColor('white'), fill = fitz.utils.getColor('red'))
# highlight.setColors(colors= fitz.utils.getColor('red'))
highlight.update()
return matches_found
def process_data(
input_file: str,
search_str: str,
pages: Tuple = None,
action: str = "Highlight",
):
"""
Process the pages of the PDF File
"""
# Open the PDF
pdfDoc = fitz.open(input_file)
# Save the generated PDF to memory buffer
output_buffer = BytesIO()
total_matches = 0
# Iterate through pages
for pg in range(len(pdfDoc)):
# If required for specific pages
if pages:
if str(pg) not in pages:
continue
# Select the page
page = pdfDoc[pg]
# Get Matching Data
# Split page by lines
page_lines = page.get_text("text")
matched_values = search_for_text(page_lines, search_str)
if matched_values:
if action == "Redact":
matches_found = redact_matching_data(page, matched_values)
elif action == "Frame":
matches_found = frame_matching_data(page, matched_values)
elif action in ("Highlight", "Squiggly", "Underline", "Strikeout"):
matches_found = highlight_matching_data(page, matched_values, action)
else:
matches_found = highlight_matching_data(
page, matched_values, "Highlight"
)
total_matches += matches_found
print(
f"{total_matches} Match(es) Found of Search String {search_str} In Input File: {input_file}"
)
# Save to output
pdfDoc.save(output_buffer)
pdfDoc.close()
# Save the output buffer to the output file
with open(input_file, mode="wb") as f:
f.write(output_buffer.getbuffer())