added xlsx parse, ip lookup

This commit is contained in:
Alexander Karpov 2023-09-08 23:49:29 +03:00
parent 59234a5f2c
commit 58d2efa707
9 changed files with 134 additions and 12 deletions

View File

@ -11,9 +11,15 @@
default="LnQa85vE4W235BIYizBVnsOPJOfrBxjpdrgWtmDNaUMEIbDCxwySRuyp4hpmJMZ2",
)
# https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts
ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1", "192.168.107.4"]
ALLOWED_HOSTS = [
"localhost",
"0.0.0.0",
"127.0.0.1",
"192.168.103.224",
"192.168.107.4",
]
CORS_ORIGIN_ALLOW_ALL = True
CSRF_TRUSTED_ORIGINS = ["http://192.168.107.4"]
CSRF_TRUSTED_ORIGINS = ["http://192.168.103.224", "http://192.168.107.4"]
# WhiteNoise
# ------------------------------------------------------------------------------

45
poetry.lock generated
View File

@ -1070,6 +1070,18 @@ files = [
{file = "ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1"},
]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
description = "An implementation of lxml.xmlfile for the standard library"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]
[[package]]
name = "executing"
version = "1.2.0"
@ -1786,6 +1798,21 @@ files = [
{file = "olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964"},
]
[[package]]
name = "openpyxl"
version = "3.1.2"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]]
name = "packaging"
version = "23.1"
@ -2280,6 +2307,22 @@ files = [
[package.dependencies]
pylint = ">=1.7"
[[package]]
name = "pytesseract"
version = "0.3.10"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
]
[package.dependencies]
packaging = ">=21.3"
Pillow = ">=8.0.0"
[[package]]
name = "pytest"
version = "7.4.2"
@ -3310,4 +3353,4 @@ files = [
[metadata]
lock-version = "2.0"
python-versions = "^3.11"
content-hash = "5bcd52daf4504209b9936143d506ff50fae1f094bae17686a71ddd01efb9c49f"
content-hash = "fa7686c29a2d587dbafb35cec00bf4da4424047410560854cec79f904dc9de97"

View File

@ -1,4 +1,5 @@
from django.core.files.uploadedfile import InMemoryUploadedFile, TemporaryUploadedFile
from drf_spectacular.utils import extend_schema_field
from rest_framework import serializers
from press_release_nl.processor.models import Entry, Text
@ -65,14 +66,22 @@ def create(self, validated_data):
class ProcessedTextSerializer(serializers.ModelSerializer):
class Meta:
model = Text
fields = ["text", "score"]
fields = ["id", "summery", "text", "score"]
class EntrySerializer(serializers.ModelSerializer):
texts = ProcessedTextSerializer(many=True)
done = serializers.IntegerField(source="texts_done_count")
count = serializers.IntegerField(source="texts_count")
texts = serializers.SerializerMethodField(method_name="get_texts")
current = serializers.IntegerField(source="texts_done_count")
total = serializers.IntegerField(source="texts_count")
@extend_schema_field(ProcessedTextSerializer(many=True))
def get_texts(self, obj: Entry):
    """Serialize the entry's texts, optionally narrowed to a single text.

    When the request carries a non-empty ``id`` query parameter, only the
    related text with that primary key is serialized (an empty list if the
    entry has no such text). Otherwise every related text is serialized.

    Args:
        obj: the ``Entry`` whose ``texts`` relation is being rendered.

    Returns:
        A list of ``ProcessedTextSerializer`` representations.

    Raises:
        serializers.ValidationError: if ``id`` is present but is not an
            integer. Previously such a value reached the ORM filter and
            surfaced as an unhandled ``ValueError``.
    """
    # Named ``text_id`` so the ``id`` builtin is not shadowed.
    text_id = self.context["request"].query_params.get("id")
    texts = obj.texts.all()
    if text_id:
        try:
            text_id = int(text_id)
        except ValueError as err:
            raise serializers.ValidationError(
                {"id": "A valid integer is required."}
            ) from err
        texts = texts.filter(id=text_id)
    return ProcessedTextSerializer(many=True).to_representation(texts)
class Meta:
model = Entry
fields = ["texts", "done", "count", "created"]
fields = ["texts", "current", "total", "created"]

View File

@ -1,3 +1,4 @@
from drf_spectacular.utils import OpenApiParameter, extend_schema, extend_schema_view
from rest_framework import generics, parsers, permissions, status
from rest_framework.response import Response
@ -25,6 +26,9 @@ def create(self, request, *args, **kwargs):
)
@extend_schema_view(
get=extend_schema(parameters=[OpenApiParameter(name="id", type=int)])
)
class RetrieveEntryApiView(generics.RetrieveAPIView):
queryset = Entry.objects.all()
permission_classes = [permissions.AllowAny]

View File

@ -0,0 +1,23 @@
# Generated by Django 4.2.5 on 2023-09-08 18:42
from django.db import migrations, models
class Migration(migrations.Migration):
    """Schema changes for the ``text`` model of the ``processor`` app.

    Adds the ``summery`` column and redefines the ``file`` column.
    """

    # Applied on top of the app's initial migration.
    dependencies = [
        ("processor", "0001_initial"),
    ]

    operations = [
        # New text column; empty values and NULL are both permitted.
        # NOTE(review): Django documents ``max_length`` on a TextField as
        # affecting form widgets only, not the database column - confirm the
        # 2000-character limit is enforced wherever it actually matters.
        migrations.AddField(
            model_name="text",
            name="summery",
            field=models.TextField(blank=True, max_length=2000, null=True),
        ),
        # ``file`` is declared as an optional upload stored under "uploads/".
        migrations.AlterField(
            model_name="text",
            name="file",
            field=models.FileField(blank=True, null=True, upload_to="uploads/"),
        ),
    ]

View File

@ -12,7 +12,7 @@ def __str__(self):
@property
def texts_done_count(self):
return len(self.texts.filter(score__isnull=False))
return len(self.texts.filter(score__isnull=False, summery__isnull=False))
@property
def texts_count(self):
@ -21,6 +21,7 @@ def texts_count(self):
class Text(models.Model):
entry = models.ForeignKey("Entry", related_name="texts", on_delete=models.CASCADE)
summery = models.TextField(max_length=2000, blank=True, null=True)
file = models.FileField(blank=True, null=True, upload_to="uploads/")
text = models.TextField(blank=True, null=True, max_length=25_000)
score = models.JSONField(null=True)

View File

@ -1,11 +1,25 @@
import openpyxl
from django.db.models.signals import post_save
from django.dispatch import receiver
from press_release_nl.processor.models import Text
from press_release_nl.processor.tasks import load_text
from press_release_nl.processor.tasks import load_text, load_text_sum
@receiver(post_save, sender=Text)
def run_text_process(sender, instance: Text, created, **kwargs):
if created:
load_text.apply_async(kwargs={"pk": instance.pk}, countdown=1)
if instance.file and instance.file.path.endswith("xlsx"):
wb_obj = openpyxl.load_workbook(instance.file.path)
sheet = wb_obj.worksheets[0]
for column in sheet.iter_cols():
column_name = column[0].value
if column_name == "pr_txt":
for text in column:
text = text.value
if text and text != "pr_txt":
Text.objects.create(entry=instance.entry, text=text)
instance.delete()
return
load_text.apply_async(kwargs={"pk": instance.pk}, countdown=2)
load_text_sum.apply_async(kwargs={"pk": instance.pk}, countdown=4)

View File

@ -1,3 +1,5 @@
from time import sleep
import requests
import textract
from celery import shared_task
@ -5,13 +7,16 @@
from press_release_nl.processor.models import Text
ML_HOST = "https://2b6a-176-59-106-6.ngrok-free.app/"
ML_SUM_HOST = "https://dev.akarpov.ru/"
@shared_task
def load_text(pk: int):
text = Text.objects.get(pk=pk)
if not text.text:
text.text = textract.process(text.file.path, encoding="unicode_escape").decode()
text.text = textract.process(
text.file.path, encoding="unicode_escape", language="rus"
).decode()
text.save()
re = requests.post(ML_HOST + "predict", json={"data": text.text})
if re.status_code != 200:
@ -19,3 +24,18 @@ def load_text(pk: int):
text.score = re.json()
text.save()
return pk
@shared_task
def load_text_sum(pk: int):
    """Fetch a summary for a ``Text`` from the summary service and store it.

    Args:
        pk: primary key of the ``Text`` row to summarise.

    Returns:
        The same ``pk`` once the summary has been saved.

    Raises:
        ValueError: if the service answers with a non-200 status; the
            response body is used as the message.
        requests.RequestException: on a network failure or timeout.
    """
    text = Text.objects.get(pk=pk)
    if not text.text:
        # ``load_text`` may still be extracting the text from the uploaded
        # file; wait briefly and re-read the row before giving up on it.
        sleep(3)
        text.refresh_from_db()

    # Bound the call so a stalled service cannot pin a worker forever.
    # (connect, read) limits are a judgement call - tune to the service.
    response = requests.post(
        ML_SUM_HOST, json={"body": text.text}, timeout=(10, 300)
    )
    if response.status_code != 200:
        raise ValueError(response.text)

    text.summery = str(response.json())
    # Write only the summary column: ``load_text`` runs concurrently and may
    # have stored ``score`` after this instance was loaded, so a full save
    # would overwrite it with this instance's stale value.
    text.save(update_fields=["summery"])
    return pk

View File

@ -48,6 +48,8 @@ django-coverage-plugin = "^3.0.0"
pytest-django = "^4.5.2"
sentry-sdk = "^1.12.0"
textract = "^1.6.5"
pytesseract = "^0.3.10"
openpyxl = "^3.1.2"
[build-system]