added xlsx parse, ip lookup

This commit is contained in:
Alexander Karpov 2023-09-08 23:49:29 +03:00
parent 59234a5f2c
commit 58d2efa707
9 changed files with 134 additions and 12 deletions

View File

@ -11,9 +11,15 @@
default="LnQa85vE4W235BIYizBVnsOPJOfrBxjpdrgWtmDNaUMEIbDCxwySRuyp4hpmJMZ2", default="LnQa85vE4W235BIYizBVnsOPJOfrBxjpdrgWtmDNaUMEIbDCxwySRuyp4hpmJMZ2",
) )
# https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts # https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts
ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1", "192.168.107.4"] ALLOWED_HOSTS = [
"localhost",
"0.0.0.0",
"127.0.0.1",
"192.168.103.224",
"192.168.107.4",
]
CORS_ORIGIN_ALLOW_ALL = True CORS_ORIGIN_ALLOW_ALL = True
CSRF_TRUSTED_ORIGINS = ["http://192.168.107.4"] CSRF_TRUSTED_ORIGINS = ["http://192.168.103.224", "http://192.168.107.4"]
# WhiteNoise # WhiteNoise
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------

45
poetry.lock generated
View File

@ -1070,6 +1070,18 @@ files = [
{file = "ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1"}, {file = "ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1"},
] ]
[[package]]
name = "et-xmlfile"
version = "1.1.0"
description = "An implementation of lxml.xmlfile for the standard library"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
]
[[package]] [[package]]
name = "executing" name = "executing"
version = "1.2.0" version = "1.2.0"
@ -1786,6 +1798,21 @@ files = [
{file = "olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964"}, {file = "olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964"},
] ]
[[package]]
name = "openpyxl"
version = "3.1.2"
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
]
[package.dependencies]
et-xmlfile = "*"
[[package]] [[package]]
name = "packaging" name = "packaging"
version = "23.1" version = "23.1"
@ -2280,6 +2307,22 @@ files = [
[package.dependencies] [package.dependencies]
pylint = ">=1.7" pylint = ">=1.7"
[[package]]
name = "pytesseract"
version = "0.3.10"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
]
[package.dependencies]
packaging = ">=21.3"
Pillow = ">=8.0.0"
[[package]] [[package]]
name = "pytest" name = "pytest"
version = "7.4.2" version = "7.4.2"
@ -3310,4 +3353,4 @@ files = [
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = "^3.11" python-versions = "^3.11"
content-hash = "5bcd52daf4504209b9936143d506ff50fae1f094bae17686a71ddd01efb9c49f" content-hash = "fa7686c29a2d587dbafb35cec00bf4da4424047410560854cec79f904dc9de97"

View File

@ -1,4 +1,5 @@
from django.core.files.uploadedfile import InMemoryUploadedFile, TemporaryUploadedFile from django.core.files.uploadedfile import InMemoryUploadedFile, TemporaryUploadedFile
from drf_spectacular.utils import extend_schema_field
from rest_framework import serializers from rest_framework import serializers
from press_release_nl.processor.models import Entry, Text from press_release_nl.processor.models import Entry, Text
@ -65,14 +66,22 @@ def create(self, validated_data):
class ProcessedTextSerializer(serializers.ModelSerializer): class ProcessedTextSerializer(serializers.ModelSerializer):
class Meta: class Meta:
model = Text model = Text
fields = ["text", "score"] fields = ["id", "summery", "text", "score"]
class EntrySerializer(serializers.ModelSerializer): class EntrySerializer(serializers.ModelSerializer):
texts = ProcessedTextSerializer(many=True) texts = serializers.SerializerMethodField(method_name="get_texts")
done = serializers.IntegerField(source="texts_done_count") current = serializers.IntegerField(source="texts_done_count")
count = serializers.IntegerField(source="texts_count") total = serializers.IntegerField(source="texts_count")
@extend_schema_field(ProcessedTextSerializer(many=True))
def get_texts(self, obj: Entry):
    """Serialize the texts of ``obj``, optionally narrowed to a single text.

    When the serializer context carries a request whose query string has a
    non-empty ``id`` parameter, only the text with that primary key is
    returned; otherwise every text attached to the entry is returned.

    Args:
        obj: The entry whose related ``texts`` are serialized.

    Returns:
        A list of ``ProcessedTextSerializer`` representations.

    Raises:
        serializers.ValidationError: If ``id`` is present but not an integer.
    """
    # The serializer may be instantiated without a request in its context;
    # in that case there is nothing to filter by.
    request = self.context.get("request")
    raw_id = request.query_params.get("id") if request is not None else None

    texts = obj.texts.all()
    if raw_id:
        # Reject non-numeric input explicitly: handing it to the ORM filter
        # would raise a ValueError from the field conversion instead of a
        # client error.
        try:
            text_pk = int(raw_id)
        except ValueError:
            raise serializers.ValidationError(
                {"id": "A valid integer is required."}
            ) from None
        texts = texts.filter(id=text_pk)

    return ProcessedTextSerializer(
        many=True, context=self.context
    ).to_representation(texts)
class Meta: class Meta:
model = Entry model = Entry
fields = ["texts", "done", "count", "created"] fields = ["texts", "current", "total", "created"]

View File

@ -1,3 +1,4 @@
from drf_spectacular.utils import OpenApiParameter, extend_schema, extend_schema_view
from rest_framework import generics, parsers, permissions, status from rest_framework import generics, parsers, permissions, status
from rest_framework.response import Response from rest_framework.response import Response
@ -25,6 +26,9 @@ def create(self, request, *args, **kwargs):
) )
@extend_schema_view(
get=extend_schema(parameters=[OpenApiParameter(name="id", type=int)])
)
class RetrieveEntryApiView(generics.RetrieveAPIView): class RetrieveEntryApiView(generics.RetrieveAPIView):
queryset = Entry.objects.all() queryset = Entry.objects.all()
permission_classes = [permissions.AllowAny] permission_classes = [permissions.AllowAny]

View File

@ -0,0 +1,23 @@
# Generated by Django 4.2.5 on 2023-09-08 18:42
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the ``summery`` column to ``Text`` and alter its ``file`` field."""

    # Must be applied on top of the app's initial schema.
    dependencies = [("processor", "0001_initial")]

    operations = [
        # New nullable text column on the ``text`` model.
        migrations.AddField(
            model_name="text",
            name="summery",
            field=models.TextField(max_length=2000, blank=True, null=True),
        ),
        # ``file`` becomes an optional upload stored under ``uploads/``.
        migrations.AlterField(
            model_name="text",
            name="file",
            field=models.FileField(upload_to="uploads/", blank=True, null=True),
        ),
    ]

View File

@ -12,7 +12,7 @@ def __str__(self):
@property @property
def texts_done_count(self): def texts_done_count(self):
return len(self.texts.filter(score__isnull=False)) return len(self.texts.filter(score__isnull=False, summery__isnull=False))
@property @property
def texts_count(self): def texts_count(self):
@ -21,6 +21,7 @@ def texts_count(self):
class Text(models.Model): class Text(models.Model):
entry = models.ForeignKey("Entry", related_name="texts", on_delete=models.CASCADE) entry = models.ForeignKey("Entry", related_name="texts", on_delete=models.CASCADE)
summery = models.TextField(max_length=2000, blank=True, null=True)
file = models.FileField(blank=True, null=True, upload_to="uploads/") file = models.FileField(blank=True, null=True, upload_to="uploads/")
text = models.TextField(blank=True, null=True, max_length=25_000) text = models.TextField(blank=True, null=True, max_length=25_000)
score = models.JSONField(null=True) score = models.JSONField(null=True)

View File

@ -1,11 +1,25 @@
import openpyxl
from django.db.models.signals import post_save from django.db.models.signals import post_save
from django.dispatch import receiver from django.dispatch import receiver
from press_release_nl.processor.models import Text from press_release_nl.processor.models import Text
from press_release_nl.processor.tasks import load_text from press_release_nl.processor.tasks import load_text, load_text_sum
@receiver(post_save, sender=Text) @receiver(post_save, sender=Text)
def run_text_process(sender, instance: Text, created, **kwargs): def run_text_process(sender, instance: Text, created, **kwargs):
if created: if created:
load_text.apply_async(kwargs={"pk": instance.pk}, countdown=1) if instance.file and instance.file.path.endswith("xlsx"):
wb_obj = openpyxl.load_workbook(instance.file.path)
sheet = wb_obj.worksheets[0]
for column in sheet.iter_cols():
column_name = column[0].value
if column_name == "pr_txt":
for text in column:
text = text.value
if text and text != "pr_txt":
Text.objects.create(entry=instance.entry, text=text)
instance.delete()
return
load_text.apply_async(kwargs={"pk": instance.pk}, countdown=2)
load_text_sum.apply_async(kwargs={"pk": instance.pk}, countdown=4)

View File

@ -1,3 +1,5 @@
from time import sleep
import requests import requests
import textract import textract
from celery import shared_task from celery import shared_task
@ -5,13 +7,16 @@
from press_release_nl.processor.models import Text from press_release_nl.processor.models import Text
ML_HOST = "https://2b6a-176-59-106-6.ngrok-free.app/" ML_HOST = "https://2b6a-176-59-106-6.ngrok-free.app/"
ML_SUM_HOST = "https://dev.akarpov.ru/"
@shared_task @shared_task
def load_text(pk: int): def load_text(pk: int):
text = Text.objects.get(pk=pk) text = Text.objects.get(pk=pk)
if not text.text: if not text.text:
text.text = textract.process(text.file.path, encoding="unicode_escape").decode() text.text = textract.process(
text.file.path, encoding="unicode_escape", language="rus"
).decode()
text.save() text.save()
re = requests.post(ML_HOST + "predict", json={"data": text.text}) re = requests.post(ML_HOST + "predict", json={"data": text.text})
if re.status_code != 200: if re.status_code != 200:
@ -19,3 +24,18 @@ def load_text(pk: int):
text.score = re.json() text.score = re.json()
text.save() text.save()
return pk return pk
@shared_task
def load_text_sum(pk: int):
    """Request a summary for a ``Text`` row and store it in ``summery``.

    Posts the row's text to the summarisation service at ``ML_SUM_HOST`` and
    saves the stringified JSON reply on the row.

    Args:
        pk: Primary key of the ``Text`` row to summarise.

    Returns:
        The same ``pk`` once the summary has been saved.

    Raises:
        Text.DoesNotExist: If no row with this ``pk`` exists.
        ValueError: If the service answers with a non-200 status; the message
            is the response body.
        requests.RequestException: On connection failure or timeout.
    """
    text = Text.objects.get(pk=pk)
    if not text.text:
        # Presumably the body is still being extracted by another task; wait
        # once and re-read the row before calling the service.
        sleep(3)
        text.refresh_from_db()
    # NOTE(review): if the body is still empty here the request is sent
    # anyway, exactly as before - confirm whether a retry is wanted instead.

    # (connect, read) timeouts so that a stalled service cannot block the
    # worker forever; the read timeout is generous because summarisation of a
    # long text may be slow.
    response = requests.post(
        ML_SUM_HOST,
        json={"body": text.text},
        timeout=(10, 300),
    )
    if response.status_code != 200:
        raise ValueError(response.text)

    text.summery = str(response.json())
    # Write only the summary column: a full save would push this task's stale
    # copy of every other field (e.g. ``score``) back to the database and
    # could overwrite a value stored while the request was in flight.
    text.save(update_fields=["summery"])
    return pk

View File

@ -48,6 +48,8 @@ django-coverage-plugin = "^3.0.0"
pytest-django = "^4.5.2" pytest-django = "^4.5.2"
sentry-sdk = "^1.12.0" sentry-sdk = "^1.12.0"
textract = "^1.6.5" textract = "^1.6.5"
pytesseract = "^0.3.10"
openpyxl = "^3.1.2"
[build-system] [build-system]