mirror of
https://github.com/magnum-opus-nn-cp/backend.git
synced 2024-11-10 18:06:33 +03:00
added xlsx parse, ip lookup
This commit is contained in:
parent
59234a5f2c
commit
58d2efa707
|
@ -11,9 +11,15 @@
|
|||
default="LnQa85vE4W235BIYizBVnsOPJOfrBxjpdrgWtmDNaUMEIbDCxwySRuyp4hpmJMZ2",
|
||||
)
|
||||
# https://docs.djangoproject.com/en/dev/ref/settings/#allowed-hosts
|
||||
ALLOWED_HOSTS = ["localhost", "0.0.0.0", "127.0.0.1", "192.168.107.4"]
|
||||
ALLOWED_HOSTS = [
|
||||
"localhost",
|
||||
"0.0.0.0",
|
||||
"127.0.0.1",
|
||||
"192.168.103.224",
|
||||
"192.168.107.4",
|
||||
]
|
||||
CORS_ORIGIN_ALLOW_ALL = True
|
||||
CSRF_TRUSTED_ORIGINS = ["http://192.168.107.4"]
|
||||
CSRF_TRUSTED_ORIGINS = ["http://192.168.103.224", "http://192.168.107.4"]
|
||||
|
||||
# WhiteNoise
|
||||
# ------------------------------------------------------------------------------
|
||||
|
|
45
poetry.lock
generated
45
poetry.lock
generated
|
@ -1070,6 +1070,18 @@ files = [
|
|||
{file = "ebcdic-1.1.1-py2.py3-none-any.whl", hash = "sha256:33b4cb729bc2d0bf46cc1847b0e5946897cb8d3f53520c5b9aa5fa98d7e735f1"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "et-xmlfile"
|
||||
version = "1.1.0"
|
||||
description = "An implementation of lxml.xmlfile for the standard library"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "et_xmlfile-1.1.0-py3-none-any.whl", hash = "sha256:a2ba85d1d6a74ef63837eed693bcb89c3f752169b0e3e7ae5b16ca5e1b3deada"},
|
||||
{file = "et_xmlfile-1.1.0.tar.gz", hash = "sha256:8eb9e2bc2f8c97e37a2dc85a09ecdcdec9d8a396530a6d5a33b30b9a92da0c5c"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "executing"
|
||||
version = "1.2.0"
|
||||
|
@ -1786,6 +1798,21 @@ files = [
|
|||
{file = "olefile-0.46.zip", hash = "sha256:133b031eaf8fd2c9399b78b8bc5b8fcbe4c31e85295749bb17a87cba8f3c3964"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "openpyxl"
|
||||
version = "3.1.2"
|
||||
description = "A Python library to read/write Excel 2010 xlsx/xlsm files"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.6"
|
||||
files = [
|
||||
{file = "openpyxl-3.1.2-py2.py3-none-any.whl", hash = "sha256:f91456ead12ab3c6c2e9491cf33ba6d08357d802192379bb482f1033ade496f5"},
|
||||
{file = "openpyxl-3.1.2.tar.gz", hash = "sha256:a6f5977418eff3b2d5500d54d9db50c8277a368436f4e4f8ddb1be3422870184"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
et-xmlfile = "*"
|
||||
|
||||
[[package]]
|
||||
name = "packaging"
|
||||
version = "23.1"
|
||||
|
@ -2280,6 +2307,22 @@ files = [
|
|||
[package.dependencies]
|
||||
pylint = ">=1.7"
|
||||
|
||||
[[package]]
|
||||
name = "pytesseract"
|
||||
version = "0.3.10"
|
||||
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = ">=3.7"
|
||||
files = [
|
||||
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
|
||||
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
packaging = ">=21.3"
|
||||
Pillow = ">=8.0.0"
|
||||
|
||||
[[package]]
|
||||
name = "pytest"
|
||||
version = "7.4.2"
|
||||
|
@ -3310,4 +3353,4 @@ files = [
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.11"
|
||||
content-hash = "5bcd52daf4504209b9936143d506ff50fae1f094bae17686a71ddd01efb9c49f"
|
||||
content-hash = "fa7686c29a2d587dbafb35cec00bf4da4424047410560854cec79f904dc9de97"
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
from django.core.files.uploadedfile import InMemoryUploadedFile, TemporaryUploadedFile
|
||||
from drf_spectacular.utils import extend_schema_field
|
||||
from rest_framework import serializers
|
||||
|
||||
from press_release_nl.processor.models import Entry, Text
|
||||
|
@ -65,14 +66,22 @@ def create(self, validated_data):
|
|||
class ProcessedTextSerializer(serializers.ModelSerializer):
|
||||
class Meta:
|
||||
model = Text
|
||||
fields = ["text", "score"]
|
||||
fields = ["id", "summery", "text", "score"]
|
||||
|
||||
|
||||
class EntrySerializer(serializers.ModelSerializer):
|
||||
texts = ProcessedTextSerializer(many=True)
|
||||
done = serializers.IntegerField(source="texts_done_count")
|
||||
count = serializers.IntegerField(source="texts_count")
|
||||
texts = serializers.SerializerMethodField(method_name="get_texts")
|
||||
current = serializers.IntegerField(source="texts_done_count")
|
||||
total = serializers.IntegerField(source="texts_count")
|
||||
|
||||
@extend_schema_field(ProcessedTextSerializer(many=True))
def get_texts(self, obj: Entry):
    """Serialize the texts of ``obj``, optionally narrowed to a single text.

    If the current request carries an ``id`` query parameter, only the text
    with that primary key is serialized; otherwise all texts of the entry
    are returned. Output is produced by ``ProcessedTextSerializer``.

    Raises:
        serializers.ValidationError: ``id`` is present but is not an integer.
    """
    # The serializer can be instantiated without a request in its context;
    # treat that as "no filter" instead of failing with KeyError.
    request = self.context.get("request")
    text_id = request.query_params.get("id") if request is not None else None

    texts = obj.texts.all()
    if text_id:
        try:
            text_pk = int(text_id)
        except (TypeError, ValueError):
            # Without this check a value such as ``?id=abc`` reaches
            # ``filter()`` and surfaces as an unhandled ValueError.
            raise serializers.ValidationError(
                {"id": "A valid integer is required."}
            ) from None
        texts = texts.filter(id=text_pk)
    return ProcessedTextSerializer(many=True).to_representation(texts)
|
||||
|
||||
class Meta:
|
||||
model = Entry
|
||||
fields = ["texts", "done", "count", "created"]
|
||||
fields = ["texts", "current", "total", "created"]
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
from drf_spectacular.utils import OpenApiParameter, extend_schema, extend_schema_view
|
||||
from rest_framework import generics, parsers, permissions, status
|
||||
from rest_framework.response import Response
|
||||
|
||||
|
@ -25,6 +26,9 @@ def create(self, request, *args, **kwargs):
|
|||
)
|
||||
|
||||
|
||||
@extend_schema_view(
|
||||
get=extend_schema(parameters=[OpenApiParameter(name="id", type=int)])
|
||||
)
|
||||
class RetrieveEntryApiView(generics.RetrieveAPIView):
|
||||
queryset = Entry.objects.all()
|
||||
permission_classes = [permissions.AllowAny]
|
||||
|
|
|
@ -0,0 +1,23 @@
|
|||
# Generated by Django 4.2.5 on 2023-09-08 18:42
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Schema migration for the ``text`` model of the ``processor`` app.

    Adds the ``summery`` field and alters the ``file`` field.
    """

    # Applied on top of processor's "0001_initial" migration.
    dependencies = [
        ("processor", "0001_initial"),
    ]

    operations = [
        # New optional text field; both blank values and NULL are allowed.
        migrations.AddField(
            model_name="text",
            name="summery",
            field=models.TextField(blank=True, max_length=2000, null=True),
        ),
        # Redefine ``file`` as an optional upload stored under "uploads/".
        migrations.AlterField(
            model_name="text",
            name="file",
            field=models.FileField(blank=True, null=True, upload_to="uploads/"),
        ),
    ]
|
|
@ -12,7 +12,7 @@ def __str__(self):
|
|||
|
||||
@property
|
||||
def texts_done_count(self):
|
||||
return len(self.texts.filter(score__isnull=False))
|
||||
return len(self.texts.filter(score__isnull=False, summery__isnull=False))
|
||||
|
||||
@property
|
||||
def texts_count(self):
|
||||
|
@ -21,6 +21,7 @@ def texts_count(self):
|
|||
|
||||
class Text(models.Model):
|
||||
entry = models.ForeignKey("Entry", related_name="texts", on_delete=models.CASCADE)
|
||||
summery = models.TextField(max_length=2000, blank=True, null=True)
|
||||
file = models.FileField(blank=True, null=True, upload_to="uploads/")
|
||||
text = models.TextField(blank=True, null=True, max_length=25_000)
|
||||
score = models.JSONField(null=True)
|
||||
|
|
|
@ -1,11 +1,25 @@
|
|||
import openpyxl
|
||||
from django.db.models.signals import post_save
|
||||
from django.dispatch import receiver
|
||||
|
||||
from press_release_nl.processor.models import Text
|
||||
from press_release_nl.processor.tasks import load_text
|
||||
from press_release_nl.processor.tasks import load_text, load_text_sum
|
||||
|
||||
|
||||
@receiver(post_save, sender=Text)
|
||||
def run_text_process(sender, instance: Text, created, **kwargs):
|
||||
if created:
|
||||
load_text.apply_async(kwargs={"pk": instance.pk}, countdown=1)
|
||||
if instance.file and instance.file.path.endswith("xlsx"):
|
||||
wb_obj = openpyxl.load_workbook(instance.file.path)
|
||||
sheet = wb_obj.worksheets[0]
|
||||
for column in sheet.iter_cols():
|
||||
column_name = column[0].value
|
||||
if column_name == "pr_txt":
|
||||
for text in column:
|
||||
text = text.value
|
||||
if text and text != "pr_txt":
|
||||
Text.objects.create(entry=instance.entry, text=text)
|
||||
instance.delete()
|
||||
return
|
||||
load_text.apply_async(kwargs={"pk": instance.pk}, countdown=2)
|
||||
load_text_sum.apply_async(kwargs={"pk": instance.pk}, countdown=4)
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
from time import sleep
|
||||
|
||||
import requests
|
||||
import textract
|
||||
from celery import shared_task
|
||||
|
@ -5,13 +7,16 @@
|
|||
from press_release_nl.processor.models import Text
|
||||
|
||||
ML_HOST = "https://2b6a-176-59-106-6.ngrok-free.app/"
|
||||
ML_SUM_HOST = "https://dev.akarpov.ru/"
|
||||
|
||||
|
||||
@shared_task
|
||||
def load_text(pk: int):
|
||||
text = Text.objects.get(pk=pk)
|
||||
if not text.text:
|
||||
text.text = textract.process(text.file.path, encoding="unicode_escape").decode()
|
||||
text.text = textract.process(
|
||||
text.file.path, encoding="unicode_escape", language="rus"
|
||||
).decode()
|
||||
text.save()
|
||||
re = requests.post(ML_HOST + "predict", json={"data": text.text})
|
||||
if re.status_code != 200:
|
||||
|
@ -19,3 +24,18 @@ def load_text(pk: int):
|
|||
text.score = re.json()
|
||||
text.save()
|
||||
return pk
|
||||
|
||||
|
||||
@shared_task
def load_text_sum(pk: int):
    """Fetch a summary for the ``Text`` row ``pk`` and store it in ``summery``.

    Posts the row's text as ``{"body": ...}`` to ``ML_SUM_HOST`` and saves the
    stringified JSON response on the row.

    Args:
        pk: primary key of the ``Text`` to summarize.

    Returns:
        The same ``pk``.

    Raises:
        Text.DoesNotExist: no row with this primary key exists.
        ValueError: the summary service answered with a non-200 status; the
            response body is used as the message.
    """
    text = Text.objects.get(pk=pk)
    if not text.text:
        # For uploaded files the text is filled in by the separate
        # ``load_text`` task; wait briefly and re-read the row.
        sleep(3)
        text.refresh_from_db()

    # NOTE(review): no timeout is passed, so a stalled summary service blocks
    # this worker indefinitely -- consider adding one once latency is known.
    response = requests.post(ML_SUM_HOST, json={"body": text.text})
    if response.status_code != 200:
        raise ValueError(response.text)

    text.summery = str(response.json())
    # Write only the summary column. ``load_text`` stores ``score`` on the
    # same row and may finish while this task is waiting on the HTTP call;
    # a full ``save()`` would write back the stale ``score`` loaded above.
    text.save(update_fields=["summery"])
    return pk
|
||||
|
|
|
@ -48,6 +48,8 @@ django-coverage-plugin = "^3.0.0"
|
|||
pytest-django = "^4.5.2"
|
||||
sentry-sdk = "^1.12.0"
|
||||
textract = "^1.6.5"
|
||||
pytesseract = "^0.3.10"
|
||||
openpyxl = "^3.1.2"
|
||||
|
||||
|
||||
[build-system]
|
||||
|
|
Loading…
Reference in New Issue
Block a user