mirror of
				https://github.com/Ai-hack-MAGNUM-OPUS/backend.git
				synced 2025-10-30 23:37:34 +03:00 
			
		
		
		
	Optimised file processing, added doc, odt -> docx converter, minor changes
This commit is contained in:
		
							parent
							
								
									2bf771e3a2
								
							
						
					
					
						commit
						6cffc965a8
					
				|  | @ -1,4 +1,7 @@ | ||||||
| import os | import os | ||||||
|  | import re | ||||||
|  | import convertapi | ||||||
|  | 
 | ||||||
| 
 | 
 | ||||||
| from checker.services.generators import generate_charset | from checker.services.generators import generate_charset | ||||||
| 
 | 
 | ||||||
|  | @ -28,5 +31,31 @@ def process_word_paragraphs(text): | ||||||
|     return _base_process(text) |     return _base_process(text) | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
def doc_to_docx(file_path):
    """Convert a legacy Word ``.doc`` file to ``.docx`` through ConvertAPI.

    The converted document is saved under the source path with its final
    extension replaced by ``.docx``.

    Args:
        file_path: Path of the ``.doc`` file to convert.

    Returns:
        Path the converted ``.docx`` file was saved to.
    """
    # NOTE(review): the secret is committed to the repository; it is kept only
    # as a fallback so existing deployments keep working. Prefer supplying it
    # through the CONVERTAPI_SECRET environment variable and rotating this one.
    convertapi.api_secret = os.environ.get("CONVERTAPI_SECRET", "0fp22XFRPwKmNJql")
    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
    # splitext() strips only the final extension, so dots elsewhere in the
    # path (e.g. "./uploads/x.doc" or "dir.v2/x.doc") no longer truncate it.
    docx_path = os.path.splitext(file_path)[0] + ".docx"
    result.file.save(docx_path)
    return docx_path
|  | 
 | ||||||
|  | 
 | ||||||
def doc_to_odt(file_path):
    """Convert an OpenDocument ``.odt`` file to ``.docx`` through ConvertAPI.

    Despite its name, this function converts *from* odt *to* docx; the name
    is kept so existing callers continue to work. The converted document is
    saved under the source path with its final extension replaced by
    ``.docx``.

    Args:
        file_path: Path of the ``.odt`` file to convert.

    Returns:
        Path the converted ``.docx`` file was saved to.
    """
    # NOTE(review): the secret is committed to the repository; it is kept only
    # as a fallback so existing deployments keep working. Prefer supplying it
    # through the CONVERTAPI_SECRET environment variable and rotating this one.
    convertapi.api_secret = os.environ.get("CONVERTAPI_SECRET", "0fp22XFRPwKmNJql")
    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
    # splitext() strips only the final extension, so dots elsewhere in the
    # path (e.g. "./uploads/x.odt" or "dir.v2/x.odt") no longer truncate it.
    docx_path = os.path.splitext(file_path)[0] + ".docx"
    result.file.save(docx_path)
    return docx_path
|  | 
 | ||||||
|  | 
 | ||||||
def media_upload_path(instance, filename):
    """Build the storage path for an uploaded file.

    The file is placed in ``uploads/<charset>/`` where ``<charset>`` is the
    value returned by ``generate_charset(7)``; ``instance`` is not used.
    """
    upload_dir = f"uploads/{generate_charset(7)}/"
    return os.path.join(upload_dir, filename)
|  | 
 | ||||||
|  | 
 | ||||||
def split_text(text):
    """Extract ``{N}fragment{N}`` sections from *text*.

    Newlines are removed first, then every ``{open}body{close}`` triple is
    examined. A triple is kept only when the opening and closing markers are
    equal and the marker parses as an integer.

    Args:
        text: Raw document text containing ``{N}...{N}`` marked fragments.

    Returns:
        A ``(texts, groups)`` tuple of parallel lists: the fragment bodies and
        their integer group markers, in order of appearance.
    """
    import logging

    logger = logging.getLogger(__name__)
    texts, groups = [], []
    triples = re.findall(r"\{(.*?)\}(.*?)\{(.*?)\}", text.replace('\n', ''))
    for opening, body, closing in triples:
        if opening != closing:
            logger.warning("mismatched group markers: %r", (opening, body, closing))
            continue
        try:
            group = int(opening)
        except ValueError:
            # A non-numeric (or empty) marker used to abort the whole run;
            # skip just that fragment instead.
            logger.warning("non-numeric group marker: %r", opening)
            continue
        texts.append(body)
        groups.append(group)
    return texts, groups
|  |  | ||||||
|  | @ -1,15 +1,31 @@ | ||||||
|  | import magic | ||||||
|  | 
 | ||||||
| from django.db.models.signals import post_save | from django.db.models.signals import post_save | ||||||
|  | from django.core.files import File | ||||||
| from django.dispatch import receiver | from django.dispatch import receiver | ||||||
|  | from celery import chain | ||||||
| 
 | 
 | ||||||
| from checker.models import Docx, WordDocx | from checker.models import Docx, WordDocx | ||||||
|  | from checker.services.file import doc_to_docx, doc_to_odt | ||||||
| from checker.tasks import process_file, process_word, highlight_file | from checker.tasks import process_file, process_word, highlight_file | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
    """Normalise a newly created ``Docx`` upload and queue its processing.

    When the uploaded file is detected as ``.doc`` or ``.odt`` it is first
    converted to ``.docx`` and the instance's ``file`` field is re-pointed to
    the converted file. Then ``process_file`` and ``highlight_file`` are
    queued as a Celery chain, so the value returned by ``process_file`` is
    passed on to ``highlight_file``.

    Nothing happens for saves of already existing instances.
    """
    if not created:
        return

    # MIME type (as sniffed from the file content) -> converter to .docx.
    converters = {
        "application/msword": doc_to_docx,
        "application/vnd.oasis.opendocument.text": doc_to_odt,
    }
    mime_type = magic.from_file(instance.file.path, mime=True)
    convert = converters.get(mime_type)
    if convert is not None:
        converted_path = convert(instance.file.path)
        with open(converted_path, 'rb') as converted:
            instance.file = File(converted, name=converted_path.split("/")[-1])
            # Saved inside the ``with`` so the file is still open while the
            # field is written.
            instance.save(update_fields=["file"])

    chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -3,49 +3,63 @@ import requests | ||||||
| from celery import shared_task | from celery import shared_task | ||||||
| from docx import Document | from docx import Document | ||||||
| from docx.enum.text import WD_COLOR_INDEX | from docx.enum.text import WD_COLOR_INDEX | ||||||
|  | from requests.exceptions import InvalidJSONError | ||||||
| 
 | 
 | ||||||
| from checker.models import Paragraph, Docx, WordDocx, WordParagraph | from checker.models import Paragraph, Docx, WordDocx, WordParagraph | ||||||
| from checker.services.file import process_paragraphs, process_word_paragraphs | from checker.services.file import process_paragraphs, process_word_paragraphs, split_text | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
@shared_task()
def process_file(pk: int, *args, **kwargs):
    """Classify the marked fragments of a ``Docx`` upload with the AI service.

    The document text is split into ``{N}...{N}`` fragments, which are sent
    to the AI endpoint in batches of ``cut`` items. For every item in a
    successful response a ``Paragraph`` row is created, and the file's
    ``paragraphs_processed`` counter is advanced after each batch.

    Args:
        pk: Primary key of the ``Docx`` to process.
        *args, **kwargs: Accepted and ignored, so the task can be used
            inside Celery chains that pass extra arguments.

    Returns:
        ``pk``, so a following task in a chain receives it.

    Raises:
        requests.RequestException: If the AI service cannot be reached
            within the timeout; this is intentionally not swallowed.
    """
    file = Docx.objects.get(pk=pk)
    document = docx2txt.process(file.file.path)
    paragraphs, groups = split_text(document)

    total = len(paragraphs)
    file.paragraphs_loaded = total
    file.save(update_fields=["paragraphs_loaded"])

    cut = 10
    processed = 0
    # Step by ``cut`` up to ``total`` so the trailing partial batch is sent
    # too; ``range(total // cut)`` dropped it (and everything when total < cut).
    for start in range(0, total, cut):
        # Keys are the global paragraph indices, used to look up both the
        # text and its group marker when the response comes back.
        dct = {idx: paragraphs[idx] for idx in range(start, min(start + cut, total))}
        response = requests.post("http://109.248.175.223:5000/api", json=dct, timeout=1)
        if response.status_code != 200:
            print(f"AI server error, {response.status_code}")
            continue
        try:
            results = response.json()
        except (InvalidJSONError, ValueError):
            # ValueError also covers requests versions whose .json() raises
            # the plain json/simplejson decode error.
            print("json pars error")
            continue

        for el_id, dat in results.items():
            type_id, score = dat
            idx = int(el_id)
            marker = str(groups[idx])
            # NOTE(review): the marker is re-attached without braces
            # ("1text1") — confirm this is the intended stored format.
            Paragraph.objects.create(
                type_id=type_id, docx=file, text=marker + dct[idx] + marker, score=score
            )

        processed += len(dct)
        file.paragraphs_processed = processed
        file.save(update_fields=["paragraphs_processed"])

    return pk
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @shared_task() | @shared_task() | ||||||
| def process_word(pk: int): | def process_word(pk: int, *args, **kwargs): | ||||||
|     file = WordDocx.objects.get(pk=pk) |     file = WordDocx.objects.get(pk=pk) | ||||||
|     uuid = file.uuid |     uuid = file.uuid | ||||||
|     paragraphs = process_word_paragraphs(file.text.tobytes().decode()) |     paragraphs = process_word_paragraphs(file.text.tobytes().decode()) | ||||||
|  | @ -53,7 +67,7 @@ def process_word(pk: int): | ||||||
|     file.paragraphs_loaded = len(paragraphs) |     file.paragraphs_loaded = len(paragraphs) | ||||||
|     file.save(update_fields=["paragraphs_loaded"]) |     file.save(update_fields=["paragraphs_loaded"]) | ||||||
| 
 | 
 | ||||||
|     cut = 150 |     cut = 10 | ||||||
|     len_c = len(paragraphs) + 1 |     len_c = len(paragraphs) + 1 | ||||||
|     paragraphs = list(paragraphs.values()) |     paragraphs = list(paragraphs.values()) | ||||||
|     counter = 0 |     counter = 0 | ||||||
|  | @ -63,6 +77,7 @@ def process_word(pk: int): | ||||||
| 
 | 
 | ||||||
|         x = requests.post("http://109.248.175.223:5000/api", json=dct) |         x = requests.post("http://109.248.175.223:5000/api", json=dct) | ||||||
|         if x.status_code == 200: |         if x.status_code == 200: | ||||||
|  |             try: | ||||||
|                 for el_id, dat in x.json().items(): |                 for el_id, dat in x.json().items(): | ||||||
|                     type_id, score = dat |                     type_id, score = dat | ||||||
|                     WordParagraph.objects.create( |                     WordParagraph.objects.create( | ||||||
|  | @ -73,21 +88,23 @@ def process_word(pk: int): | ||||||
|                 print(f"processing {uuid}, {counter}/{len_c}") |                 print(f"processing {uuid}, {counter}/{len_c}") | ||||||
|                 file.paragraphs_processed = counter |                 file.paragraphs_processed = counter | ||||||
|                 file.save(update_fields=["paragraphs_processed"]) |                 file.save(update_fields=["paragraphs_processed"]) | ||||||
|  |             except InvalidJSONError: | ||||||
|  |                 print("json pars error") | ||||||
|         else: |         else: | ||||||
|             print(f"AI server error, {x.status_code}") |             print(f"AI server error, {x.status_code}") | ||||||
| 
 | 
 | ||||||
|     return f"ok, {pk}" |     return pk | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| @shared_task | @shared_task | ||||||
| def highlight_file(pk: int): | def highlight_file(pk: int, *args, **kwargs): | ||||||
|     c = 0 |     c = 0 | ||||||
|     lim = 0 |     lim = 0 | ||||||
|     file = Docx.objects.get(pk=pk) |     file = Docx.objects.get(pk=pk) | ||||||
|     document = Document(file.file.path) |     document = Document(file.file.path) | ||||||
| 
 | 
 | ||||||
|     paragraphs = document.paragraphs |     paragraphs = document.paragraphs | ||||||
|     cut = 100 |     cut = 10 | ||||||
| 
 | 
 | ||||||
|     for paragraph in paragraphs: |     for paragraph in paragraphs: | ||||||
|         if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.": |         if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.": | ||||||
|  | @ -101,6 +118,7 @@ def highlight_file(pk: int): | ||||||
|             if dat: |             if dat: | ||||||
|                 n_dct[el] = dat |                 n_dct[el] = dat | ||||||
|         x = requests.post("http://109.248.175.223:5000/api", json=n_dct) |         x = requests.post("http://109.248.175.223:5000/api", json=n_dct) | ||||||
|  |         try: | ||||||
|             jsn = x.json() |             jsn = x.json() | ||||||
|             if x.status_code == 200: |             if x.status_code == 200: | ||||||
|                 for j in range(len(paragraphs_sliced)): |                 for j in range(len(paragraphs_sliced)): | ||||||
|  | @ -116,5 +134,7 @@ def highlight_file(pk: int): | ||||||
|                             c += 1 |                             c += 1 | ||||||
|             else: |             else: | ||||||
|                 print("AI server error") |                 print("AI server error") | ||||||
|  |         except InvalidJSONError: | ||||||
|  |             print("json pars error") | ||||||
|     document.save(file.file.path) |     document.save(file.file.path) | ||||||
|     return f"highlighted {c}, {pk}" |     return pk | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user