mirror of
				https://github.com/Ai-hack-MAGNUM-OPUS/backend.git
				synced 2025-10-30 23:37:34 +03:00 
			
		
		
		
	Optimised file processing, added doc, odt -> docx converter, minor changes
This commit is contained in:
		
							parent
							
								
									2bf771e3a2
								
							
						
					
					
						commit
						6cffc965a8
					
				|  | @ -1,4 +1,7 @@ | |||
| import os | ||||
| import re | ||||
| import convertapi | ||||
| 
 | ||||
| 
 | ||||
| from checker.services.generators import generate_charset | ||||
| 
 | ||||
|  | @ -28,5 +31,31 @@ def process_word_paragraphs(text): | |||
|     return _base_process(text) | ||||
| 
 | ||||
| 
 | ||||
def doc_to_docx(file_path):
    """Convert a legacy ``.doc`` file to ``.docx`` via the ConvertAPI service.

    The converted file is saved next to the original (same stem, ``.docx``
    extension) and its path is returned.

    :param file_path: absolute path of the source ``.doc`` file.
    :returns: path of the newly written ``.docx`` file.
    """
    # SECURITY(review): hard-coded third-party API secret — move to
    # settings / an environment variable before this ships.
    convertapi.api_secret = '0fp22XFRPwKmNJql'
    result = convertapi.convert('docx', {'File': file_path}, from_format='doc')
    # os.path.splitext instead of split(".")[0]: the old expression truncated
    # any path containing a dot in a directory or file stem.
    target_path = os.path.splitext(file_path)[0] + ".docx"
    result.file.save(target_path)
    return target_path
| 
 | ||||
| 
 | ||||
def doc_to_odt(file_path):
    """Convert an ``.odt`` file to ``.docx`` via the ConvertAPI service.

    NOTE(review): despite the name, this converts *from* ODT *to* DOCX
    (``from_format='odt'``, target ``'docx'``) — a rename to
    ``odt_to_docx`` would be clearer, but the name is kept for callers.

    :param file_path: absolute path of the source ``.odt`` file.
    :returns: path of the newly written ``.docx`` file.
    """
    # SECURITY(review): hard-coded third-party API secret — move to
    # settings / an environment variable before this ships.
    convertapi.api_secret = '0fp22XFRPwKmNJql'
    result = convertapi.convert('docx', {'File': file_path}, from_format='odt')
    # os.path.splitext instead of split(".")[0]: the old expression truncated
    # any path containing a dot in a directory or file stem.
    target_path = os.path.splitext(file_path)[0] + ".docx"
    result.file.save(target_path)
    return target_path
| 
 | ||||
| 
 | ||||
def media_upload_path(instance, filename):
    """Return an upload path for *filename* under a random 7-char directory."""
    random_dir = f"uploads/{generate_charset(7)}/"
    return os.path.join(random_dir, filename)
| 
 | ||||
| 
 | ||||
def split_text(text):
    """Extract ``{N}body{N}``-tagged fragments from *text*.

    Newlines are stripped before matching, so a tagged fragment may span
    lines in the source document.

    :param text: raw document text containing ``{tag}...{tag}`` markers.
    :returns: ``(texts, groups)`` — fragment bodies and their integer tags,
        in document order.
    :raises ValueError: if a matched pair's tag is not an integer.
    """
    texts, groups = [], []
    matches = re.findall(r"{(.*?)}(.*?){(.*?)}", text.replace('\n', ''))
    for open_tag, body, close_tag in matches:
        if open_tag != close_tag:
            # Mismatched opening/closing tags: skip rather than mislabel.
            # (Replaces a leftover debug print of the raw tuple.)
            continue
        texts.append(body)
        groups.append(int(open_tag))
    return texts, groups
|  |  | |||
|  | @ -1,15 +1,31 @@ | |||
| import magic | ||||
| 
 | ||||
| from django.db.models.signals import post_save | ||||
| from django.core.files import File | ||||
| from django.dispatch import receiver | ||||
| from celery import chain | ||||
| 
 | ||||
| from checker.models import Docx, WordDocx | ||||
| from checker.services.file import doc_to_docx, doc_to_odt | ||||
| from checker.tasks import process_file, process_word, highlight_file | ||||
| 
 | ||||
| 
 | ||||
@receiver(post_save, sender=Docx)
def create_docs(sender, instance, created, **kwargs):
    """post_save hook for ``Docx``: normalize legacy formats, queue processing.

    On first save, sniffs the uploaded file's MIME type; ``.doc`` and ``.odt``
    uploads are converted to ``.docx`` and the model's file field is replaced
    with the converted copy.  Processing and highlighting run as a Celery
    chain so ``highlight_file`` receives ``process_file``'s result.
    """
    if not created:
        return

    # `magic` sniffs the real content type; the upload's extension is not
    # trusted.  (Renamed from `type`, which shadowed the builtin.)
    mime_type = magic.from_file(instance.file.path, mime=True)
    if mime_type == "application/msword":
        converted_path = doc_to_docx(instance.file.path)
    elif mime_type == "application/vnd.oasis.opendocument.text":
        converted_path = doc_to_odt(instance.file.path)
    else:
        converted_path = None

    if converted_path is not None:
        # Single re-attach branch replaces the two duplicated with-blocks.
        with open(converted_path, 'rb') as f:
            instance.file = File(f, name=converted_path.split("/")[-1])
            instance.save(update_fields=["file"])

    chain(process_file.s(instance.pk), highlight_file.s()).apply_async()
    return
| 
 | ||||
| 
 | ||||
|  |  | |||
|  | @ -3,49 +3,63 @@ import requests | |||
| from celery import shared_task | ||||
| from docx import Document | ||||
| from docx.enum.text import WD_COLOR_INDEX | ||||
| from requests.exceptions import InvalidJSONError | ||||
| 
 | ||||
| from checker.models import Paragraph, Docx, WordDocx, WordParagraph | ||||
| from checker.services.file import process_paragraphs, process_word_paragraphs | ||||
| from checker.services.file import process_paragraphs, process_word_paragraphs, split_text | ||||
| 
 | ||||
| 
 | ||||
@shared_task()
def process_file(pk: int, *args, **kwargs):
    """Celery task: split a ``Docx`` into tagged paragraphs and classify them.

    Extracts the document text, splits it into ``{N}...{N}`` fragments, and
    posts batches to the external AI service.  Each classified fragment is
    stored as a ``Paragraph`` with its group tag re-wrapped around the text.

    :param pk: primary key of the ``Docx`` row to process.
    :returns: ``pk`` (passed along the Celery chain to ``highlight_file``).
    """
    file = Docx.objects.get(pk=pk)
    document = docx2txt.process(file.file.path)
    paragraphs, groups = split_text(document)

    file.paragraphs_loaded = len(paragraphs)
    file.save(update_fields=["paragraphs_loaded"])

    batch_size = 10
    processed = 0
    # Step over the list in fixed-size chunks.  The previous
    # `range(len(paragraphs) // batch_size)` loop silently dropped the
    # trailing partial batch (e.g. the last 7 of 27 paragraphs).
    for start in range(0, len(paragraphs), batch_size):
        stop = min(start + batch_size, len(paragraphs))
        batch = {idx: paragraphs[idx] for idx in range(start, stop)}

        # NOTE(review): timeout=1s on a remote ML endpoint looks aggressive —
        # confirm the service answers within a second under load.
        response = requests.post(
            "http://109.248.175.223:5000/api", json=batch, timeout=1
        )
        if response.status_code != 200:
            # Mirror the error handling used by the sibling tasks instead of
            # assuming success.
            print(f"AI server error, {response.status_code}")
            continue
        try:
            payload = response.json()
        except InvalidJSONError:
            print("json pars error")
            continue

        for el_id, dat in payload.items():
            type_id, score = dat
            tag = str(groups[int(el_id)])
            Paragraph.objects.create(
                type_id=type_id,
                docx=file,
                text=tag + batch[int(el_id)] + tag,
                score=score,
            )

        processed += len(batch)
        print(f"processing {file.uuid}, {processed}/{len(paragraphs)}")
        file.paragraphs_processed = processed
        file.save(update_fields=["paragraphs_processed"])

    return pk
| 
 | ||||
| 
 | ||||
| @shared_task() | ||||
| def process_word(pk: int): | ||||
| def process_word(pk: int, *args, **kwargs): | ||||
|     file = WordDocx.objects.get(pk=pk) | ||||
|     uuid = file.uuid | ||||
|     paragraphs = process_word_paragraphs(file.text.tobytes().decode()) | ||||
|  | @ -53,7 +67,7 @@ def process_word(pk: int): | |||
|     file.paragraphs_loaded = len(paragraphs) | ||||
|     file.save(update_fields=["paragraphs_loaded"]) | ||||
| 
 | ||||
|     cut = 150 | ||||
|     cut = 10 | ||||
|     len_c = len(paragraphs) + 1 | ||||
|     paragraphs = list(paragraphs.values()) | ||||
|     counter = 0 | ||||
|  | @ -63,6 +77,7 @@ def process_word(pk: int): | |||
| 
 | ||||
|         x = requests.post("http://109.248.175.223:5000/api", json=dct) | ||||
|         if x.status_code == 200: | ||||
|             try: | ||||
|                 for el_id, dat in x.json().items(): | ||||
|                     type_id, score = dat | ||||
|                     WordParagraph.objects.create( | ||||
|  | @ -73,21 +88,23 @@ def process_word(pk: int): | |||
|                 print(f"processing {uuid}, {counter}/{len_c}") | ||||
|                 file.paragraphs_processed = counter | ||||
|                 file.save(update_fields=["paragraphs_processed"]) | ||||
|             except InvalidJSONError: | ||||
|                 print("json pars error") | ||||
|         else: | ||||
|             print(f"AI server error, {x.status_code}") | ||||
| 
 | ||||
|     return f"ok, {pk}" | ||||
|     return pk | ||||
| 
 | ||||
| 
 | ||||
| @shared_task | ||||
| def highlight_file(pk: int): | ||||
| def highlight_file(pk: int, *args, **kwargs): | ||||
|     c = 0 | ||||
|     lim = 0 | ||||
|     file = Docx.objects.get(pk=pk) | ||||
|     document = Document(file.file.path) | ||||
| 
 | ||||
|     paragraphs = document.paragraphs | ||||
|     cut = 100 | ||||
|     cut = 10 | ||||
| 
 | ||||
|     for paragraph in paragraphs: | ||||
|         if paragraph.text and len(paragraph.text) > 2 and paragraph.text[:2] == "1.": | ||||
|  | @ -101,6 +118,7 @@ def highlight_file(pk: int): | |||
|             if dat: | ||||
|                 n_dct[el] = dat | ||||
|         x = requests.post("http://109.248.175.223:5000/api", json=n_dct) | ||||
|         try: | ||||
|             jsn = x.json() | ||||
|             if x.status_code == 200: | ||||
|                 for j in range(len(paragraphs_sliced)): | ||||
|  | @ -116,5 +134,7 @@ def highlight_file(pk: int): | |||
|                             c += 1 | ||||
|             else: | ||||
|                 print("AI server error") | ||||
|         except InvalidJSONError: | ||||
|             print("json pars error") | ||||
|     document.save(file.file.path) | ||||
|     return f"highlighted {c}, {pk}" | ||||
|     return pk | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	Block a user