# backend/parsers/extract_mus.py

import os
import json

import requests

from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Scrape the alphabetical museum listing on vrm.museum.ru, open every linked
# museum page, and dump the collected records to ``ext.json``.

_SITE_ROOT = "http://vrm.museum.ru"
# Upper bound (seconds) for each HTTP request so a stalled server cannot hang
# the whole run.
_REQUEST_TIMEOUT = 30
# Index of the first <tr> of the listing page that is processed.
# NOTE(review): presumably the earlier rows are page chrome rather than museum
# entries -- confirm against the live page.
_FIRST_ROW = 20
_OUTPUT_PATH = "ext.json"


def _save(records):
    """Write ``records`` to the output file as ``{"links": records}`` (UTF-16)."""
    with open(_OUTPUT_PATH, "w", encoding="utf-16") as f:
        json.dump({"links": records}, f, ensure_ascii=False, indent=4)


def _museum_name(html, path):
    """Return the text of the first HTML line mentioning the museum's www link.

    Raises:
        ValueError: if no line of ``html`` contains ``http://www.museum.ru{path}``.
    """
    marker = f"http://www.museum.ru{path}"
    line = next((ln for ln in html.splitlines() if marker in ln), None)
    if line is None:
        raise ValueError(f"no line mentions {marker}")
    return BeautifulSoup(line, "html.parser").text


def _parse_fields(page):
    """Collect ``label -> value`` pairs from the table rows of ``page``.

    The label is the part of the first non-empty cell before its first colon;
    the value is the remainder of that cell plus all following non-empty cells,
    with whitespace runs collapsed. Only the first colon is treated as a
    separator, so colons inside values (URLs, times) are preserved. Rows with
    a single cell and no colon carry no label/value pair and are skipped.
    Later rows overwrite earlier ones that share a label.
    """
    fields = {}
    # Tables are visited one by one (nested tables included) to keep the same
    # overwrite order for repeated labels as a per-table traversal gives.
    for table in page.find_all("table"):
        for row in table.find_all("tr"):
            cells = [td.text.strip() for td in row.find_all("td")]
            cells = [cell for cell in cells if cell]
            if not cells:
                continue
            label, colon, remainder = cells[0].partition(":")
            if not colon and len(cells) == 1:
                continue
            value = " ".join([remainder, *cells[1:]])
            fields[" ".join(label.split())] = " ".join(value.split())
    return fields


result = []

url = "http://vrm.museum.ru/mus/list.asp?by=alpha"
response = requests.get(url, timeout=_REQUEST_TIMEOUT)
# Fail loudly if the listing itself cannot be fetched; there is nothing to scrape.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
t = soup.find_all("tr")
for j, el in enumerate(t[_FIRST_ROW:], start=_FIRST_ROW):
    # Best effort per row: any failure is reported and the row is skipped so
    # one bad page does not abort the run.
    try:
        anchor = el.find(href=True)
        if anchor is None:
            raise ValueError("row contains no link")
        # Read the attribute directly instead of slicing the serialized tag,
        # which would pick up whichever quoted attribute happens to come first.
        path = anchor["href"]
        link = _SITE_ROOT + path

        page_response = requests.get(link, timeout=_REQUEST_TIMEOUT)
        page_response.raise_for_status()
        html = page_response.text

        name = _museum_name(html, path)
        soup2 = BeautifulSoup(html, "html.parser")

        # Keep only image sources containing "asp"; <img> tags without a
        # ``src`` attribute are ignored rather than failing the whole record.
        urls = [
            src
            for img in soup2.find_all("img")
            if (src := img.get("src")) and "asp" in src
        ]

        add = {
            "name": name,
            "urls": urls,
            "link": link,
        } | _parse_fields(soup2)
        result.append(add)
        print(name)
    except Exception as e:
        print(e)
    print(j, "/", len(t))

    # Checkpoint after every row so partial results survive an interruption.
    _save(result)

# Final write; also creates the file when the loop body never ran.
_save(result)