backend/parsers/extract_mus.py

import os
import json

import requests

from urllib.parse import urlparse
from bs4 import BeautifulSoup

result = []

url = "http://vrm.museum.ru/mus/list.asp?by=alpha"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
t = soup.find_all("tr")
for j in range(20, len(t)):
    try:
        el = t[j]
        l = str(el.find_all(href=True)[0]).split('"')[1]
        link = "http://vrm.museum.ru" + l
        response = requests.get(link)
        name = BeautifulSoup(
            [x for x in response.text.splitlines() if f"http://www.museum.ru{l}" in x][
                0
            ],
            "html.parser",
        ).text
        soup2 = BeautifulSoup(response.text, "html.parser")
        data2 = []
        for table in soup2.find_all("table"):
            rows = table.find_all("tr")
            data = []
            for row in rows:
                cols = row.find_all("td")
                cols = [ele.text.strip() for ele in cols]
                data.append([ele for ele in cols if ele])
            data2 += data

        data3 = {}
        for row in data2:
            if len(row) > 0:
                rec = []
                for el in row:
                    rec += el.split(":")
                if len(rec) > 1:
                    c_name = " ".join(rec[0].split())
                    desc = " ".join(" ".join(rec[1:]).split())
                    data3[c_name] = desc

        images = []
        img_tags = soup2.find_all("img")
        urls = [img["src"] for img in img_tags]
        add = {
            "name": name,
            "urls": [x for x in urls if "asp" in x],
            "link": link,
        } | data3
        result.append(add)
        print(name)
    except Exception as e:
        print(e)
    print(j, "/", len(t))

    with open("ext.json", "w", encoding="utf-16") as f:
        json.dump({"links": result}, f, ensure_ascii=False, indent=4)

with open("ext.json", "w", encoding="utf-16") as f:
    json.dump({"links": result}, f, ensure_ascii=False, indent=4)
added museums, clickhouse for user preference, fixed position 2023-05-22 22:29:46 +03:00			`import os`
			`import json`

			`import requests`

			`from urllib.parse import urlparse`
			`from bs4 import BeautifulSoup`

			`result = []`

			`url = "http://vrm.museum.ru/mus/list.asp?by=alpha"`
			`response = requests.get(url)`
			`soup = BeautifulSoup(response.text, "html.parser")`
			`t = soup.find_all("tr")`
			`for j in range(20, len(t)):`
			`try:`
			`el = t[j]`
			`l = str(el.find_all(href=True)[0]).split('"')[1]`
			`link = "http://vrm.museum.ru" + l`
			`response = requests.get(link)`
			`name = BeautifulSoup(`
			`[x for x in response.text.splitlines() if f"http://www.museum.ru{l}" in x][`
			`0`
			`],`
			`"html.parser",`
			`).text`
			`soup2 = BeautifulSoup(response.text, "html.parser")`
			`data2 = []`
			`for table in soup2.find_all("table"):`
			`rows = table.find_all("tr")`
			`data = []`
			`for row in rows:`
			`cols = row.find_all("td")`
			`cols = [ele.text.strip() for ele in cols]`
			`data.append([ele for ele in cols if ele])`
			`data2 += data`

			`data3 = {}`
			`for row in data2:`
			`if len(row) > 0:`
			`rec = []`
			`for el in row:`
			`rec += el.split(":")`
			`if len(rec) > 1:`
			`c_name = " ".join(rec[0].split())`
			`desc = " ".join(" ".join(rec[1:]).split())`
			`data3[c_name] = desc`

			`images = []`
			`img_tags = soup2.find_all("img")`
			`urls = [img["src"] for img in img_tags]`
			`add = {`
			`"name": name,`
			`"urls": [x for x in urls if "asp" in x],`
			`"link": link,`
			`} \| data3`
			`result.append(add)`
			`print(name)`
			`except Exception as e:`
			`print(e)`
			`print(j, "/", len(t))`

			`with open("ext.json", "w", encoding="utf-16") as f:`
			`json.dump({"links": result}, f, ensure_ascii=False, indent=4)`

			`with open("ext.json", "w", encoding="utf-16") as f:`
			`json.dump({"links": result}, f, ensure_ascii=False, indent=4)`