backend/parsers/extract_mus.py

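"""Scrape the museum.ru catalogue: walk the alphabetical museum list at
vrm.museum.ru, fetch each museum's page, and collect its name, its
"Field: value" attributes, and its photo URLs into ext.json."""
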
import json

import requests
from bs4 import BeautifulSoup
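# Note: requests and beautifulsoup4 are third-party packages
# (install with: pip install requests beautifulsoup4).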
result = []
url = "http://vrm.museum.ru/mus/list.asp?by=alpha"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
rows = soup.find_all("tr")  # the loop below skips the first 20 rows, which hold no museum links
for j in range(20, len(rows)):
    try:
        el = rows[j]
        # Relative path to the museum's page, pulled from the row's first link.
        path = str(el.find_all(href=True)[0]).split('"')[1]
        link = "http://vrm.museum.ru" + path
        response = requests.get(link)
        # The museum name sits on the line that links back to www.museum.ru.
        name = BeautifulSoup(
            [x for x in response.text.splitlines() if f"http://www.museum.ru{path}" in x][0],
            "html.parser",
        ).text
        soup2 = BeautifulSoup(response.text, "html.parser")
        # Flatten every table on the page into a list of non-empty cell rows.
        data2 = []
        for table in soup2.find_all("table"):
            for row in table.find_all("tr"):
                cols = [ele.text.strip() for ele in row.find_all("td")]
                data2.append([ele for ele in cols if ele])
        # Turn "Field: value" rows into a dict of museum attributes,
        # collapsing runs of whitespace in both keys and values.
        data3 = {}
        for row in data2:
            if len(row) > 0:
                rec = []
                for cell in row:
                    rec += cell.split(":")
                if len(rec) > 1:
                    c_name = " ".join(rec[0].split())
                    desc = " ".join(" ".join(rec[1:]).split())
                    data3[c_name] = desc
        # Keep only image URLs that go through the site's .asp photo handler;
        # .get() skips <img> tags that have no src attribute.
        img_tags = soup2.find_all("img")
        urls = [img.get("src") for img in img_tags if img.get("src")]
        add = {
            "name": name,
            "urls": [x for x in urls if "asp" in x],
            "link": link,
        } | data3  # dict union requires Python 3.9+
        result.append(add)
        print(name)
    except Exception as e:
        print(e)
    print(j, "/", len(rows))  # progress counter
with open("ext.json", "w", encoding="utf-16") as f:
json.dump({"links": result}, f, ensure_ascii=False, indent=4)
with open("ext.json", "w", encoding="utf-16") as f:
json.dump({"links": result}, f, ensure_ascii=False, indent=4)