mirror of https://github.com/Alexander-D-Karpov/scripts.git (synced 2025-06-01 11:43:08 +03:00)

commit 29074ada79 (parent 06cd5b7bea): added SoundCloud downloader and Telegram web scraper, updated the Castbox downloader
@@ -24,61 +24,78 @@ def download_file(file_url):
    return local_filename


def get_data(url):
    r = requests.get(url)
    if r.status_code != 200:
        raise LookupError("Site not found")
    inner_data = r.text.splitlines()
    data = []
    for line in inner_data:
        if "window.__INITIAL_STATE__" in line:
            data.append(line)

    if len(data) != 1:
        raise ValueError("Payload not found")

    d = json.loads(unquote(data[0].split('"')[1::2][0]))  # type: dict
    return d


d = get_data(url)
while (
    not d
    or "ch" not in d
    or "chInfo" not in d["ch"]
    or "title" not in d["ch"]["chInfo"]
):
    d = get_data(url)
    print("Data not loaded, retrying...")

title = d["ch"]["chInfo"]["title"]
main_image = d["ch"]["chInfo"]["cover_web"]
author = d["ch"]["chInfo"]["author"]
episode_count = d["ch"]["chInfo"]["episode_count"]
print("Downloading podcast " + title)
episodes = d["ch"]["eps"]
if not os.path.isdir(title):
    os.mkdir(title)
for i, episode in enumerate(episodes):
    n_path = title + "/" + f"{episode['title']}.mp3"  # filename from the episode title
    if not os.path.exists(n_path):  # skip episodes that were already downloaded
        print(f"Downloading: {episode['title']}", end="\r")
        if "url" in episode and episode["url"]:
            ep_url = episode["url"]
        else:
            ep_url = episode["urls"][0]
        orig_path = download_file(ep_url)
        AudioSegment.from_file(orig_path).export(n_path)
        os.remove(orig_path)
        if "cover_url" not in episode or not episode["cover_url"]:
            img_path = download_file(main_image)
        else:
            img_path = download_file(episode["cover_url"])
        if "author" in episode and episode["author"]:
            ep_author = episode["author"]
        else:
            ep_author = author

        print(f"Processing: {episode['title']}", end="\r")
        tag = MP3(n_path, ID3=ID3)
        tag.tags.add(
            APIC(
                encoding=3,
                mime="image/png",
                type=3,
                desc="Cover",
                data=open(img_path, "rb").read(),
            )
        )
        tag.save()
        tag = EasyID3(n_path)

        tag["title"] = episode["title"]
        tag["album"] = title
        tag["tracknumber"] = f"{episode_count - i}/{episode_count}"
        tag["artist"] = ep_author

        tag.save()
        os.remove(img_path)
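The payload extraction in get_data is terse: split('"')[1::2] keeps every second segment after splitting on double quotes, i.e. the contents of the quoted strings, and the first of those is the URL-encoded JSON that Castbox assigns to window.__INITIAL_STATE__. A minimal sketch of the same idea on a made-up line (the real payload is far larger):

import json
from urllib.parse import unquote

# Hypothetical HTML line in the shape get_data() expects
line = 'window.__INITIAL_STATE__="%7B%22ch%22%3A%7B%22chInfo%22%3A%7B%7D%7D%7D";'
payload = line.split('"')[1::2][0]   # first quoted segment: the encoded JSON
print(json.loads(unquote(payload)))  # {'ch': {'chInfo': {}}}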
soundcloud_downloader.py (new file, 336 lines)
@@ -0,0 +1,336 @@
#!/usr/bin/env python3
"""
SoundCloud Downloader with ID3 Tags
-----------------------------------
This script downloads all tracks from a SoundCloud artist,
including proper ID3 tags and album artwork.

Requirements:
    pip install scdl mutagen requests tqdm
"""

import os
import sys
import subprocess
import json
import requests
from pathlib import Path
from mutagen.id3 import ID3, APIC, TIT2, TPE1, TALB, TDRC, TCON, TCOM, COMM
from tqdm import tqdm
import re
import argparse


# ANSI colors for terminal output
class Colors:
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BLUE = '\033[94m'
    ENDC = '\033[0m'


def setup_argparser():
    parser = argparse.ArgumentParser(description='Download all tracks from a SoundCloud artist with proper ID3 tags')
    parser.add_argument('url', help='SoundCloud URL (artist profile or likes page)')
    parser.add_argument('-o', '--output', default='downloads', help='Output directory')
    parser.add_argument('-c', '--client-id', help='SoundCloud client ID (optional)')
    parser.add_argument('--likes', action='store_true', help='Download liked tracks (auto-detected from URL)')
    parser.add_argument('--author', help='Explicitly set the author name for all tracks')
    parser.add_argument('--album', help='Explicitly set the album name for all tracks')
    parser.add_argument('--force-tags', action='store_true', help='Force update of ID3 tags even if they exist')
    return parser.parse_args()


def get_client_id():
    """Extract client_id by scraping SoundCloud's website"""
    print(f"{Colors.BLUE}[*] Obtaining SoundCloud client ID...{Colors.ENDC}")

    try:
        response = requests.get('https://soundcloud.com/')
        scripts = re.findall(r'<script crossorigin src="(.*?\.js)"', response.text)

        # Try to find client_id in the scripts
        for script_url in scripts:
            if not script_url.startswith('http'):
                script_url = 'https://soundcloud.com' + script_url

            script_content = requests.get(script_url).text
            client_id_match = re.search(r'"client_id":"([a-zA-Z0-9]+)"', script_content)
            if client_id_match:
                return client_id_match.group(1)
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting client ID: {e}{Colors.ENDC}")

    return None


def download_tracks(artist_url, output_dir, client_id=None, likes=False):
    """Download all tracks from the given artist URL or likes page"""
    if not client_id:
        client_id = get_client_id()

    if not client_id:
        print(f"{Colors.RED}[!] Failed to get client ID. Please provide it manually with --client-id{Colors.ENDC}")
        sys.exit(1)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract artist name from URL
    url_parts = artist_url.strip('/').split('/')
    artist_name = url_parts[-2] if likes or '/likes' in artist_url else url_parts[-1]

    print(
        f"{Colors.GREEN}[+] {'Downloading liked tracks' if likes else 'Downloading tracks'} for {artist_name} to {output_dir}{Colors.ENDC}")

    # Use scdl to download tracks
    cmd = [
        'scdl',
        '-l', artist_url,
        '--path', output_dir,
        '--client-id', client_id,
        '--flac',  # Try to get best quality where available
        '-c'  # Continue if download already exists
    ]

    # Add appropriate flag based on download type
    if likes or '/likes' in artist_url:
        cmd.append('-f')  # Download favorites/likes
    elif '/sets/' in artist_url:
        cmd.append('-p')  # Download playlist
    else:
        cmd.append('-a')  # Download all tracks from user

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"{Colors.RED}[!] Error running scdl: {e}{Colors.ENDC}")
        sys.exit(1)

    return output_dir, artist_name


def get_artist_info(artist_url, client_id):
    """Get artist information from SoundCloud API"""
    resolve_url = f"https://api-v2.soundcloud.com/resolve?url={artist_url}&client_id={client_id}"

    try:
        response = requests.get(resolve_url)
        data = response.json()
        return data
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting artist info: {e}{Colors.ENDC}")
        return None


def get_tracks_info(download_dir, client_id):
    """Get information about tracks from SoundCloud API"""
    print(f"{Colors.BLUE}[*] Gathering track information from SoundCloud...{Colors.ENDC}")

    # Find all MP3 files
    mp3_files = list(Path(download_dir).glob('*.mp3'))
    track_info_map = {}

    for mp3_file in mp3_files:
        # Try to extract track ID or permalink from filename
        # Many SoundCloud downloaders append the track ID to the filename
        track_id_match = re.search(r'[-_](\d{6,})(\.mp3)?$', mp3_file.stem)

        if track_id_match:
            # If we have a track ID, use it to get info from the API
            track_id = track_id_match.group(1)
            try:
                track_url = f"https://api-v2.soundcloud.com/tracks/{track_id}?client_id={client_id}"
                response = requests.get(track_url)
                if response.status_code == 200:
                    track_data = response.json()
                    track_info_map[mp3_file.name] = track_data
            except Exception as e:
                print(f"{Colors.YELLOW}[!] Warning: Could not get info for track ID {track_id}: {e}{Colors.ENDC}")

    return track_info_map


def extract_set_info(filename):
    """Extract information from set/playlist filenames"""
    # For files from sets: "Set Name_Artist - Track Title.mp3"
    set_match = re.search(r'^(.+?)_(.+?)\.mp3$', filename)
    if set_match:
        set_name = set_match.group(1).strip()
        title_part = set_match.group(2).strip()

        # Try to extract artist from title if it's in the "Artist - Title" format
        artist_title_match = re.search(r'^(.+?) - (.+)$', title_part)
        if artist_title_match:
            artist = artist_title_match.group(1).strip()
            title = artist_title_match.group(2).strip()
        else:
            # If no artist separator found, the whole part is the title
            artist = None
            title = title_part

        return {
            'set_name': set_name,
            'artist': artist,
            'title': title
        }

    # Another pattern: Some playlist files don't have the separator
    # Example: "Playlist Name - Track Title.mp3" without artist info
    alt_match = re.search(r'^(.+?) - (.+?)\.mp3$', filename)
    if alt_match:
        set_name = alt_match.group(1).strip()
        title = alt_match.group(2).strip()

        return {
            'set_name': set_name,
            'artist': None,  # No artist info in this format
            'title': title
        }

    return None


def extract_album_from_comments(tags):
    """Try to extract album information from ID3 comment tags"""
    if "COMM" in tags:
        comment = str(tags["COMM"])
        # Look for potential album indicators in comments
        album_match = re.search(r'CTCD-\d+\s+["\'](.+?)["\']', comment)
        if album_match:
            return album_match.group(1)

        # Another pattern: Album name followed by E.P. or EP
        ep_match = re.search(r'([^"\']+?)\s+E\.?P\.?', comment)
        if ep_match:
            return f"{ep_match.group(1)} E.P."

    return None


def fix_id3_tags(download_dir, artist_name, client_id, forced_author=None, forced_album=None, force_tags=False):
    """Fix ID3 tags and add album artwork to downloaded files"""
    print(f"{Colors.BLUE}[*] Adding ID3 tags and artwork...{Colors.ENDC}")

    # Get artist info
    artist_info = get_artist_info(f"https://soundcloud.com/{artist_name}", client_id)
    artist_avatar_url = artist_info.get('avatar_url') if artist_info else None

    # Try to get additional track info from SoundCloud API
    track_info_map = get_tracks_info(download_dir, client_id)

    # Download artist avatar for use as album art if needed
    avatar_data = None
    if artist_avatar_url:
        try:
            # Get highest resolution image by replacing size in URL
            hi_res_avatar_url = artist_avatar_url.replace('-large', '-t500x500')
            avatar_response = requests.get(hi_res_avatar_url)
            avatar_data = avatar_response.content
        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not download artist avatar: {e}{Colors.ENDC}")

    # Process all MP3 files
    downloaded_files = list(Path(download_dir).glob('*.mp3'))
    processed_count = 0
    skipped_count = 0

    for mp3_file in tqdm(downloaded_files, desc="Processing files"):
        try:
            # Read or create ID3 tags
            try:
                tags = ID3(mp3_file)
                # Skip if tags exist and force_tags is not set
                if not force_tags and "TIT2" in tags and "TPE1" in tags and "TALB" in tags:
                    skipped_count += 1
                    continue
            except:
                # Create new ID3 tag if not present
                tags = ID3()

            # Extract information from filename
            set_info = extract_set_info(mp3_file.name)

            # Initialize variables
            title = None
            artist = forced_author
            album = forced_album

            # Get title from set_info or filename
            if set_info:
                title = set_info['title']
                # Only use artist from set_info if forced_author not provided
                if not artist and set_info['artist']:
                    artist = set_info['artist']
                # Only use set_name as album if forced_album not provided
                if not album:
                    album = set_info['set_name']
            else:
                # Try to extract from regular filename
                filename_match = re.search(r'(.+?) - (.+?)\.mp3$', mp3_file.name)
                if filename_match:
                    if not artist:
                        artist = filename_match.group(1).strip()
                    title = filename_match.group(2).strip()
                else:
                    # Just use the filename as title
                    title = mp3_file.stem

            # Try to extract album info from existing tags if available
            if not album and "COMM" in tags:
                album_from_comment = extract_album_from_comments(tags)
                if album_from_comment:
                    album = album_from_comment

            # If no album was determined, use a default
            if not album:
                album = "Unknown Album"

            # If no artist was determined, use the forced_author or a default
            if not artist:
                artist = forced_author or "Unknown Artist"

            # Set ID3 tags
            tags["TIT2"] = TIT2(encoding=3, text=title)
            tags["TPE1"] = TPE1(encoding=3, text=artist)
            tags["TALB"] = TALB(encoding=3, text=album)

            # Add artwork if we have it and it's missing or we're forcing updates
            if avatar_data and (force_tags or not any(tag.startswith('APIC') for tag in tags.keys())):
                tags["APIC"] = APIC(
                    encoding=3,
                    mime="image/jpeg",
                    type=3,  # Cover (front)
                    desc="Cover",
                    data=avatar_data
                )

            # Save tags to file
            tags.save(mp3_file)
            processed_count += 1

        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not process file {mp3_file}: {e}{Colors.ENDC}")

    print(
        f"{Colors.GREEN}[+] Successfully processed {processed_count} files, skipped {skipped_count} files with existing tags{Colors.ENDC}")


def main():
    args = setup_argparser()

    print(f"{Colors.GREEN}[+] SoundCloud Downloader with ID3 Tags{Colors.ENDC}")

    # Auto-detect likes URL if not explicitly set
    likes = args.likes or '/likes' in args.url

    download_dir, artist_name = download_tracks(args.url, args.output, args.client_id, likes)
    client_id = args.client_id or get_client_id()
    fix_id3_tags(download_dir, artist_name, client_id, args.author, args.album, args.force_tags)

    print(f"{Colors.GREEN}[+] All done! Downloaded tracks are in: {download_dir}{Colors.ENDC}")


if __name__ == "__main__":
    main()
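The tagging pass above is best-effort: get_tracks_info only finds API metadata when the numeric track ID survives in the filename, which depends on how scdl was configured to name files. A small sketch of the filename heuristic it relies on (the stems are made up):

import re

# The first stem carries a plausible trailing track ID, the second does not
for stem in ["artist - song_1234567890", "artist - song"]:
    m = re.search(r'[-_](\d{6,})(\.mp3)?$', stem)
    print(stem, "->", m.group(1) if m else "no track ID")

A typical invocation would be python soundcloud_downloader.py https://soundcloud.com/some-artist -o downloads (URL illustrative); pass --likes or point at a /sets/ URL and download_tracks switches the scdl flag accordingly.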
tg_scrap.py (new file, 219 lines)
@@ -0,0 +1,219 @@
import re

import requests
from bs4 import BeautifulSoup, NavigableString
import time
import random


def get_telegram_channel_info(channel_username):
    channel_username = channel_username.strip('@')
    url = f'https://t.me/s/{channel_username}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://t.me/',
        'Sec-Ch-Ua': '"Google Chrome";v="122", "Chromium";v="122", "Not(A:Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }

    time.sleep(random.uniform(1, 2))

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching channel: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    channel_info = {
        'username': channel_username,
        'url': f'https://t.me/{channel_username}',
        'subscriber_count': 'Unknown',
        'channel_name': 'Unknown',
        'description': 'Unknown',
        'photo_url': None,
        'recent_posts': []
    }

    # Extract channel name
    channel_name_elem = soup.find('div', class_='tgme_channel_info_header_title')
    if channel_name_elem and channel_name_elem.find('span'):
        channel_info['channel_name'] = channel_name_elem.find('span').text.strip()

    # Extract subscriber count
    subscriber_counter = soup.find('div', class_='tgme_channel_info_counter',
                                   string=lambda text: 'subscribers' in text if text else False)
    if subscriber_counter and subscriber_counter.find('span', class_='counter_value'):
        channel_info['subscriber_count'] = subscriber_counter.find('span', class_='counter_value').text.strip()

    # Alternative location for subscriber count
    if channel_info['subscriber_count'] == 'Unknown':
        header_counter = soup.find('div', class_='tgme_header_counter')
        if header_counter:
            channel_info['subscriber_count'] = header_counter.text.strip().replace('subscribers', '').strip()

    # Extract description
    description_elem = soup.find('div', class_='tgme_channel_info_description')
    if description_elem:
        channel_info['description'] = description_elem.text.strip()

    # Extract channel photo
    photo_elem = soup.find('i', class_='tgme_page_photo_image')
    if photo_elem and photo_elem.find('img'):
        channel_info['photo_url'] = photo_elem.find('img').get('src')

    # Extract recent posts
    message_containers = soup.find_all('div', class_='tgme_widget_message')

    for container in message_containers[::-1]:
        post_data = {}

        post_data['id'] = container.get('data-post')

        text_elem = container.find('div', class_='tgme_widget_message_text')
        if text_elem:
            post_data['html_text'] = str(text_elem)

            formatted_text = ""

            def process_node(node):
                nonlocal formatted_text

                if node.name == 'br':
                    formatted_text += '\n'
                elif node.name == 'b':
                    formatted_text += f"<b>{node.get_text()}</b>"
                elif node.name == 'i':
                    formatted_text += f"<i>{node.get_text()}</i>"
                elif node.name == 'a':
                    href = node.get('href', '')
                    formatted_text += f"<a href='{href}'>{node.get_text()}</a>"
                elif node.name == 'pre' or node.name == 'code':
                    formatted_text += f"<code>{node.get_text()}</code>"
                elif node.name == 'tg-emoji':
                    emoji_code = node.find('i', class_='emoji').get_text() if node.find('i', class_='emoji') else ''
                    formatted_text += emoji_code
                elif isinstance(node, NavigableString):
                    formatted_text += str(node)
                else:
                    for child in node.children:
                        process_node(child)

            for child in text_elem.children:
                process_node(child)

            post_data['formatted_text'] = formatted_text
            post_data['plain_text'] = text_elem.get_text(separator=' ', strip=True)
        else:
            post_data['html_text'] = ""
            post_data['formatted_text'] = ""
            post_data['plain_text'] = ""

        # Get message date
        date_elem = container.find('a', class_='tgme_widget_message_date')
        if date_elem and date_elem.find('time'):
            post_data['date'] = date_elem.find('time')['datetime']

        # Get view count
        views_elem = container.find('span', class_='tgme_widget_message_views')
        if views_elem:
            post_data['views'] = views_elem.text.strip()

        # Check for media attachments
        photo_elem = container.find('a', class_='tgme_widget_message_photo_wrap')
        video_elem = container.find('a', class_='tgme_widget_message_video_player')

        if photo_elem:
            post_data['has_photo'] = True
            if 'style' in photo_elem.attrs:
                style = photo_elem['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    photo_url = style[url_start:url_end].strip("'")
                    post_data['photo_url'] = photo_url.replace('\'', '').replace('"', '')
        else:
            post_data['has_photo'] = False

        if video_elem:
            post_data['has_video'] = True
            video_thumb = video_elem.find('i', class_='tgme_widget_message_video_thumb')
            if video_thumb and 'style' in video_thumb.attrs:
                style = video_thumb['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    thumbnail_url = style[url_start:url_end].strip("'")
                    post_data['video_thumbnail'] = thumbnail_url.replace('\'', '').replace('"', '')

            # Get video duration
            duration_elem = video_elem.find('time', class_='message_video_duration')
            if duration_elem:
                post_data['video_duration'] = duration_elem.text.strip()
        else:
            post_data['has_video'] = False

        # Add post to list
        channel_info['recent_posts'].append(post_data)

    return channel_info


def print_channel_info(channel_info):
    if not channel_info:
        print("Could not retrieve channel information.")
        return

    print(f"TELEGRAM CHANNEL: @{channel_info['username']}")
    print(f"Name: {channel_info['channel_name']}")
    print(f"Subscribers: {channel_info['subscriber_count']}")
    print(f"URL: {channel_info['url']}")

    print("\nDescription:")
    print(channel_info['description'])

    print(f"\nRecent Posts ({len(channel_info['recent_posts'])} found):")
    for i, post in enumerate(channel_info['recent_posts'], 1):
        print(f"\nPost {i}:")
        print(f"  Date: {post.get('date', 'Unknown')}")
        print(f"  Views: {post.get('views', 'Unknown')}")

        media_types = []
        if post.get('has_photo'):
            media_types.append("Photo")
        if post.get('has_video'):
            media_types.append("Video")

        media_str = ", ".join(media_types) if media_types else "None"
        print(f"  Media: {media_str}")

        formatted_text = post.get('formatted_text', '')
        if formatted_text:
            formatted_text = re.sub(r'<br\s*/?>', '\n', formatted_text)
            formatted_text = re.sub(r'\n+', '\n', formatted_text)

            if len(formatted_text) > 150:
                preview = formatted_text[:150] + "..."
            else:
                preview = formatted_text

            preview = re.sub(r'\s+', ' ', preview)
            print(f"  Content: {preview}")


if __name__ == "__main__":
    channel_name = input("Enter Telegram channel username (with or without @): ")

    print(f"\nFetching information for {channel_name}...")
    channel_info = get_telegram_channel_info(channel_name)

    print_channel_info(channel_info)
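tg_scrap.py needs no API credentials: it scrapes the public t.me/s/<channel> preview page, which only exists for public channels. A short sketch of programmatic use (the channel name is illustrative; the keys match the channel_info dict built above, and the [::-1] reversal puts the newest post first):

info = get_telegram_channel_info("some_public_channel")
if info:
    print(info["channel_name"], "-", info["subscriber_count"], "subscribers")
    for post in info["recent_posts"][:3]:  # three newest posts
        print(post.get("date", "?"), post.get("plain_text", "")[:80])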