Added SoundCloud download and Telegram web scraping; updated Castbox downloader

This commit is contained in:
Alexander Karpov 2025-04-24 01:28:06 +03:00
parent 06cd5b7bea
commit 29074ada79
3 changed files with 616 additions and 44 deletions

View File

@ -24,61 +24,78 @@ def download_file(file_url):
return local_filename
r = requests.get(url)
if r.status_code != 200:
raise LookupError("Site not found")
inner_data = r.text.splitlines()
data = []
for line in inner_data:
if "window.__INITIAL_STATE__" in line:
data.append(line)
def get_data(url):
    """Fetch *url* and return the decoded ``window.__INITIAL_STATE__`` payload.

    Castbox embeds the page state as a URL-encoded JSON string assigned to
    ``window.__INITIAL_STATE__`` in an inline script line.

    Raises:
        LookupError: if the page does not answer with HTTP 200.
        ValueError: if exactly one payload line cannot be located.
    """
    r = requests.get(url)
    if r.status_code != 200:
        raise LookupError("Site not found")
    # Keep only the line carrying the embedded state object.
    payload = [
        line for line in r.text.splitlines() if "window.__INITIAL_STATE__" in line
    ]
    if len(payload) != 1:
        raise ValueError("Payload not found")
    # The JSON text sits inside the first double-quoted segment of the line.
    return json.loads(unquote(payload[0].split('"')[1::2][0]))  # type: dict
d = get_data(url)
while (
not d
or "ch" not in d
or "chInfo" not in d["ch"]
or "title" not in d["ch"]["chInfo"]
):
d = get_data(url)
print("Data not loaded, retrying...")
title = d["ch"]["chInfo"]["title"]
main_image = d["ch"]["chInfo"]["cover_web"]
author = d["ch"]["chInfo"]["author"]
episode_count = d["ch"]["chInfo"]["episode_count"]
print("Downloading podcast " + title)
episodes = d["ch"]["eps"]
if not os.path.isdir(title):
os.mkdir(title)
for i, episode in enumerate(episodes):
print(f"Downloading: {episode['title']}", end="\r")
if "url" in episode and episode["url"]:
ep_url = episode["url"]
else:
ep_url = episode["urls"][0]
orig_path = download_file(ep_url)
n_path = title + "/" + f"{title}.mp3"
AudioSegment.from_file(orig_path).export(n_path)
os.remove(orig_path)
if "cover_url" not in episode or not episode["cover_url"]:
img_path = download_file(main_image)
else:
img_path = download_file(episode["cover_url"])
if "author" in episode and episode["author"]:
ep_author = episode["author"]
else:
ep_author = author
n_path = title + "/" + f"{title}.mp3" # имя из ep
if not os.path.exists(n_path): # на 1 выключается
print(f"Downloading: {episode['title']}", end="\r")
if "url" in episode and episode["url"]:
ep_url = episode["url"]
else:
ep_url = episode["urls"][0]
orig_path = download_file(ep_url)
AudioSegment.from_file(orig_path).export(n_path)
os.remove(orig_path)
if "cover_url" not in episode or not episode["cover_url"]:
img_path = download_file(main_image)
else:
img_path = download_file(episode["cover_url"])
if "author" in episode and episode["author"]:
ep_author = episode["author"]
else:
ep_author = author
tag = MP3(n_path, ID3=ID3)
tag.tags.add(
APIC(
encoding=3,
mime="image/png",
type=3,
desc="Cover",
data=open(img_path, "rb").read(),
print(f"Processing: {episode['title']}", end="\r")
tag = MP3(n_path, ID3=ID3)
tag.tags.add(
APIC(
encoding=3,
mime="image/png",
type=3,
desc="Cover",
data=open(img_path, "rb").read(),
)
)
)
tag.save()
tag = EasyID3(n_path)
tag.save()
tag = EasyID3(n_path)
tag["title"] = episode["title"]
tag["album"] = title
tag["artist"] = ep_author
tag["title"] = episode["title"]
tag["album"] = title
tag["tracknumber"] = f"{episode_count - i}/{episode_count}"
tag["artist"] = ep_author
tag.save()
os.remove(img_path)
tag.save()
os.remove(img_path)

336
soundcloud_downloader.py Normal file
View File

@ -0,0 +1,336 @@
#!/usr/bin/env python3
"""
SoundCloud Downloader with ID3 Tags
-----------------------------------
This script downloads all tracks from a SoundCloud artist,
including proper ID3 tags and album artwork.
Requirements:
pip install scdl mutagen requests tqdm
"""
import os
import sys
import subprocess
import json
import requests
from pathlib import Path
from mutagen.id3 import ID3, APIC, TIT2, TPE1, TALB, TDRC, TCON, TCOM, COMM
from tqdm import tqdm
import re
import argparse
# ANSI colors for terminal output
class Colors:
    """ANSI escape sequences used to colorize terminal output."""

    # Message-category colours
    BLUE = '\033[94m'    # informational / progress messages
    GREEN = '\033[92m'   # success
    YELLOW = '\033[93m'  # warnings
    RED = '\033[91m'     # errors
    # Reset sequence: restores the terminal's default colour
    ENDC = '\033[0m'
def setup_argparser():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace carrying url, output, client_id, likes,
        author, album and force_tags.
    """
    p = argparse.ArgumentParser(
        description='Download all tracks from a SoundCloud artist with proper ID3 tags'
    )
    p.add_argument('url', help='SoundCloud URL (artist profile or likes page)')
    p.add_argument('-o', '--output', default='downloads', help='Output directory')
    p.add_argument('-c', '--client-id', help='SoundCloud client ID (optional)')
    p.add_argument('--likes', action='store_true',
                   help='Download liked tracks (auto-detected from URL)')
    p.add_argument('--author', help='Explicitly set the author name for all tracks')
    p.add_argument('--album', help='Explicitly set the album name for all tracks')
    p.add_argument('--force-tags', action='store_true',
                   help='Force update of ID3 tags even if they exist')
    return p.parse_args()
def get_client_id():
    """Extract a client_id by scraping SoundCloud's website.

    Downloads the landing page, walks its bundled JS files and greps each
    one for the embedded ``"client_id":"..."`` literal.

    Returns:
        The client id string, or ``None`` when scraping fails.
    """
    print(f"{Colors.BLUE}[*] Obtaining SoundCloud client ID...{Colors.ENDC}")
    try:
        # requests has no default timeout; without one a stalled connection
        # would hang the whole script.
        response = requests.get('https://soundcloud.com/', timeout=15)
        scripts = re.findall(r'<script crossorigin src="(.*?\.js)"', response.text)
        # Try to find client_id in the scripts
        for script_url in scripts:
            if not script_url.startswith('http'):
                script_url = 'https://soundcloud.com' + script_url
            script_content = requests.get(script_url, timeout=15).text
            client_id_match = re.search(r'"client_id":"([a-zA-Z0-9]+)"', script_content)
            if client_id_match:
                return client_id_match.group(1)
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting client ID: {e}{Colors.ENDC}")
    return None
def download_tracks(artist_url, output_dir, client_id=None, likes=False):
    """Download all tracks from the given artist URL or likes page.

    Args:
        artist_url: SoundCloud profile, likes-page or set URL.
        output_dir: directory the files are written to (created if missing).
        client_id: optional SoundCloud API client id; scraped when omitted.
        likes: force "liked tracks" mode (also auto-detected from the URL).

    Returns:
        Tuple ``(output_dir, artist_name)``.

    Exits the process when no client id can be obtained or scdl fails.
    """
    if not client_id:
        client_id = get_client_id()
        if not client_id:
            print(f"{Colors.RED}[!] Failed to get client ID. Please provide it manually with --client-id{Colors.ENDC}")
            sys.exit(1)
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    # Extract the artist name from the URL. The username precedes a trailing
    # "/likes" segment; otherwise it is the last path segment. Checking the
    # actual segment (instead of the likes flag) fixes the bug where passing
    # --likes with a plain profile URL picked "soundcloud.com" as the name.
    url_parts = artist_url.strip('/').split('/')
    artist_name = url_parts[-2] if url_parts[-1] == 'likes' else url_parts[-1]
    print(
        f"{Colors.GREEN}[+] {'Downloading liked tracks' if likes else 'Downloading tracks'} for {artist_name} to {output_dir}{Colors.ENDC}")
    # Use scdl to download tracks
    cmd = [
        'scdl',
        '-l', artist_url,
        '--path', output_dir,
        '--client-id', client_id,
        '--flac',  # Try to get best quality where available
        '-c'  # Continue if download already exists
    ]
    # Add appropriate flag based on download type
    if likes or '/likes' in artist_url:
        cmd.append('-f')  # Download favorites/likes
    elif '/sets/' in artist_url:
        cmd.append('-p')  # Download playlist
    else:
        cmd.append('-a')  # Download all tracks from user
    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"{Colors.RED}[!] Error running scdl: {e}{Colors.ENDC}")
        sys.exit(1)
    return output_dir, artist_name
def get_artist_info(artist_url, client_id):
    """Get artist information from the SoundCloud API.

    Resolves *artist_url* through the api-v2 ``/resolve`` endpoint.

    Returns:
        Parsed JSON dict on success, ``None`` on any failure.
    """
    resolve_url = f"https://api-v2.soundcloud.com/resolve?url={artist_url}&client_id={client_id}"
    try:
        # Timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(resolve_url, timeout=15)
        data = response.json()
        return data
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting artist info: {e}{Colors.ENDC}")
        return None
def get_tracks_info(download_dir, client_id):
    """Map downloaded MP3 filenames to their SoundCloud API track metadata.

    Scans *download_dir* for ``*.mp3`` files whose names end in a numeric
    track id (6+ digits) and queries the api-v2 ``/tracks/<id>`` endpoint
    for each.

    Returns:
        Dict of ``{filename: track_json}``; files without a recognisable id,
        or whose API lookup fails, are simply omitted.
    """
    print(f"{Colors.BLUE}[*] Gathering track information from SoundCloud...{Colors.ENDC}")
    # Find all MP3 files
    mp3_files = list(Path(download_dir).glob('*.mp3'))
    track_info_map = {}
    for mp3_file in mp3_files:
        # Try to extract track ID or permalink from filename
        # Many SoundCloud downloaders append the track ID to the filename
        track_id_match = re.search(r'[-_](\d{6,})(\.mp3)?$', mp3_file.stem)
        if track_id_match:
            # If we have a track ID, use it to get info from the API
            track_id = track_id_match.group(1)
            try:
                track_url = f"https://api-v2.soundcloud.com/tracks/{track_id}?client_id={client_id}"
                # NOTE(review): no timeout on this request — a stalled
                # connection will hang the loop; consider timeout=15.
                response = requests.get(track_url)
                if response.status_code == 200:
                    track_data = response.json()
                    track_info_map[mp3_file.name] = track_data
            except Exception as e:
                print(f"{Colors.YELLOW}[!] Warning: Could not get info for track ID {track_id}: {e}{Colors.ENDC}")
    return track_info_map
def extract_set_info(filename):
    """Parse set/playlist metadata out of a downloaded filename.

    Recognised layouts:
      * ``"Set Name_Artist - Track Title.mp3"``
      * ``"Playlist Name - Track Title.mp3"`` (no artist information)

    Returns:
        Dict with ``set_name``, ``artist`` (may be ``None``) and ``title``,
        or ``None`` when the filename matches neither layout.
    """
    # Layout 1: an underscore separates the set name from the rest.
    underscore_form = re.search(r'^(.+?)_(.+?)\.mp3$', filename)
    if underscore_form:
        collection = underscore_form.group(1).strip()
        remainder = underscore_form.group(2).strip()
        # The remainder may itself be in "Artist - Title" form.
        split = re.search(r'^(.+?) - (.+)$', remainder)
        if split:
            return {
                'set_name': collection,
                'artist': split.group(1).strip(),
                'title': split.group(2).strip(),
            }
        # No artist separator found: the whole remainder is the title.
        return {'set_name': collection, 'artist': None, 'title': remainder}
    # Layout 2: "Playlist Name - Track Title.mp3" without artist info.
    dash_form = re.search(r'^(.+?) - (.+?)\.mp3$', filename)
    if dash_form:
        return {
            'set_name': dash_form.group(1).strip(),
            'artist': None,  # No artist info in this format
            'title': dash_form.group(2).strip(),
        }
    return None
def extract_album_from_comments(tags):
    """Best-effort album-name recovery from an ID3 ``COMM`` frame.

    Looks for a catalogue-number pattern (``CTCD-123 "Album"``) first,
    then for an "... EP" / "... E.P." suffix (normalised to "E.P.").

    Returns:
        The album name string, or ``None`` when nothing matches.
    """
    if "COMM" not in tags:
        return None
    comment = str(tags["COMM"])
    # Catalogue number followed by a quoted album title.
    catalogue = re.search(r'CTCD-\d+\s+["\'](.+?)["\']', comment)
    if catalogue:
        return catalogue.group(1)
    # Album name followed by E.P. or EP.
    ep = re.search(r'([^"\']+?)\s+E\.?P\.?', comment)
    return f"{ep.group(1)} E.P." if ep else None
def fix_id3_tags(download_dir, artist_name, client_id, forced_author=None, forced_album=None, force_tags=False):
    """Fix ID3 tags and add album artwork to downloaded files.

    Args:
        download_dir: directory containing the downloaded ``*.mp3`` files.
        artist_name: SoundCloud username; used to fetch the avatar artwork.
        client_id: SoundCloud API client id for metadata lookups.
        forced_author: if given, overrides any artist derived from filenames.
        forced_album: if given, overrides any album derived from filenames.
        force_tags: rewrite tags/artwork even when they already exist.
    """
    print(f"{Colors.BLUE}[*] Adding ID3 tags and artwork...{Colors.ENDC}")
    # Get artist info
    artist_info = get_artist_info(f"https://soundcloud.com/{artist_name}", client_id)
    artist_avatar_url = artist_info.get('avatar_url') if artist_info else None
    # Try to get additional track info from SoundCloud API
    # NOTE(review): track_info_map is never read below — candidate for
    # removal or for actually feeding the per-track metadata into the tags.
    track_info_map = get_tracks_info(download_dir, client_id)
    # Download artist avatar for use as album art if needed
    avatar_data = None
    if artist_avatar_url:
        try:
            # Get highest resolution image by replacing size in URL
            hi_res_avatar_url = artist_avatar_url.replace('-large', '-t500x500')
            avatar_response = requests.get(hi_res_avatar_url)
            avatar_data = avatar_response.content
        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not download artist avatar: {e}{Colors.ENDC}")
    # Process all MP3 files
    downloaded_files = list(Path(download_dir).glob('*.mp3'))
    processed_count = 0
    skipped_count = 0
    for mp3_file in tqdm(downloaded_files, desc="Processing files"):
        try:
            # Read or create ID3 tags
            try:
                tags = ID3(mp3_file)
                # Skip if tags exist and force_tags is not set
                if not force_tags and "TIT2" in tags and "TPE1" in tags and "TALB" in tags:
                    skipped_count += 1
                    continue
            # NOTE(review): bare except — consider catching mutagen's
            # ID3NoHeaderError specifically so real bugs are not hidden.
            except:
                # Create new ID3 tag if not present
                tags = ID3()
            # Extract information from filename
            set_info = extract_set_info(mp3_file.name)
            # Initialize variables
            title = None
            artist = forced_author
            album = forced_album
            # Get title from set_info or filename
            if set_info:
                title = set_info['title']
                # Only use artist from set_info if forced_author not provided
                if not artist and set_info['artist']:
                    artist = set_info['artist']
                # Only use set_name as album if forced_album not provided
                if not album:
                    album = set_info['set_name']
            else:
                # Try to extract from regular filename
                filename_match = re.search(r'(.+?) - (.+?)\.mp3$', mp3_file.name)
                if filename_match:
                    if not artist:
                        artist = filename_match.group(1).strip()
                    title = filename_match.group(2).strip()
                else:
                    # Just use the filename as title
                    title = mp3_file.stem
            # Try to extract album info from existing tags if available
            if not album and "COMM" in tags:
                album_from_comment = extract_album_from_comments(tags)
                if album_from_comment:
                    album = album_from_comment
            # If no album was determined, use a default
            if not album:
                album = "Unknown Album"
            # If no artist was determined, use the forced_author or a default
            if not artist:
                artist = forced_author or "Unknown Artist"
            # Set ID3 tags
            tags["TIT2"] = TIT2(encoding=3, text=title)
            tags["TPE1"] = TPE1(encoding=3, text=artist)
            tags["TALB"] = TALB(encoding=3, text=album)
            # Add artwork if we have it and it's missing or we're forcing updates
            if avatar_data and (force_tags or not any(tag.startswith('APIC') for tag in tags.keys())):
                tags["APIC"] = APIC(
                    encoding=3,
                    mime="image/jpeg",
                    type=3,  # Cover (front)
                    desc="Cover",
                    data=avatar_data
                )
            # Save tags to file
            tags.save(mp3_file)
            processed_count += 1
        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not process file {mp3_file}: {e}{Colors.ENDC}")
    print(
        f"{Colors.GREEN}[+] Successfully processed {processed_count} files, skipped {skipped_count} files with existing tags{Colors.ENDC}")
def main():
    """Command-line entry point: download the tracks, then tag them."""
    options = setup_argparser()
    print(f"{Colors.GREEN}[+] SoundCloud Downloader with ID3 Tags{Colors.ENDC}")
    # A likes page can be requested explicitly or implied by the URL.
    wants_likes = options.likes or '/likes' in options.url
    target_dir, artist = download_tracks(options.url, options.output, options.client_id, wants_likes)
    api_client_id = options.client_id or get_client_id()
    fix_id3_tags(target_dir, artist, api_client_id, options.author, options.album, options.force_tags)
    print(f"{Colors.GREEN}[+] All done! Downloaded tracks are in: {target_dir}{Colors.ENDC}")


if __name__ == "__main__":
    main()

219
tg_scrap.py Normal file
View File

@ -0,0 +1,219 @@
import re
import requests
from bs4 import BeautifulSoup, NavigableString
import time
import random
def get_telegram_channel_info(channel_username):
    """Scrape public channel metadata and recent posts from t.me/s/<username>.

    Args:
        channel_username: channel handle, with or without a leading ``@``.

    Returns:
        Dict with username, url, channel_name, subscriber_count, description,
        photo_url and a ``recent_posts`` list (in reverse of the page's
        display order), or ``None`` when the page cannot be fetched.
    """
    channel_username = channel_username.strip('@')
    # t.me/s/<name> is the public web preview of the channel feed.
    url = f'https://t.me/s/{channel_username}'
    # Browser-like headers to avoid the request being rejected as a bot.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://t.me/',
        'Sec-Ch-Ua': '"Google Chrome";v="122", "Chromium";v="122", "Not(A:Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }
    # Small random delay to look less like automated scraping.
    time.sleep(random.uniform(1, 2))
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching channel: {e}")
        return None
    soup = BeautifulSoup(response.text, 'html.parser')
    # Defaults used when a field cannot be located in the markup.
    channel_info = {
        'username': channel_username,
        'url': f'https://t.me/{channel_username}',
        'subscriber_count': 'Unknown',
        'channel_name': 'Unknown',
        'description': 'Unknown',
        'photo_url': None,
        'recent_posts': []
    }
    # Extract channel name
    channel_name_elem = soup.find('div', class_='tgme_channel_info_header_title')
    if channel_name_elem and channel_name_elem.find('span'):
        channel_info['channel_name'] = channel_name_elem.find('span').text.strip()
    # Extract subscriber count (the counter div whose text mentions "subscribers")
    subscriber_counter = soup.find('div', class_='tgme_channel_info_counter',
                                   string=lambda text: 'subscribers' in text if text else False)
    if subscriber_counter and subscriber_counter.find('span', class_='counter_value'):
        channel_info['subscriber_count'] = subscriber_counter.find('span', class_='counter_value').text.strip()
    # Alternative location for subscriber count
    if channel_info['subscriber_count'] == 'Unknown':
        header_counter = soup.find('div', class_='tgme_header_counter')
        if header_counter:
            channel_info['subscriber_count'] = header_counter.text.strip().replace('subscribers', '').strip()
    # Extract description
    description_elem = soup.find('div', class_='tgme_channel_info_description')
    if description_elem:
        channel_info['description'] = description_elem.text.strip()
    # Extract channel photo
    photo_elem = soup.find('i', class_='tgme_page_photo_image')
    if photo_elem and photo_elem.find('img'):
        channel_info['photo_url'] = photo_elem.find('img').get('src')
    # Extract recent posts; [::-1] reverses the order the page shows them in.
    message_containers = soup.find_all('div', class_='tgme_widget_message')
    for container in message_containers[::-1]:
        post_data = {}
        # data-post carries "<channel>/<message id>"
        post_data['id'] = container.get('data-post')
        text_elem = container.find('div', class_='tgme_widget_message_text')
        if text_elem:
            post_data['html_text'] = str(text_elem)
            formatted_text = ""

            # Recursively walk the message DOM, rebuilding a simplified
            # HTML string into the enclosing `formatted_text` accumulator.
            def process_node(node):
                nonlocal formatted_text
                if node.name == 'br':
                    formatted_text += '\n'
                elif node.name == 'b':
                    formatted_text += f"<b>{node.get_text()}</b>"
                elif node.name == 'i':
                    formatted_text += f"<i>{node.get_text()}</i>"
                elif node.name == 'a':
                    href = node.get('href', '')
                    formatted_text += f"<a href='{href}'>{node.get_text()}</a>"
                elif node.name == 'pre' or node.name == 'code':
                    formatted_text += f"<code>{node.get_text()}</code>"
                elif node.name == 'tg-emoji':
                    # Custom emoji widgets: fall back to the plain emoji glyph.
                    emoji_code = node.find('i', class_='emoji').get_text() if node.find('i', class_='emoji') else ''
                    formatted_text += emoji_code
                elif isinstance(node, NavigableString):
                    formatted_text += str(node)
                else:
                    # Unknown wrapper tag: descend into its children.
                    for child in node.children:
                        process_node(child)

            for child in text_elem.children:
                process_node(child)
            post_data['formatted_text'] = formatted_text
            post_data['plain_text'] = text_elem.get_text(separator=' ', strip=True)
        else:
            post_data['html_text'] = ""
            post_data['formatted_text'] = ""
            post_data['plain_text'] = ""
        # Get message date
        date_elem = container.find('a', class_='tgme_widget_message_date')
        if date_elem and date_elem.find('time'):
            post_data['date'] = date_elem.find('time')['datetime']
        # Get view count
        views_elem = container.find('span', class_='tgme_widget_message_views')
        if views_elem:
            post_data['views'] = views_elem.text.strip()
        # Check for media attachments
        photo_elem = container.find('a', class_='tgme_widget_message_photo_wrap')
        video_elem = container.find('a', class_='tgme_widget_message_video_player')
        if photo_elem:
            post_data['has_photo'] = True
            # The photo URL is embedded in an inline CSS background-image.
            if 'style' in photo_elem.attrs:
                style = photo_elem['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    photo_url = style[url_start:url_end].strip("'")
                    post_data['photo_url'] = photo_url.replace('\'', '').replace('"', '')
        else:
            post_data['has_photo'] = False
        if video_elem:
            post_data['has_video'] = True
            video_thumb = video_elem.find('i', class_='tgme_widget_message_video_thumb')
            if video_thumb and 'style' in video_thumb.attrs:
                style = video_thumb['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    thumbnail_url = style[url_start:url_end].strip("'")
                    post_data['video_thumbnail'] = thumbnail_url.replace('\'', '').replace('"', '')
            # Get video duration
            duration_elem = video_elem.find('time', class_='message_video_duration')
            if duration_elem:
                post_data['video_duration'] = duration_elem.text.strip()
        else:
            post_data['has_video'] = False
        # Add post to list
        channel_info['recent_posts'].append(post_data)
    return channel_info
def print_channel_info(channel_info):
    """Pretty-print a channel dict produced by get_telegram_channel_info."""
    if not channel_info:
        print("Could not retrieve channel information.")
        return
    header_lines = [
        f"TELEGRAM CHANNEL: @{channel_info['username']}",
        f"Name: {channel_info['channel_name']}",
        f"Subscribers: {channel_info['subscriber_count']}",
        f"URL: {channel_info['url']}",
    ]
    for line in header_lines:
        print(line)
    print("\nDescription:")
    print(channel_info['description'])
    posts = channel_info['recent_posts']
    print(f"\nRecent Posts ({len(posts)} found):")
    for index, post in enumerate(posts, 1):
        print(f"\nPost {index}:")
        print(f" Date: {post.get('date', 'Unknown')}")
        print(f" Views: {post.get('views', 'Unknown')}")
        # Summarise which media kinds the post carries.
        attached = [label for flag, label in (('has_photo', "Photo"), ('has_video', "Video")) if post.get(flag)]
        media_str = ", ".join(attached) if attached else "None"
        print(f" Media: {media_str}")
        body = post.get('formatted_text', '')
        if body:
            # Collapse <br> tags and runs of newlines, then whitespace,
            # before showing a 150-character preview.
            body = re.sub(r'<br\s*/?>', '\n', body)
            body = re.sub(r'\n+', '\n', body)
            preview = body[:150] + "..." if len(body) > 150 else body
            preview = re.sub(r'\s+', ' ', preview)
            print(f" Content: {preview}")
# Script entry point: prompt for a channel handle, scrape the public
# t.me/s/ preview page and print a summary of what was found.
if __name__ == "__main__":
    channel_name = input("Enter Telegram channel username (with or without @): ")
    print(f"\nFetching information for {channel_name}...")
    channel_info = get_telegram_channel_info(channel_name)
    print_channel_info(channel_info)