mirror of https://github.com/Alexander-D-Karpov/scripts.git (synced 2025-06-01 11:43:08 +03:00)

commit 29074ada79 (parent 06cd5b7bea): added SoundCloud downloader and Telegram web scraper, updated the Castbox downloader
@@ -24,61 +24,78 @@ def download_file(file_url):
    return local_filename


def get_data(url):
    r = requests.get(url)
    if r.status_code != 200:
        raise LookupError("Site not found")
    inner_data = r.text.splitlines()
    data = []
    for line in inner_data:
        if "window.__INITIAL_STATE__" in line:
            data.append(line)

    if len(data) != 1:
        raise ValueError("Payload not found")

    d = json.loads(unquote(data[0].split('"')[1::2][0]))  # type: dict
    return d


d = get_data(url)
while (
    not d
    or "ch" not in d
    or "chInfo" not in d["ch"]
    or "title" not in d["ch"]["chInfo"]
):
    d = get_data(url)
    print("Data not loaded, retrying...")

title = d["ch"]["chInfo"]["title"]
main_image = d["ch"]["chInfo"]["cover_web"]
author = d["ch"]["chInfo"]["author"]
episode_count = d["ch"]["chInfo"]["episode_count"]
print("Downloading podcast " + title)
episodes = d["ch"]["eps"]
if not os.path.isdir(title):
    os.mkdir(title)
for i, episode in enumerate(episodes):
    n_path = title + "/" + f"{episode['title']}.mp3"  # filename from the episode title
    if not os.path.exists(n_path):  # skip episodes that were already downloaded
        print(f"Downloading: {episode['title']}", end="\r")
        if "url" in episode and episode["url"]:
            ep_url = episode["url"]
        else:
            ep_url = episode["urls"][0]
        orig_path = download_file(ep_url)
        AudioSegment.from_file(orig_path).export(n_path)
        os.remove(orig_path)
        if "cover_url" not in episode or not episode["cover_url"]:
            img_path = download_file(main_image)
        else:
            img_path = download_file(episode["cover_url"])
        if "author" in episode and episode["author"]:
            ep_author = episode["author"]
        else:
            ep_author = author

        print(f"Processing: {episode['title']}", end="\r")
        tag = MP3(n_path, ID3=ID3)
        tag.tags.add(
            APIC(
                encoding=3,
                mime="image/png",
                type=3,
                desc="Cover",
                data=open(img_path, "rb").read(),
            )
        )
        tag.save()
        tag = EasyID3(n_path)

        tag["title"] = episode["title"]
        tag["album"] = title
        tag["tracknumber"] = f"{episode_count - i}/{episode_count}"
        tag["artist"] = ep_author

        tag.save()
        os.remove(img_path)
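The payload extraction in get_data is terse: split('"')[1::2] keeps every second segment after splitting on double quotes, i.e. the contents of the quoted strings, and the first of those is the URL-encoded JSON that Castbox assigns to window.__INITIAL_STATE__. A minimal sketch of the same idea on a made-up line (the real payload is far larger):

import json
from urllib.parse import unquote

# Hypothetical HTML line in the shape get_data() expects
line = 'window.__INITIAL_STATE__="%7B%22ch%22%3A%7B%22chInfo%22%3A%7B%7D%7D%7D";'
payload = line.split('"')[1::2][0]   # first quoted segment: the encoded JSON
print(json.loads(unquote(payload)))  # {'ch': {'chInfo': {}}}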
soundcloud_downloader.py (new file, 336 lines)
@@ -0,0 +1,336 @@
#!/usr/bin/env python3
"""
SoundCloud Downloader with ID3 Tags
-----------------------------------
This script downloads all tracks from a SoundCloud artist,
including proper ID3 tags and album artwork.

Requirements:
    pip install scdl mutagen requests tqdm
"""

import os
import sys
import subprocess
import json
import requests
from pathlib import Path
from mutagen.id3 import ID3, APIC, TIT2, TPE1, TALB, TDRC, TCON, TCOM, COMM
from tqdm import tqdm
import re
import argparse


# ANSI colors for terminal output
class Colors:
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BLUE = '\033[94m'
    ENDC = '\033[0m'


def setup_argparser():
    parser = argparse.ArgumentParser(description='Download all tracks from a SoundCloud artist with proper ID3 tags')
    parser.add_argument('url', help='SoundCloud URL (artist profile or likes page)')
    parser.add_argument('-o', '--output', default='downloads', help='Output directory')
    parser.add_argument('-c', '--client-id', help='SoundCloud client ID (optional)')
    parser.add_argument('--likes', action='store_true', help='Download liked tracks (auto-detected from URL)')
    parser.add_argument('--author', help='Explicitly set the author name for all tracks')
    parser.add_argument('--album', help='Explicitly set the album name for all tracks')
    parser.add_argument('--force-tags', action='store_true', help='Force update of ID3 tags even if they exist')
    return parser.parse_args()


def get_client_id():
    """Extract client_id by scraping SoundCloud's website"""
    print(f"{Colors.BLUE}[*] Obtaining SoundCloud client ID...{Colors.ENDC}")

    try:
        response = requests.get('https://soundcloud.com/')
        scripts = re.findall(r'<script crossorigin src="(.*?\.js)"', response.text)

        # Try to find client_id in the scripts
        for script_url in scripts:
            if not script_url.startswith('http'):
                script_url = 'https://soundcloud.com' + script_url

            script_content = requests.get(script_url).text
            client_id_match = re.search(r'"client_id":"([a-zA-Z0-9]+)"', script_content)
            if client_id_match:
                return client_id_match.group(1)
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting client ID: {e}{Colors.ENDC}")

    return None


def download_tracks(artist_url, output_dir, client_id=None, likes=False):
    """Download all tracks from the given artist URL or likes page"""
    if not client_id:
        client_id = get_client_id()

    if not client_id:
        print(f"{Colors.RED}[!] Failed to get client ID. Please provide it manually with --client-id{Colors.ENDC}")
        sys.exit(1)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Extract artist name from URL
    url_parts = artist_url.strip('/').split('/')
    artist_name = url_parts[-2] if likes or '/likes' in artist_url else url_parts[-1]

    print(
        f"{Colors.GREEN}[+] {'Downloading liked tracks' if likes else 'Downloading tracks'} for {artist_name} to {output_dir}{Colors.ENDC}")

    # Use scdl to download tracks
    cmd = [
        'scdl',
        '-l', artist_url,
        '--path', output_dir,
        '--client-id', client_id,
        '--flac',  # Try to get best quality where available
        '-c'  # Continue if download already exists
    ]

    # Add appropriate flag based on download type
    if likes or '/likes' in artist_url:
        cmd.append('-f')  # Download favorites/likes
    elif '/sets/' in artist_url:
        cmd.append('-p')  # Download playlist
    else:
        cmd.append('-a')  # Download all tracks from user

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        print(f"{Colors.RED}[!] Error running scdl: {e}{Colors.ENDC}")
        sys.exit(1)

    return output_dir, artist_name


def get_artist_info(artist_url, client_id):
    """Get artist information from SoundCloud API"""
    resolve_url = f"https://api-v2.soundcloud.com/resolve?url={artist_url}&client_id={client_id}"

    try:
        response = requests.get(resolve_url)
        data = response.json()
        return data
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting artist info: {e}{Colors.ENDC}")
        return None


def get_tracks_info(download_dir, client_id):
    """Get information about tracks from SoundCloud API"""
    print(f"{Colors.BLUE}[*] Gathering track information from SoundCloud...{Colors.ENDC}")

    # Find all MP3 files
    mp3_files = list(Path(download_dir).glob('*.mp3'))
    track_info_map = {}

    for mp3_file in mp3_files:
        # Try to extract track ID or permalink from filename
        # Many SoundCloud downloaders append the track ID to the filename
        track_id_match = re.search(r'[-_](\d{6,})(\.mp3)?$', mp3_file.stem)

        if track_id_match:
            # If we have a track ID, use it to get info from the API
            track_id = track_id_match.group(1)
            try:
                track_url = f"https://api-v2.soundcloud.com/tracks/{track_id}?client_id={client_id}"
                response = requests.get(track_url)
                if response.status_code == 200:
                    track_data = response.json()
                    track_info_map[mp3_file.name] = track_data
            except Exception as e:
                print(f"{Colors.YELLOW}[!] Warning: Could not get info for track ID {track_id}: {e}{Colors.ENDC}")

    return track_info_map


def extract_set_info(filename):
    """Extract information from set/playlist filenames"""
    # For files from sets: "Set Name_Artist - Track Title.mp3"
    set_match = re.search(r'^(.+?)_(.+?)\.mp3$', filename)
    if set_match:
        set_name = set_match.group(1).strip()
        title_part = set_match.group(2).strip()

        # Try to extract artist from title if it's in the "Artist - Title" format
        artist_title_match = re.search(r'^(.+?) - (.+)$', title_part)
        if artist_title_match:
            artist = artist_title_match.group(1).strip()
            title = artist_title_match.group(2).strip()
        else:
            # If no artist separator found, the whole part is the title
            artist = None
            title = title_part

        return {
            'set_name': set_name,
            'artist': artist,
            'title': title
        }

    # Another pattern: Some playlist files don't have the separator
    # Example: "Playlist Name - Track Title.mp3" without artist info
    alt_match = re.search(r'^(.+?) - (.+?)\.mp3$', filename)
    if alt_match:
        set_name = alt_match.group(1).strip()
        title = alt_match.group(2).strip()

        return {
            'set_name': set_name,
            'artist': None,  # No artist info in this format
            'title': title
        }

    return None


def extract_album_from_comments(tags):
    """Try to extract album information from ID3 comment tags"""
    if "COMM" in tags:
        comment = str(tags["COMM"])
        # Look for potential album indicators in comments
        album_match = re.search(r'CTCD-\d+\s+["\'](.+?)["\']', comment)
        if album_match:
            return album_match.group(1)

        # Another pattern: Album name followed by E.P. or EP
        ep_match = re.search(r'([^"\']+?)\s+E\.?P\.?', comment)
        if ep_match:
            return f"{ep_match.group(1)} E.P."

    return None


def fix_id3_tags(download_dir, artist_name, client_id, forced_author=None, forced_album=None, force_tags=False):
    """Fix ID3 tags and add album artwork to downloaded files"""
    print(f"{Colors.BLUE}[*] Adding ID3 tags and artwork...{Colors.ENDC}")

    # Get artist info
    artist_info = get_artist_info(f"https://soundcloud.com/{artist_name}", client_id)
    artist_avatar_url = artist_info.get('avatar_url') if artist_info else None

    # Try to get additional track info from SoundCloud API
    track_info_map = get_tracks_info(download_dir, client_id)

    # Download artist avatar for use as album art if needed
    avatar_data = None
    if artist_avatar_url:
        try:
            # Get highest resolution image by replacing size in URL
            hi_res_avatar_url = artist_avatar_url.replace('-large', '-t500x500')
            avatar_response = requests.get(hi_res_avatar_url)
            avatar_data = avatar_response.content
        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not download artist avatar: {e}{Colors.ENDC}")

    # Process all MP3 files
    downloaded_files = list(Path(download_dir).glob('*.mp3'))
    processed_count = 0
    skipped_count = 0

    for mp3_file in tqdm(downloaded_files, desc="Processing files"):
        try:
            # Read or create ID3 tags
            try:
                tags = ID3(mp3_file)
                # Skip if tags exist and force_tags is not set
                if not force_tags and "TIT2" in tags and "TPE1" in tags and "TALB" in tags:
                    skipped_count += 1
                    continue
            except:
                # Create new ID3 tag if not present
                tags = ID3()

            # Extract information from filename
            set_info = extract_set_info(mp3_file.name)

            # Initialize variables
            title = None
            artist = forced_author
            album = forced_album

            # Get title from set_info or filename
            if set_info:
                title = set_info['title']
                # Only use artist from set_info if forced_author not provided
                if not artist and set_info['artist']:
                    artist = set_info['artist']
                # Only use set_name as album if forced_album not provided
                if not album:
                    album = set_info['set_name']
            else:
                # Try to extract from regular filename
                filename_match = re.search(r'(.+?) - (.+?)\.mp3$', mp3_file.name)
                if filename_match:
                    if not artist:
                        artist = filename_match.group(1).strip()
                    title = filename_match.group(2).strip()
                else:
                    # Just use the filename as title
                    title = mp3_file.stem

            # Try to extract album info from existing tags if available
            if not album and "COMM" in tags:
                album_from_comment = extract_album_from_comments(tags)
                if album_from_comment:
                    album = album_from_comment

            # If no album was determined, use a default
            if not album:
                album = "Unknown Album"

            # If no artist was determined, use the forced_author or a default
            if not artist:
                artist = forced_author or "Unknown Artist"

            # Set ID3 tags
            tags["TIT2"] = TIT2(encoding=3, text=title)
            tags["TPE1"] = TPE1(encoding=3, text=artist)
            tags["TALB"] = TALB(encoding=3, text=album)

            # Add artwork if we have it and it's missing or we're forcing updates
            if avatar_data and (force_tags or not any(tag.startswith('APIC') for tag in tags.keys())):
                tags["APIC"] = APIC(
                    encoding=3,
                    mime="image/jpeg",
                    type=3,  # Cover (front)
                    desc="Cover",
                    data=avatar_data
                )

            # Save tags to file
            tags.save(mp3_file)
            processed_count += 1

        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not process file {mp3_file}: {e}{Colors.ENDC}")

    print(
        f"{Colors.GREEN}[+] Successfully processed {processed_count} files, skipped {skipped_count} files with existing tags{Colors.ENDC}")


def main():
    args = setup_argparser()

    print(f"{Colors.GREEN}[+] SoundCloud Downloader with ID3 Tags{Colors.ENDC}")

    # Auto-detect likes URL if not explicitly set
    likes = args.likes or '/likes' in args.url

    download_dir, artist_name = download_tracks(args.url, args.output, args.client_id, likes)
    client_id = args.client_id or get_client_id()
    fix_id3_tags(download_dir, artist_name, client_id, args.author, args.album, args.force_tags)

    print(f"{Colors.GREEN}[+] All done! Downloaded tracks are in: {download_dir}{Colors.ENDC}")


if __name__ == "__main__":
    main()
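The tagging pass above is best-effort: get_tracks_info only finds API metadata when the numeric track ID survives in the filename, which depends on how scdl was configured to name files. A small sketch of the filename heuristic it relies on (the stems are made up):

import re

# The first stem carries a plausible trailing track ID, the second does not
for stem in ["artist - song_1234567890", "artist - song"]:
    m = re.search(r'[-_](\d{6,})(\.mp3)?$', stem)
    print(stem, "->", m.group(1) if m else "no track ID")

A typical invocation would be python soundcloud_downloader.py https://soundcloud.com/some-artist -o downloads (URL illustrative); pass --likes or point at a /sets/ URL and download_tracks switches the scdl flag accordingly.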
tg_scrap.py (new file, 219 lines)
@@ -0,0 +1,219 @@
import re

import requests
from bs4 import BeautifulSoup, NavigableString
import time
import random


def get_telegram_channel_info(channel_username):
    channel_username = channel_username.strip('@')
    url = f'https://t.me/s/{channel_username}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://t.me/',
        'Sec-Ch-Ua': '"Google Chrome";v="122", "Chromium";v="122", "Not(A:Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }

    time.sleep(random.uniform(1, 2))

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching channel: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    channel_info = {
        'username': channel_username,
        'url': f'https://t.me/{channel_username}',
        'subscriber_count': 'Unknown',
        'channel_name': 'Unknown',
        'description': 'Unknown',
        'photo_url': None,
        'recent_posts': []
    }

    # Extract channel name
    channel_name_elem = soup.find('div', class_='tgme_channel_info_header_title')
    if channel_name_elem and channel_name_elem.find('span'):
        channel_info['channel_name'] = channel_name_elem.find('span').text.strip()

    # Extract subscriber count
    subscriber_counter = soup.find('div', class_='tgme_channel_info_counter',
                                   string=lambda text: 'subscribers' in text if text else False)
    if subscriber_counter and subscriber_counter.find('span', class_='counter_value'):
        channel_info['subscriber_count'] = subscriber_counter.find('span', class_='counter_value').text.strip()

    # Alternative location for subscriber count
    if channel_info['subscriber_count'] == 'Unknown':
        header_counter = soup.find('div', class_='tgme_header_counter')
        if header_counter:
            channel_info['subscriber_count'] = header_counter.text.strip().replace('subscribers', '').strip()

    # Extract description
    description_elem = soup.find('div', class_='tgme_channel_info_description')
    if description_elem:
        channel_info['description'] = description_elem.text.strip()

    # Extract channel photo
    photo_elem = soup.find('i', class_='tgme_page_photo_image')
    if photo_elem and photo_elem.find('img'):
        channel_info['photo_url'] = photo_elem.find('img').get('src')

    # Extract recent posts
    message_containers = soup.find_all('div', class_='tgme_widget_message')

    for container in message_containers[::-1]:
        post_data = {}

        post_data['id'] = container.get('data-post')

        text_elem = container.find('div', class_='tgme_widget_message_text')
        if text_elem:
            post_data['html_text'] = str(text_elem)

            formatted_text = ""

            def process_node(node):
                nonlocal formatted_text

                if node.name == 'br':
                    formatted_text += '\n'
                elif node.name == 'b':
                    formatted_text += f"<b>{node.get_text()}</b>"
                elif node.name == 'i':
                    formatted_text += f"<i>{node.get_text()}</i>"
                elif node.name == 'a':
                    href = node.get('href', '')
                    formatted_text += f"<a href='{href}'>{node.get_text()}</a>"
                elif node.name == 'pre' or node.name == 'code':
                    formatted_text += f"<code>{node.get_text()}</code>"
                elif node.name == 'tg-emoji':
                    emoji_code = node.find('i', class_='emoji').get_text() if node.find('i', class_='emoji') else ''
                    formatted_text += emoji_code
                elif isinstance(node, NavigableString):
                    formatted_text += str(node)
                else:
                    for child in node.children:
                        process_node(child)

            for child in text_elem.children:
                process_node(child)

            post_data['formatted_text'] = formatted_text
            post_data['plain_text'] = text_elem.get_text(separator=' ', strip=True)
        else:
            post_data['html_text'] = ""
            post_data['formatted_text'] = ""
            post_data['plain_text'] = ""

        # Get message date
        date_elem = container.find('a', class_='tgme_widget_message_date')
        if date_elem and date_elem.find('time'):
            post_data['date'] = date_elem.find('time')['datetime']

        # Get view count
        views_elem = container.find('span', class_='tgme_widget_message_views')
        if views_elem:
            post_data['views'] = views_elem.text.strip()

        # Check for media attachments
        photo_elem = container.find('a', class_='tgme_widget_message_photo_wrap')
        video_elem = container.find('a', class_='tgme_widget_message_video_player')

        if photo_elem:
            post_data['has_photo'] = True
            if 'style' in photo_elem.attrs:
                style = photo_elem['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    photo_url = style[url_start:url_end].strip("'")
                    post_data['photo_url'] = photo_url.replace('\'', '').replace('"', '')
        else:
            post_data['has_photo'] = False

        if video_elem:
            post_data['has_video'] = True
            video_thumb = video_elem.find('i', class_='tgme_widget_message_video_thumb')
            if video_thumb and 'style' in video_thumb.attrs:
                style = video_thumb['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    thumbnail_url = style[url_start:url_end].strip("'")
                    post_data['video_thumbnail'] = thumbnail_url.replace('\'', '').replace('"', '')

            # Get video duration
            duration_elem = video_elem.find('time', class_='message_video_duration')
            if duration_elem:
                post_data['video_duration'] = duration_elem.text.strip()
        else:
            post_data['has_video'] = False

        # Add post to list
        channel_info['recent_posts'].append(post_data)

    return channel_info


def print_channel_info(channel_info):
    if not channel_info:
        print("Could not retrieve channel information.")
        return

    print(f"TELEGRAM CHANNEL: @{channel_info['username']}")
    print(f"Name: {channel_info['channel_name']}")
    print(f"Subscribers: {channel_info['subscriber_count']}")
    print(f"URL: {channel_info['url']}")

    print("\nDescription:")
    print(channel_info['description'])

    print(f"\nRecent Posts ({len(channel_info['recent_posts'])} found):")
    for i, post in enumerate(channel_info['recent_posts'], 1):
        print(f"\nPost {i}:")
        print(f"  Date: {post.get('date', 'Unknown')}")
        print(f"  Views: {post.get('views', 'Unknown')}")

        media_types = []
        if post.get('has_photo'):
            media_types.append("Photo")
        if post.get('has_video'):
            media_types.append("Video")

        media_str = ", ".join(media_types) if media_types else "None"
        print(f"  Media: {media_str}")

        formatted_text = post.get('formatted_text', '')
        if formatted_text:
            formatted_text = re.sub(r'<br\s*/?>', '\n', formatted_text)
            formatted_text = re.sub(r'\n+', '\n', formatted_text)

            if len(formatted_text) > 150:
                preview = formatted_text[:150] + "..."
            else:
                preview = formatted_text

            preview = re.sub(r'\s+', ' ', preview)
            print(f"  Content: {preview}")


if __name__ == "__main__":
    channel_name = input("Enter Telegram channel username (with or without @): ")

    print(f"\nFetching information for {channel_name}...")
    channel_info = get_telegram_channel_info(channel_name)

    print_channel_info(channel_info)
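tg_scrap.py needs no API credentials: it scrapes the public t.me/s/<channel> preview page, which only exists for public channels. A short sketch of programmatic use (the channel name is illustrative; the keys match the channel_info dict built above, and the [::-1] reversal puts the newest post first):

info = get_telegram_channel_info("some_public_channel")
if info:
    print(info["channel_name"], "-", info["subscriber_count"], "subscribers")
    for post in info["recent_posts"][:3]:  # three newest posts
        print(post.get("date", "?"), post.get("plain_text", "")[:80])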