mirror of
https://github.com/Alexander-D-Karpov/scripts.git
synced 2025-06-01 11:43:08 +03:00
added soundcloud downloader, tg web scraper, updated castbox
This commit is contained in:
parent
06cd5b7bea
commit
29074ada79
|
@ -24,61 +24,78 @@ def download_file(file_url):
|
|||
return local_filename
|
||||
|
||||
|
||||
r = requests.get(url)
|
||||
if r.status_code != 200:
|
||||
raise LookupError("Site not found")
|
||||
inner_data = r.text.splitlines()
|
||||
data = []
|
||||
for line in inner_data:
|
||||
if "window.__INITIAL_STATE__" in line:
|
||||
data.append(line)
|
||||
def get_data(url):
    """Fetch *url* and return the embedded ``window.__INITIAL_STATE__`` payload.

    Raises:
        LookupError: when the page does not return HTTP 200.
        ValueError: when the state payload line is missing or ambiguous.
    """
    response = requests.get(url)
    if response.status_code != 200:
        raise LookupError("Site not found")

    # Exactly one line of the page should carry the serialized app state.
    payload_lines = [
        line for line in response.text.splitlines()
        if "window.__INITIAL_STATE__" in line
    ]
    if len(payload_lines) != 1:
        raise ValueError("Payload not found")

    # The state is URL-encoded inside the first double-quoted string on the line.
    quoted_parts = payload_lines[0].split('"')[1::2]
    return json.loads(unquote(quoted_parts[0]))  # type: dict
|
||||
|
||||
|
||||
# Retry until the payload actually contains the channel info we need.
d = get_data(url)
while (
    not d
    or "ch" not in d
    or "chInfo" not in d["ch"]
    or "title" not in d["ch"]["chInfo"]
):
    print("Data not loaded, retrying...")
    d = get_data(url)

title = d["ch"]["chInfo"]["title"]
main_image = d["ch"]["chInfo"]["cover_web"]
author = d["ch"]["chInfo"]["author"]
episode_count = d["ch"]["chInfo"]["episode_count"]
print("Downloading podcast " + title)
episodes = d["ch"]["eps"]
if not os.path.isdir(title):
    os.mkdir(title)

for i, episode in enumerate(episodes):
    # BUG FIX: the target path previously used the podcast title for every
    # episode, so after the first download the os.path.exists() guard skipped
    # all remaining episodes. Name each file after its own episode instead;
    # "/" is replaced so episode titles cannot escape the podcast directory.
    safe_ep_title = episode["title"].replace("/", "_")
    n_path = title + "/" + f"{safe_ep_title}.mp3"
    if not os.path.exists(n_path):
        print(f"Downloading: {episode['title']}", end="\r")
        # Prefer the direct URL when present; fall back to the first mirror.
        if "url" in episode and episode["url"]:
            ep_url = episode["url"]
        else:
            ep_url = episode["urls"][0]
        orig_path = download_file(ep_url)
        # Re-export through pydub to normalize the container to MP3.
        AudioSegment.from_file(orig_path).export(n_path)
        os.remove(orig_path)

        # Cover art: per-episode cover when available, channel cover otherwise.
        if "cover_url" not in episode or not episode["cover_url"]:
            img_path = download_file(main_image)
        else:
            img_path = download_file(episode["cover_url"])
        # Author: per-episode author when available, channel author otherwise.
        if "author" in episode and episode["author"]:
            ep_author = episode["author"]
        else:
            ep_author = author

        print(f"Processing: {episode['title']}", end="\r")
        # Embed the cover image as an ID3 APIC frame.
        tag = MP3(n_path, ID3=ID3)
        tag.tags.add(
            APIC(
                encoding=3,
                mime="image/png",
                type=3,
                desc="Cover",
                data=open(img_path, "rb").read(),
            )
        )
        tag.save()
        # Fill in the text frames via the simpler EasyID3 interface.
        tag = EasyID3(n_path)
        tag["title"] = episode["title"]
        tag["album"] = title
        # Episodes arrive newest-first, so the track number counts down.
        tag["tracknumber"] = f"{episode_count - i}/{episode_count}"
        tag["artist"] = ep_author
        tag.save()
        os.remove(img_path)
|
||||
|
|
336
soundcloud_downloader.py
Normal file
336
soundcloud_downloader.py
Normal file
|
@ -0,0 +1,336 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
SoundCloud Downloader with ID3 Tags
|
||||
-----------------------------------
|
||||
This script downloads all tracks from a SoundCloud artist,
|
||||
including proper ID3 tags and album artwork.
|
||||
|
||||
Requirements:
|
||||
pip install scdl mutagen requests tqdm
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import json
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from mutagen.id3 import ID3, APIC, TIT2, TPE1, TALB, TDRC, TCON, TCOM, COMM
|
||||
from tqdm import tqdm
|
||||
import re
|
||||
import argparse
|
||||
|
||||
|
||||
# ANSI colors for terminal output
class Colors:
    """ANSI escape sequences used to colorize console messages."""
    GREEN = '\033[92m'   # success / progress-complete messages
    YELLOW = '\033[93m'  # non-fatal warnings
    RED = '\033[91m'     # errors
    BLUE = '\033[94m'    # informational / in-progress messages
    ENDC = '\033[0m'     # reset back to the default terminal color
|
||||
|
||||
|
||||
def setup_argparser():
    """Build the CLI parser and parse ``sys.argv``.

    Returns the parsed ``argparse.Namespace`` with url, output, client_id,
    likes, author, album and force_tags attributes.
    """
    parser = argparse.ArgumentParser(
        description='Download all tracks from a SoundCloud artist with proper ID3 tags')
    parser.add_argument('url', help='SoundCloud URL (artist profile or likes page)')

    # Optional switches, declared in display order.
    optional_specs = [
        (('-o', '--output'),
         {'default': 'downloads', 'help': 'Output directory'}),
        (('-c', '--client-id'),
         {'help': 'SoundCloud client ID (optional)'}),
        (('--likes',),
         {'action': 'store_true', 'help': 'Download liked tracks (auto-detected from URL)'}),
        (('--author',),
         {'help': 'Explicitly set the author name for all tracks'}),
        (('--album',),
         {'help': 'Explicitly set the album name for all tracks'}),
        (('--force-tags',),
         {'action': 'store_true', 'help': 'Force update of ID3 tags even if they exist'}),
    ]
    for flags, options in optional_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
|
||||
|
||||
|
||||
def get_client_id():
    """Scrape soundcloud.com for a usable API client_id.

    Returns the client id string, or None when it cannot be found.
    """
    print(f"{Colors.BLUE}[*] Obtaining SoundCloud client ID...{Colors.ENDC}")

    try:
        homepage = requests.get('https://soundcloud.com/')
        script_urls = re.findall(r'<script crossorigin src="(.*?\.js)"', homepage.text)

        # The client_id is embedded in one of the crossorigin JS bundles.
        for candidate in script_urls:
            if not candidate.startswith('http'):
                candidate = 'https://soundcloud.com' + candidate

            bundle = requests.get(candidate).text
            found = re.search(r'"client_id":"([a-zA-Z0-9]+)"', bundle)
            if found:
                return found.group(1)
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting client ID: {e}{Colors.ENDC}")

    return None
|
||||
|
||||
|
||||
def download_tracks(artist_url, output_dir, client_id=None, likes=False):
    """Download every track from an artist profile, likes page, or set via scdl.

    Returns (output_dir, artist_name). Exits the process when no client id is
    available or when scdl fails.
    """
    if not client_id:
        client_id = get_client_id()

    if not client_id:
        print(f"{Colors.RED}[!] Failed to get client ID. Please provide it manually with --client-id{Colors.ENDC}")
        sys.exit(1)

    # Make sure the destination exists before scdl writes into it.
    os.makedirs(output_dir, exist_ok=True)

    # The artist permalink is the last URL segment; on "/likes" pages it is
    # one segment earlier.
    segments = artist_url.strip('/').split('/')
    downloading_likes = likes or '/likes' in artist_url
    artist_name = segments[-2] if downloading_likes else segments[-1]

    print(
        f"{Colors.GREEN}[+] {'Downloading liked tracks' if likes else 'Downloading tracks'} for {artist_name} to {output_dir}{Colors.ENDC}")

    # Base scdl invocation; a mode flag is appended below.
    command = [
        'scdl',
        '-l', artist_url,
        '--path', output_dir,
        '--client-id', client_id,
        '--flac',  # Try to get best quality where available
        '-c'       # Continue if download already exists
    ]

    if downloading_likes:
        command.append('-f')   # Download favorites/likes
    elif '/sets/' in artist_url:
        command.append('-p')   # Download playlist
    else:
        command.append('-a')   # Download all tracks from user

    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"{Colors.RED}[!] Error running scdl: {e}{Colors.ENDC}")
        sys.exit(1)

    return output_dir, artist_name
|
||||
|
||||
|
||||
def get_artist_info(artist_url, client_id):
    """Resolve *artist_url* through the SoundCloud v2 API.

    Returns the decoded JSON payload, or None on any request/decoding error.
    """
    resolve_url = f"https://api-v2.soundcloud.com/resolve?url={artist_url}&client_id={client_id}"

    try:
        return requests.get(resolve_url).json()
    except Exception as e:
        print(f"{Colors.RED}[!] Error getting artist info: {e}{Colors.ENDC}")
        return None
|
||||
|
||||
|
||||
def get_tracks_info(download_dir, client_id):
    """Look up SoundCloud API metadata for downloaded MP3s.

    Returns a dict mapping mp3 filename -> API track payload. Files whose
    names carry no recognizable numeric track id are skipped.
    """
    print(f"{Colors.BLUE}[*] Gathering track information from SoundCloud...{Colors.ENDC}")

    info_by_filename = {}

    # Many SoundCloud downloaders append the numeric track ID to the
    # filename; when present, use it to query the v2 API.
    for mp3_file in Path(download_dir).glob('*.mp3'):
        id_match = re.search(r'[-_](\d{6,})(\.mp3)?$', mp3_file.stem)
        if not id_match:
            continue

        track_id = id_match.group(1)
        try:
            api_url = f"https://api-v2.soundcloud.com/tracks/{track_id}?client_id={client_id}"
            response = requests.get(api_url)
            if response.status_code == 200:
                info_by_filename[mp3_file.name] = response.json()
        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not get info for track ID {track_id}: {e}{Colors.ENDC}")

    return info_by_filename
|
||||
|
||||
|
||||
def extract_set_info(filename):
    """Extract set name, artist and title from a set/playlist-style filename.

    Returns a dict with 'set_name', 'artist' (None when absent) and 'title',
    or None when the filename matches neither known pattern.
    """
    # Pattern 1: "Set Name_Artist - Track Title.mp3" (underscore separator).
    underscore_form = re.search(r'^(.+?)_(.+?)\.mp3$', filename)
    if underscore_form:
        collection = underscore_form.group(1).strip()
        remainder = underscore_form.group(2).strip()

        # The remainder may itself be in "Artist - Title" form.
        split_artist = re.search(r'^(.+?) - (.+)$', remainder)
        if split_artist:
            performer = split_artist.group(1).strip()
            track = split_artist.group(2).strip()
        else:
            # No separator: the whole remainder is the title.
            performer = None
            track = remainder

        return {
            'set_name': collection,
            'artist': performer,
            'title': track
        }

    # Pattern 2: "Playlist Name - Track Title.mp3" (no artist segment).
    dash_form = re.search(r'^(.+?) - (.+?)\.mp3$', filename)
    if dash_form:
        return {
            'set_name': dash_form.group(1).strip(),
            'artist': None,  # No artist info in this format
            'title': dash_form.group(2).strip()
        }

    return None
|
||||
|
||||
|
||||
def extract_album_from_comments(tags):
    """Try to recover an album name from an ID3 COMM (comment) frame.

    Returns the album string, or None when no known pattern matches.
    """
    if "COMM" not in tags:
        return None

    comment = str(tags["COMM"])

    # Catalogue-number style: CTCD-123 "Album Name"
    catalogue = re.search(r'CTCD-\d+\s+["\'](.+?)["\']', comment)
    if catalogue:
        return catalogue.group(1)

    # "Something EP" / "Something E.P." style — normalize to "E.P."
    ep = re.search(r'([^"\']+?)\s+E\.?P\.?', comment)
    if ep:
        return f"{ep.group(1)} E.P."

    return None
|
||||
|
||||
|
||||
def fix_id3_tags(download_dir, artist_name, client_id, forced_author=None, forced_album=None, force_tags=False):
    """Fix ID3 tags and add album artwork to downloaded MP3 files.

    Args:
        download_dir: Directory containing the downloaded .mp3 files.
        artist_name: SoundCloud permalink used to resolve the artist profile.
        client_id: SoundCloud API client id.
        forced_author: If given, overrides the artist tag for all tracks.
        forced_album: If given, overrides the album tag for all tracks.
        force_tags: Re-write tags even when title/artist/album already exist.
    """
    print(f"{Colors.BLUE}[*] Adding ID3 tags and artwork...{Colors.ENDC}")

    # Get artist info (the avatar doubles as fallback cover art).
    artist_info = get_artist_info(f"https://soundcloud.com/{artist_name}", client_id)
    artist_avatar_url = artist_info.get('avatar_url') if artist_info else None

    # Try to get additional track info from SoundCloud API.
    # NOTE(review): track_info_map is never read below — kept for behavioral
    # parity (it logs progress/warnings), but it could be wired into tagging.
    track_info_map = get_tracks_info(download_dir, client_id)

    # Download artist avatar for use as album art if needed
    avatar_data = None
    if artist_avatar_url:
        try:
            # Get highest resolution image by replacing size in URL
            hi_res_avatar_url = artist_avatar_url.replace('-large', '-t500x500')
            avatar_response = requests.get(hi_res_avatar_url)
            avatar_data = avatar_response.content
        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not download artist avatar: {e}{Colors.ENDC}")

    # Process all MP3 files
    downloaded_files = list(Path(download_dir).glob('*.mp3'))
    processed_count = 0
    skipped_count = 0

    for mp3_file in tqdm(downloaded_files, desc="Processing files"):
        try:
            # Read or create ID3 tags
            try:
                tags = ID3(mp3_file)
                # Skip if tags exist and force_tags is not set
                if not force_tags and "TIT2" in tags and "TPE1" in tags and "TALB" in tags:
                    skipped_count += 1
                    continue
            except Exception:
                # BUG FIX: was a bare `except:`, which would also swallow
                # KeyboardInterrupt/SystemExit. Create a new ID3 tag when the
                # file has no existing header.
                tags = ID3()

            # Extract information from filename
            set_info = extract_set_info(mp3_file.name)

            # Initialize variables; explicit CLI overrides take precedence.
            title = None
            artist = forced_author
            album = forced_album

            # Get title from set_info or filename
            if set_info:
                title = set_info['title']
                # Only use artist from set_info if forced_author not provided
                if not artist and set_info['artist']:
                    artist = set_info['artist']
                # Only use set_name as album if forced_album not provided
                if not album:
                    album = set_info['set_name']
            else:
                # Try to extract from a regular "Artist - Title.mp3" filename
                filename_match = re.search(r'(.+?) - (.+?)\.mp3$', mp3_file.name)
                if filename_match:
                    if not artist:
                        artist = filename_match.group(1).strip()
                    title = filename_match.group(2).strip()
                else:
                    # Just use the filename as title
                    title = mp3_file.stem

            # Try to extract album info from existing comment tags if available
            if not album and "COMM" in tags:
                album_from_comment = extract_album_from_comments(tags)
                if album_from_comment:
                    album = album_from_comment

            # If no album was determined, use a default
            if not album:
                album = "Unknown Album"

            # If no artist was determined, use the forced_author or a default
            if not artist:
                artist = forced_author or "Unknown Artist"

            # Set ID3 text frames (encoding=3 is UTF-8)
            tags["TIT2"] = TIT2(encoding=3, text=title)
            tags["TPE1"] = TPE1(encoding=3, text=artist)
            tags["TALB"] = TALB(encoding=3, text=album)

            # Add artwork if we have it and it's missing or we're forcing updates
            if avatar_data and (force_tags or not any(tag.startswith('APIC') for tag in tags.keys())):
                tags["APIC"] = APIC(
                    encoding=3,
                    mime="image/jpeg",
                    type=3,  # Cover (front)
                    desc="Cover",
                    data=avatar_data
                )

            # Save tags to file
            tags.save(mp3_file)
            processed_count += 1

        except Exception as e:
            print(f"{Colors.YELLOW}[!] Warning: Could not process file {mp3_file}: {e}{Colors.ENDC}")

    print(
        f"{Colors.GREEN}[+] Successfully processed {processed_count} files, skipped {skipped_count} files with existing tags{Colors.ENDC}")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: download the tracks, then normalize their tags."""
    args = setup_argparser()

    print(f"{Colors.GREEN}[+] SoundCloud Downloader with ID3 Tags{Colors.ENDC}")

    # Auto-detect likes URL if not explicitly set
    wants_likes = args.likes or '/likes' in args.url

    download_dir, artist_name = download_tracks(
        args.url, args.output, args.client_id, wants_likes)
    client_id = args.client_id or get_client_id()
    fix_id3_tags(download_dir, artist_name, client_id,
                 args.author, args.album, args.force_tags)

    print(f"{Colors.GREEN}[+] All done! Downloaded tracks are in: {download_dir}{Colors.ENDC}")
|
219
tg_scrap.py
Normal file
219
tg_scrap.py
Normal file
|
@ -0,0 +1,219 @@
|
|||
import re
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup, NavigableString
|
||||
import time
|
||||
import random
|
||||
|
||||
|
||||
def get_telegram_channel_info(channel_username):
    """Scrape the public t.me/s/<channel> preview page for channel metadata.

    Args:
        channel_username: Channel username, with or without a leading '@'.

    Returns:
        A dict with 'username', 'url', 'subscriber_count', 'channel_name',
        'description', 'photo_url' and 'recent_posts', or None when the page
        could not be fetched.
    """
    channel_username = channel_username.strip('@')
    # t.me/s/<name> is the embeddable web preview that works without login.
    url = f'https://t.me/s/{channel_username}'
    # Browser-like headers — presumably so Telegram serves the full preview
    # page rather than an app-redirect stub; TODO confirm which are required.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language': 'en-US,en;q=0.9',
        'Referer': 'https://t.me/',
        'Sec-Ch-Ua': '"Google Chrome";v="122", "Chromium";v="122", "Not(A:Brand";v="24"',
        'Sec-Ch-Ua-Mobile': '?0',
        'Sec-Ch-Ua-Platform': '"Windows"',
        'Sec-Fetch-Dest': 'document',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-User': '?1',
        'Upgrade-Insecure-Requests': '1'
    }

    # Random short delay before each fetch to stay polite to the endpoint.
    time.sleep(random.uniform(1, 2))

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error fetching channel: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # Defaults used when a field cannot be found in the markup.
    channel_info = {
        'username': channel_username,
        'url': f'https://t.me/{channel_username}',
        'subscriber_count': 'Unknown',
        'channel_name': 'Unknown',
        'description': 'Unknown',
        'photo_url': None,
        'recent_posts': []
    }

    # Extract channel name
    channel_name_elem = soup.find('div', class_='tgme_channel_info_header_title')
    if channel_name_elem and channel_name_elem.find('span'):
        channel_info['channel_name'] = channel_name_elem.find('span').text.strip()

    # Extract subscriber count (counter divs exist for several metrics, so
    # filter on the word "subscribers")
    subscriber_counter = soup.find('div', class_='tgme_channel_info_counter',
                                   string=lambda text: 'subscribers' in text if text else False)
    if subscriber_counter and subscriber_counter.find('span', class_='counter_value'):
        channel_info['subscriber_count'] = subscriber_counter.find('span', class_='counter_value').text.strip()

    # Alternative location for subscriber count
    if channel_info['subscriber_count'] == 'Unknown':
        header_counter = soup.find('div', class_='tgme_header_counter')
        if header_counter:
            channel_info['subscriber_count'] = header_counter.text.strip().replace('subscribers', '').strip()

    # Extract description
    description_elem = soup.find('div', class_='tgme_channel_info_description')
    if description_elem:
        channel_info['description'] = description_elem.text.strip()

    # Extract channel photo
    photo_elem = soup.find('i', class_='tgme_page_photo_image')
    if photo_elem and photo_elem.find('img'):
        channel_info['photo_url'] = photo_elem.find('img').get('src')

    # Extract recent posts
    message_containers = soup.find_all('div', class_='tgme_widget_message')

    # Iterated in reverse document order — presumably so the newest post ends
    # up first in 'recent_posts'; verify against the page's actual ordering.
    for container in message_containers[::-1]:
        post_data = {}

        # Raw value of the widget's data-post attribute (message identifier).
        post_data['id'] = container.get('data-post')

        text_elem = container.find('div', class_='tgme_widget_message_text')
        if text_elem:
            post_data['html_text'] = str(text_elem)

            formatted_text = ""

            # Walk the message HTML and rebuild a simplified markup string:
            # keeps <b>/<i>/<a>/<code>, flattens everything else to plain text.
            def process_node(node):
                nonlocal formatted_text

                if node.name == 'br':
                    formatted_text += '\n'
                elif node.name == 'b':
                    formatted_text += f"<b>{node.get_text()}</b>"
                elif node.name == 'i':
                    formatted_text += f"<i>{node.get_text()}</i>"
                elif node.name == 'a':
                    href = node.get('href', '')
                    formatted_text += f"<a href='{href}'>{node.get_text()}</a>"
                elif node.name == 'pre' or node.name == 'code':
                    formatted_text += f"<code>{node.get_text()}</code>"
                elif node.name == 'tg-emoji':
                    # Custom emoji widget: fall back to the plain emoji glyph.
                    emoji_code = node.find('i', class_='emoji').get_text() if node.find('i', class_='emoji') else ''
                    formatted_text += emoji_code
                elif isinstance(node, NavigableString):
                    formatted_text += str(node)
                else:
                    # Unknown wrapper tag: recurse into its children.
                    for child in node.children:
                        process_node(child)

            for child in text_elem.children:
                process_node(child)

            post_data['formatted_text'] = formatted_text
            post_data['plain_text'] = text_elem.get_text(separator=' ', strip=True)
        else:
            # Media-only messages have no text element.
            post_data['html_text'] = ""
            post_data['formatted_text'] = ""
            post_data['plain_text'] = ""

        # Get message date
        date_elem = container.find('a', class_='tgme_widget_message_date')
        if date_elem and date_elem.find('time'):
            post_data['date'] = date_elem.find('time')['datetime']

        # Get view count
        views_elem = container.find('span', class_='tgme_widget_message_views')
        if views_elem:
            post_data['views'] = views_elem.text.strip()

        # Check for media attachments
        photo_elem = container.find('a', class_='tgme_widget_message_photo_wrap')
        video_elem = container.find('a', class_='tgme_widget_message_video_player')

        if photo_elem:
            post_data['has_photo'] = True
            # The photo URL lives in inline CSS: background-image:url('...')
            if 'style' in photo_elem.attrs:
                style = photo_elem['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    photo_url = style[url_start:url_end].strip("'")
                    post_data['photo_url'] = photo_url.replace('\'', '').replace('"', '')
        else:
            post_data['has_photo'] = False

        if video_elem:
            post_data['has_video'] = True
            # Thumbnail URL is embedded the same way as photos, via inline CSS.
            video_thumb = video_elem.find('i', class_='tgme_widget_message_video_thumb')
            if video_thumb and 'style' in video_thumb.attrs:
                style = video_thumb['style']
                if 'background-image' in style:
                    url_start = style.find('url(') + 4
                    url_end = style.find(')', url_start)
                    thumbnail_url = style[url_start:url_end].strip("'")
                    post_data['video_thumbnail'] = thumbnail_url.replace('\'', '').replace('"', '')

            # Get video duration
            duration_elem = video_elem.find('time', class_='message_video_duration')
            if duration_elem:
                post_data['video_duration'] = duration_elem.text.strip()
        else:
            post_data['has_video'] = False

        # Add post to list
        channel_info['recent_posts'].append(post_data)

    return channel_info
|
||||
|
||||
|
||||
def print_channel_info(channel_info):
    """Pretty-print a channel-info dict produced by get_telegram_channel_info."""
    if not channel_info:
        print("Could not retrieve channel information.")
        return

    # Channel header block.
    print(f"TELEGRAM CHANNEL: @{channel_info['username']}")
    print(f"Name: {channel_info['channel_name']}")
    print(f"Subscribers: {channel_info['subscriber_count']}")
    print(f"URL: {channel_info['url']}")

    print("\nDescription:")
    print(channel_info['description'])

    posts = channel_info['recent_posts']
    print(f"\nRecent Posts ({len(posts)} found):")
    for index, post in enumerate(posts, 1):
        print(f"\nPost {index}:")
        print(f"  Date: {post.get('date', 'Unknown')}")
        print(f"  Views: {post.get('views', 'Unknown')}")

        attachments = [
            label
            for flag, label in (('has_photo', 'Photo'), ('has_video', 'Video'))
            if post.get(flag)
        ]
        print(f"  Media: {', '.join(attachments) if attachments else 'None'}")

        body = post.get('formatted_text', '')
        if body:
            # Turn <br> tags into newlines, then collapse newline runs
            # before building a single-line preview.
            body = re.sub(r'<br\s*/?>', '\n', body)
            body = re.sub(r'\n+', '\n', body)

            preview = body[:150] + "..." if len(body) > 150 else body
            preview = re.sub(r'\s+', ' ', preview)
            print(f"  Content: {preview}")
|
||||
|
||||
|
||||
# Interactive entry point: prompt for a channel and print its scraped summary.
if __name__ == "__main__":
    channel_name = input("Enter Telegram channel username (with or without @): ")

    print(f"\nFetching information for {channel_name}...")
    channel_info = get_telegram_channel_info(channel_name)

    print_channel_info(channel_info)
|
Loading…
Reference in New Issue
Block a user