-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhandle_extractor.py
54 lines (47 loc) · 2.18 KB
/
handle_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import requests
from bs4 import BeautifulSoup
import re
import json
import logging
def _handle_from_initial_data(script_text):
    """Return the channel handle embedded in a ``ytInitialData`` script, or None.

    Args:
        script_text: Text of a ``<script>`` element expected to contain
            ``var ytInitialData = {...};``.

    Returns:
        The handle without the leading ``/@`` taken from the first tab whose
        ``canonicalBaseUrl`` starts with ``/@``; ``None`` when the marker, the
        JSON payload, or such a tab cannot be found.
    """
    _, marker, payload = script_text.partition('var ytInitialData = ')
    if not marker:
        return None
    try:
        # raw_decode parses the first JSON value and ignores whatever follows
        # it (";", newlines, further statements), unlike a manual rstrip.
        data, _ = json.JSONDecoder().raw_decode(payload.lstrip())
    except json.JSONDecodeError:
        return None

    try:
        tabs = data['contents']['twoColumnBrowseResultsRenderer']['tabs']
    except (KeyError, TypeError):
        return None
    if not isinstance(tabs, list):
        return None

    for tab in tabs:
        try:
            base_url = (
                tab['tabRenderer']['endpoint']['browseEndpoint']['canonicalBaseUrl']
            )
        except (KeyError, TypeError):
            # A tab of another kind or with missing fields must not stop the
            # search; a later tab may still carry the handle.
            continue
        if isinstance(base_url, str) and base_url.startswith('/@'):
            return base_url[2:]  # Drop the leading '/@'
    return None


def get_youtube_handle_from_url(url, timeout=10):
    """Extract the channel handle from a YouTube URL.

    URLs already in handle form (``youtube.com/@name``) are resolved without
    any network access. Any other URL is fetched and the handle is read from
    the ``ytInitialData`` JSON embedded in the page.

    Args:
        url: The YouTube channel URL.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        The handle without the leading ``@``, or ``None`` when it cannot be
        determined (including when the page cannot be fetched).
    """
    # Handles may contain periods as well as word characters and hyphens.
    handle_match = re.search(r'youtube\.com/@([\w.-]+)', url)
    if handle_match:
        return handle_match.group(1)

    # Not in handle form: fetch the page and scrape the handle from it.
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # One unreachable or malformed URL should be reported as "not found"
        # rather than abort a whole batch of lookups.
        logging.warning("Could not fetch %s: %s", url, exc)
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    for script in soup.find_all("script"):
        script_text = script.string
        if script_text and 'var ytInitialData' in script_text:
            handle = _handle_from_initial_data(script_text)
            if handle:
                return handle
    return None
def extract_handles_from_urls(urls):
    """Resolve each URL to its YouTube handle and join the results with commas.

    Args:
        urls: Iterable of YouTube channel URL strings. Surrounding whitespace
            is ignored and blank entries are skipped.

    Returns:
        A comma-separated string of the handles that were found, in input
        order. URLs whose handle could not be determined are reported on
        stdout and left out of the result.
    """
    handles = []
    for url in urls:
        cleaned_url = url.strip()
        if not cleaned_url:
            # Blank entries (e.g. from a trailing comma in the input) are not
            # URLs; passing them on would trigger an invalid-URL request.
            continue
        handle = get_youtube_handle_from_url(cleaned_url)
        if handle:
            handles.append(handle)
        else:
            print(f"Handle not found for URL: {url}")
    return ','.join(handles)
# Example list of YouTube channel URLs, comma-separated
channel_urls = "https://www.youtube.com/@handle,https://www.youtube.com/user/username,https://www.youtube.com/c/username,https://www.youtube.com/channel/UC_CHANNEL_ID"
# Split the URLs into a list
channel_urls_list = channel_urls.split(',')


def main():
    """Resolve the example channel URLs and print their handles, comma-separated."""
    handles_list = extract_handles_from_urls(channel_urls_list)
    print(handles_list)


# Only fetch pages and print when run as a script, so importing this module
# has no network or stdout side effects.
if __name__ == "__main__":
    main()