script.py
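"""Scrape a single web page for links to .mkv files and download them.

The script prompts for a target URL and an optional domain filter, collects
every link on the page that points to an .mkv file, and downloads each one
into ./downloads with a tqdm progress bar.
"""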
import requests
from bs4 import BeautifulSoup
import os
import re
from urllib.parse import urljoin, urlparse
from tqdm import tqdm
def is_valid_url(url):
    """
    Checks whether a URL is valid, i.e. has both a scheme and a network location.
    """
    try:
        result = urlparse(url)
        return all([result.scheme, result.netloc])
    except ValueError:
        return False

def find_mkv_links(url, domain_filter=None):
    """
    Finds all potential MKV links on a page.

    Args:
        url (str): The URL of the page to scrape.
        domain_filter (str): If not None, only links whose URL contains this string are included.

    Returns:
        list: A list of MKV file URLs.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {url}. Error: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    mkv_links = []
    for link in soup.find_all('a', href=True):
        href = link['href']
        # Apply the optional domain filter.
        if domain_filter is not None and domain_filter not in href:
            continue
        # Use urljoin to resolve relative paths against the page URL.
        absolute_url = urljoin(url, href)
        if absolute_url.endswith('.mkv') or '.mkv?' in absolute_url:  # also catch query strings like ?dl=1
            mkv_links.append(absolute_url)
    return mkv_links

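# Example (hypothetical URLs): on a page at "https://example.com/videos/" containing
# <a href="ep1.mkv">, find_mkv_links would return ["https://example.com/videos/ep1.mkv"],
# since urljoin resolves the relative href against the page URL.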
def download_file(url, save_path):
    """
    Downloads a file from a URL.

    Args:
        url (str): The URL of the file to download.
        save_path (str): The directory to save the file in.
    """
    try:
        response = requests.get(url, stream=True)
        response.raise_for_status()
        file_name = os.path.basename(urlparse(url).path)  # Extract the file name from the URL
        if not file_name:
            print(f"Could not determine filename for {url}")
            return
        file_path = os.path.join(save_path, file_name)

        # Get the total file size for the progress bar.
        total_size_in_bytes = int(response.headers.get('content-length', 0))
        block_size = 8192  # 8 KB

        print(f"Starting download: {url} -> {file_path}")
        with tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True, desc=file_name) as progress_bar:
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=block_size):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))
        print(f"Downloaded: {url} -> {file_path}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}. Error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred downloading {url}: {e}")

def main():
    """
    Main function to drive the MKV downloader.
    """
    while True:
        target_url = input("Enter the URL to scrape for MKV files: ").strip()
        if is_valid_url(target_url):
            break
        else:
            print("Invalid URL. Please enter a valid URL.")

    # Optional domain filter.
    domain_filter = input("Enter a domain filter (optional): ").strip()
    if domain_filter == "":
        domain_filter = None

    save_path = "./downloads"  # Change as needed
    os.makedirs(save_path, exist_ok=True)

    mkv_links = find_mkv_links(target_url, domain_filter)
    if not mkv_links:
        print("No MKV links found on the page.")
        return

    print(f"Found {len(mkv_links)} potential MKV links:")
    for link in mkv_links:
        print(link)

    for link in mkv_links:
        download_file(link, save_path)

    print("Finished processing links.")


if __name__ == "__main__":
    main()
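# To run the script (assuming requests, beautifulsoup4, and tqdm are installed):
#   python script.py
# It then prompts for the page URL and an optional domain filter, and saves files to ./downloads.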