-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutility_functions.py
131 lines (111 loc) · 5 KB
/
utility_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import requests
from bs4 import BeautifulSoup as bs
# for proxies
from fake_useragent import UserAgent
from urllib.request import Request, urlopen
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from multiprocessing import Queue
import os , signal , sys , time, random
def create_selenium_driver():
    """Build and return a Chrome WebDriver configured for headless use.

    Returns:
        A ``webdriver.Chrome`` instance started with the ``--headless``,
        ``--disable-gpu`` and ``--no-sandbox`` arguments, after
        ``minimize_window()`` has been called on it.
    """
    options = webdriver.ChromeOptions()
    # Same three switches, added in the same order as before.
    for flag in ('--headless', '--disable-gpu', '--no-sandbox'):
        options.add_argument(flag)

    browser = webdriver.Chrome(options=options)
    browser.minimize_window()
    return browser
# function to get bookname from passed url
def get_bookname_from_url(url):  # works for novelhi/lightnovelhub; TBD for other websites
    """Extract the book name from a novel URL.

    The name is the fifth ``/``-separated segment of *url* (index 4) with
    every ``-`` replaced by a space, e.g.
    ``https://host/section/my-book`` -> ``"my book"``.

    Args:
        url: Novel URL as a string.

    Returns:
        The book name with hyphens turned into spaces.

    Raises:
        SystemExit: If *url* is not a string or has fewer than five
            ``/``-separated segments. The exit status is 1.
    """
    # Only the parsing step can fail on a malformed URL, so keep the try small.
    try:
        bookname = url.split('/')[4].replace('-', ' ')
    except (AttributeError, IndexError, TypeError) as e:
        print(f"Error: {e}")
        print("Novel URL is not in correct format. Please try again.")
        # sys.exit instead of the site-provided exit() helper, which is meant
        # for the interactive shell; non-zero status because this is an error.
        sys.exit(1)

    print(f"\n\nBookname : {bookname}")
    return bookname
# image download functions
def download_image_novelhi(url, save_path):
    """Download the image at *url* and write it to *save_path*.

    The body is streamed to disk in chunks. On any status code other than
    200 nothing is written and a failure message is printed instead.

    Args:
        url: Address of the image to fetch.
        save_path: Destination file path; opened in binary write mode.

    Raises:
        requests.RequestException: If the request fails or times out.
        OSError: If *save_path* cannot be opened or written.
    """
    # The context manager releases the streamed connection on every path;
    # the timeout keeps an unresponsive server from blocking forever.
    with requests.get(url, stream=True, timeout=30) as response:
        if response.status_code != 200:
            print(f"\n\nFailed to download image. Status code: {response.status_code}")
            return

        with open(save_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)

    print(f"\n\nImage downloaded and saved at: {save_path}")
def download_image_lightnovelhub(url, save_path):
    """Save a screenshot of the third ``<img>`` element on *url*.

    Opens the page in a driver from ``create_selenium_driver()``, takes the
    element at index 2 of all ``img`` tags and writes its screenshot to
    *save_path*.

    Args:
        url: Page to open.
        save_path: File path the element screenshot is written to.

    Raises:
        IndexError: If the page exposes fewer than three ``img`` elements.
    """
    driver = create_selenium_driver()
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        # Find the image elements by tag name, take the 3rd one and save it.
        image = driver.find_elements(By.TAG_NAME, 'img')[2]
        image.screenshot(save_path)
        print(f"\n\nImage downloaded and saved at: {save_path}")
    finally:
        # Always close the browser, even when the lookup above fails, so no
        # browser process is left behind.
        driver.quit()
##* Get last chapter from website
#novelhi
def get_nh_lastchapter(url):
    """Return the last chapter identifier shown on a novelhi index page.

    Reads the text of the first link in ``#indexList`` and returns whatever
    follows the first ``"Chapter "`` in it.

    Args:
        url: Page to open.

    Returns:
        The substring after ``"Chapter "`` (up to the next ``"Chapter "``,
        if any), as a string.

    Raises:
        IndexError: If the link text does not contain ``"Chapter "``.
    """
    driver = create_selenium_driver()
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        link_text = driver.find_element(
            By.CSS_SELECTOR,
            "#indexList > li:nth-child(1) > span:nth-child(1) > a:nth-child(1)",
        ).text
    finally:
        # The driver was previously never closed; quit it on every path so
        # each call does not leave a browser process running.
        driver.quit()
    return link_text.split("Chapter ")[1]
#lightnovelhub
def get_lnh_lastchapter(url):
    """Return the text of the first ``.header-stats`` counter on *url*.

    Args:
        url: Page to open.

    Returns:
        The text of ``.header-stats > span:nth-child(1) > strong:nth-child(1)``
        as a string (presumably the chapter count -- confirm against the
        lightnovelhub page layout).
    """
    driver = create_selenium_driver()
    try:
        driver.get(url)
        driver.implicitly_wait(10)
        # Read the text before quitting: the element is unusable afterwards.
        return driver.find_element(
            By.CSS_SELECTOR,
            ".header-stats > span:nth-child(1) > strong:nth-child(1)",
        ).text
    finally:
        # The driver was previously never closed; quit it on every path so
        # each call does not leave a browser process running.
        driver.quit()
#closing parent process if window gets closed
def sigterm_handler(si, frame):
    """Print a shutdown notice and exit the current process.

    The ``(si, frame)`` signature matches what ``signal.signal`` handlers
    receive; where it is registered is not visible in this block.

    Args:
        si: Signal number (unused).
        frame: Current stack frame (unused).

    Raises:
        SystemExit: Always, via ``sys.exit()`` with no exit status.
    """
    notice = "UI process ended, terminating parent process"
    print(notice)
    sys.exit()
### Proxy functions
# Function to extract proxies from https://www.sslproxies.org/ to be used when getting a website request
def generate_proxies(proxy_list):
    """Refill *proxy_list* in place with proxies scraped from sslproxies.org.

    The list is cleared first, so calling this repeatedly does not
    accumulate duplicates. Each appended entry is a dict of the form
    ``{'ip': <first cell>, 'port': <second cell>}`` taken from the rows of
    the page's proxy table.

    Args:
        proxy_list: List to empty and repopulate; mutated in place.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        RuntimeError: If the expected proxy table is not found in the page.
    """
    ua = UserAgent()
    # Empty the list so repeated calls do not leave duplicates behind.
    proxy_list.clear()

    proxies_req = Request('https://www.sslproxies.org/')
    proxies_req.add_header('User-Agent', ua.random)
    # Close the HTTP response deterministically and bound the wait time.
    with urlopen(proxies_req, timeout=30) as response:
        proxies_doc = response.read().decode('utf8')

    soup = bs(proxies_doc, 'html.parser')
    proxies_table = soup.find('table', class_='table table-striped table-bordered')
    if proxies_table is None or proxies_table.tbody is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise RuntimeError("Proxy table not found on https://www.sslproxies.org/")

    # Store every proxy row in the caller's list.
    for row in proxies_table.tbody.find_all('tr'):
        td = row.find_all('td')
        if len(td) < 2:
            # Skip rows that lack both an ip and a port cell.
            continue
        proxy_list.append({
            'ip': td[0].string,
            'port': td[1].string,
        })
# function to request a page using proxies
def _to_requests_proxies(proxy):
    """Convert an ``{'ip', 'port'}`` entry into a ``requests`` proxies mapping."""
    if isinstance(proxy, dict) and 'ip' in proxy and 'port' in proxy:
        address = f"http://{proxy['ip']}:{proxy['port']}"
        return {'http': address, 'https': address}
    # Anything else is passed through unchanged.
    return proxy


def get_request_page(url, proxies):
    """Fetch *url* through randomly chosen proxies until one succeeds.

    A random entry of *proxies* is tried on each attempt with a random
    User-Agent. If the request raises, or the response body starts with
    ``"Too"``, that entry is removed from *proxies* (mutating the caller's
    list), the function sleeps 5 seconds and tries another one.

    Args:
        url: Address to request.
        proxies: List of proxy entries, e.g. the ``{'ip', 'port'}`` dicts
            built by ``generate_proxies``. Mutated in place.

    Returns:
        The first ``requests.Response`` whose text does not start with
        ``"Too"``.

    Raises:
        StopIteration: When *proxies* is (or becomes) empty.
    """
    ua = UserAgent()
    original_proxy_count = len(proxies)

    while True:
        if not proxies:
            print(f"Max blocked proxies reached, getting new ones ({original_proxy_count})")
            raise StopIteration

        proxy = random.choice(proxies)
        headers = {'User-Agent': ua.random}

        try:
            # requests selects a proxy by URL scheme key, so an {'ip', 'port'}
            # dict passed as-is would be ignored and no proxy would be used.
            response = requests.get(
                url,
                headers=headers,
                proxies=_to_requests_proxies(proxy),
                timeout=30,
            )
        except requests.RequestException:
            # Only request failures drop the proxy; KeyboardInterrupt and
            # SystemExit are no longer swallowed as with the bare except.
            proxies.remove(proxy)
            print(f"Errore durante la richiesta. Cambio proxy... (numero di proxy rimanenti alla sospensione dell'esecuzione: {len(proxies)})")
            time.sleep(5)  # Wait 5 seconds before trying a new proxy
            continue

        if response.text.startswith("Too"):
            proxies.remove(proxy)
            print(f"Website blocked the request, changing proxy... (numero di proxy rimanenti: {len(proxies)})")
            time.sleep(5)  # Wait 5 seconds before trying a new proxy
            continue

        # Got a usable answer: hand the response object back to the caller.
        return response