D_scraper4.pyw
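
A Selenium + BeautifulSoup scraper: it reads a target URL and CSS class/tag pairs from a local SQLite database (test.db), waits for the matching elements on the rendered page, and appends their text to data_file.csv.
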
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import sqlite3
from bs4 import BeautifulSoup as Soup


def to_do():
    # vars...
    conn = sqlite3.connect(r'test.db')
    csv_file_location = r"data_file.csv"
    user_agent = ('Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/80.0.3987.132 Safari/537.36')
    driver_exe = 'chromedriver'
    options = Options()
    options.add_argument("--headless")
    options.add_argument(f'user-agent={user_agent}')
    options.add_argument("--disable-web-security")
    options.add_argument("--allow-running-insecure-content")
    options.add_argument("--allow-cross-origin-auth-prompt")
    cur = conn.cursor()

    # The urls table stores the target URL in its id column. The original
    # looped over fetchall() with `pass`, so only the last row survived,
    # then round-tripped that row tuple through str() and stripped
    # "(", ")", "," and quotes; indexing the row directly is equivalent.
    cur.execute("SELECT id FROM urls")
    rows = cur.fetchall()
    url = rows[-1][0]
    print(url)

    # Selenium 4 removed the executable_path argument; the driver path is
    # passed through a Service object instead.
    driver = webdriver.Chrome(service=Service(driver_exe), options=options)
    driver.get(url)
cur.execute("SELECT class FROM classes")
all_values = tuple()
for class_Names in cur.fetchall():
all_values += class_Names
print(all_values)
# changing tupled string by removing "(", ")", ","
one, two, three, four = all_values
one = one.replace(' ', '.')
print(one)
two = two.replace(' ', '.')
print(two)
three = three.replace(' ', '.')
print(three)
four = four.replace(' ', '.')
print(four)
cur.execute("SELECT tag FROM classes;")
all_values1 = tuple()
for tags in cur.fetchall():
all_values1 += tags
print(all_values1)
one1_, two2_, three3_, four4_ = all_values1
x = one1_ + one
print(x)
y = two2_ + two
print(y)
z = three3_ + three
print(z)
z1 = four4_ + four
print(z1)
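    # Worked example (hypothetical row values, not taken from the original
    # data): a classes row with tag='h3' and class='story title' yields the
    # selector 'h3.story.title', matching <h3> elements carrying both classes.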
    # For each selector: wait up to 5 s for a matching element to be present,
    # then parse the rendered page source and collect every match. This
    # collapses the four copy-pasted wait/parse/select blocks of the original.
    columns = {}
    for i, sel in enumerate(selectors, start=1):
        WebDriverWait(driver, 5).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, sel))
        )
        page = Soup(driver.page_source, features='html.parser')
        columns[f"Title{i}"] = pd.Series(
            [el.text.strip() for el in page.select(sel)])
    df = pd.DataFrame(columns)
    # Drop rows with any missing column, then append to the CSV (note that
    # mode='a' also re-writes the header row on every run).
    df = df.dropna()
    df.to_csv(csv_file_location, index=False, mode='a', encoding='utf-8')

    # The original closed f in a finally block, which raised NameError
    # whenever open() itself failed; a context manager covers both paths.
    try:
        with open(csv_file_location):
            print("Done !!!\n" * 3)
    except IOError:
        print("File not accessible")
    driver.quit()
    #print("Unable to scrape...")
    cur.execute("""DROP TABLE classes;""")
    cur.execute("""DROP TABLE urls;""")
    conn.commit()
    conn.close()


print("start")
# The listing defined to_do() but never invoked it; call it here so the
# script actually runs.
to_do()