-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
121 lines (87 loc) · 3.51 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
# Selenium configuration: a single Chrome session is created here, at module
# import time, and shared through the module-level ``driver`` by the parser
# functions in this file.
options = webdriver.ChromeOptions()
# Optional Chrome flags, currently disabled:
# options.add_argument('--ignore-certificate-errors')
# options.add_argument('--incognito')
# Run Chrome headless so no browser window is opened on every run.
options.add_argument('--headless')
# NOTE(review): hard-coded Windows path to the chromedriver binary; it has to
# be adjusted on any machine where chromedriver lives elsewhere.
service = Service("C:/Program Files/chromedriver.exe")
# NOTE(review): nothing in the visible code calls driver.quit(); confirm the
# browser process is shut down somewhere.
driver = webdriver.Chrome(service=service, options=options)
# Base URL that scraped Trendyol category hrefs are appended to.
base_url_trendyol = 'https://www.trendyol.com/en/'
def TrendyolParser():
    """Open the first clothing sub-category of Trendyol's men's campaign page.

    Loads the men's campaign list in the shared Selenium ``driver``, parses
    the rendered HTML, collects the links found in the second
    ``category-menu-wrapper`` block (the one this scraper treats as the
    clothing menu) and navigates the driver to the first usable link.

    Returns:
        None. The only effect is navigating the shared ``driver``. When the
        menu block or a usable link cannot be found, a message is printed
        and the function returns without navigating any further.
    """
    url_male = 'https://www.trendyol.com/en/campaign/list/men'

    driver.get(url_male)
    # Crude fixed wait so the JavaScript-rendered menu has time to appear.
    time.sleep(1)

    # The standard-library parser is used, so no extra parser package
    # (e.g. lxml) is required.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    menu_wrappers = soup.find_all('div', class_='category-menu-wrapper')

    # The clothing menu is taken from index 1; if the page did not render
    # (or its layout changed) there may be fewer wrappers than that.
    if len(menu_wrappers) < 2:
        print(f"Trendyol: clothing menu not found on {url_male}")
        return
    clothing_subsection = menu_wrappers[1]

    # href=True skips anchors that have no href attribute at all, which
    # would otherwise raise KeyError. Empty hrefs and "sr?" search links are
    # filtered out as well.
    clothing_links = [
        anchor['href']
        for anchor in clothing_subsection.find_all('a', href=True)
        if anchor['href'] and not str(anchor['href']).startswith('sr?')
    ]
    if not clothing_links:
        print(f"Trendyol: no clothing links found on {url_male}")
        return

    time.sleep(1)
    # Visit the first sub-category through Selenium as well.
    driver.get(base_url_trendyol + clothing_links[0])
    # Pause before returning so the page can finish loading.
    time.sleep(5)
def boyner_parser(link, file_count, page_size=45):
    """Append the product-image URLs of one Boyner category to a text file.

    The category page is loaded in the shared Selenium ``driver`` to read the
    total product count, then every listing page is visited in turn and the
    ``data-imgsrc`` attribute of each ``<img>`` inside a ``div.imgDepot`` is
    appended, one per line, to ``<file_count>.txt``.

    Args:
        link: Category listing URL. The 1-based page number is appended
            directly to it, so it is expected to end with ``/``.
        file_count: Value used as the base name of the output file.
        page_size: Number of products shown per listing page (45 by default).

    Returns:
        None. If the product counter cannot be found or parsed, a message is
        printed and nothing is written.

    Raises:
        ValueError: If ``page_size`` is not a positive number.
    """
    if page_size <= 0:
        raise ValueError("page_size must be positive")

    driver.get(link)
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # The last span.grey on the page holds the total, wrapped in parentheses.
    grey_spans = soup.find_all("span", class_='grey')
    if not grey_spans:
        print(f"Boyner: product counter not found on {link}")
        return
    # Keep only the digits so that the parentheses, stray whitespace or a
    # thousands separator cannot make int() fail.
    counter_digits = ''.join(
        ch for ch in grey_spans[-1].get_text() if ch.isdecimal()
    )
    if not counter_digits:
        print(f"Boyner: product counter on {link} contains no number")
        return
    total_products_num = int(counter_digits)

    # Ceiling division: an exact multiple of page_size must not produce an
    # extra, empty trailing page.
    page_total = (total_products_num + page_size - 1) // page_size
    print(total_products_num)
    print(page_total)

    file_name = str(file_count) + '.txt'
    for page in range(1, page_total + 1):
        page_url = link + str(page)
        driver.get(page_url)
        print(page_url)
        time.sleep(1)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        product_list_depot = soup.find_all("div", class_='imgDepot')
        print(f"------------------------PAGE {page}------------------------")
        # Append page by page so results already collected are kept on disk
        # even if a later page fails.
        with open(file_name, 'a', encoding='utf-8') as f:
            for item in product_list_depot:
                for img_section in item.find_all('img'):
                    # Some images may lack the lazy-load attribute; skip them
                    # instead of raising KeyError.
                    img_src = img_section.get('data-imgsrc')
                    if img_src:
                        f.write(img_src + '\n')
# Boyner category listing URLs, built from the category slugs; every URL ends
# with a trailing '/'.
boyner_url_list = [
    f'https://www.boyner.com.tr/{category_slug}/'
    for category_slug in (
        'erkek-t-shirt-modelleri-c-200107',
        'erkek-gomlek-modelleri-c-200104',
        'erkek-pantolon-modelleri-c-200105',
        'erkek-jean-c-200109',
        'erkek-ceket-yelek-modelleri-c-200103',
        'erkek-sort-c-3302071',
    )
]