Skip to content

Commit

Permalink
feat set get request header cookie
Browse files Browse the repository at this point in the history
  • Loading branch information
BarisYazici committed Mar 20, 2023
1 parent 0cda468 commit df2d101
Show file tree
Hide file tree
Showing 2 changed files with 26 additions and 3 deletions.
7 changes: 7 additions & 0 deletions config.yaml.dist
Original file line number Diff line number Diff line change
Expand Up @@ -191,3 +191,10 @@ apprise:
# For websites like idealista.it, there are anti-crawler measures that can be
# circumvented using proxies.
# use_proxy_list: True

# If immobilienscout24 flags your requests as coming from a bot, you can send
# the cookie of your logged-in account with each request. To get it, go to the
# immobilienscout24.de website, log in, open the developer tools (F12), go to
# the "Network" tab, then "Cookies", and copy the value of the "reese84" cookie.
immoscout_cookie: ""
22 changes: 19 additions & 3 deletions flathunter/crawl_immobilienscout.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def __init__(self, config):
self.driver = None
self.checkbox = None
self.afterlogin_string = None

self.set_cookie_enabled = "immoscout_cookie" in self.config
if config.captcha_enabled():
self.checkbox = config.get_captcha_checkbox()
self.afterlogin_string = config.get_captcha_afterlogin_string()
Expand Down Expand Up @@ -147,18 +147,34 @@ def extract_entry_from_javascript(self, entry):
'rooms': str(entry.get("numberOfRooms", ''))
}

def set_cookie(self):
    """Set the ``Cookie`` request header to identify as a logged-in user.

    Reads the ``immoscout_cookie`` config entry (the value of the site's
    ``reese84`` cookie) and stores it in ``self.HEADERS`` as a single
    ``name=value`` cookie pair. If the configured value is empty, the
    headers are left untouched.
    """
    cookie_value = self.config["immoscout_cookie"]
    if not cookie_value:
        # The sample config ships an empty placeholder; an empty "reese84="
        # cookie would not identify anyone, so send nothing instead.
        return
    # A Cookie header carries "name=value" pairs. Python f-strings interpolate
    # with "{...}" only, so a leading "$" would end up in the header verbatim.
    # NOTE(review): HEADERS is mutated in place; if it is a dict shared with
    # other crawler classes, they would send this cookie too - confirm.
    self.HEADERS['Cookie'] = f'reese84={cookie_value}'

# pylint: disable=too-many-arguments
def get_soup_from_url(self,
                      url,
                      driver=None,
                      checkbox=None,
                      afterlogin_string=None,
                      set_cookie=False):
    """Fetch ``url`` through the parent implementation, optionally with the login cookie.

    When ``set_cookie`` is truthy, ``self.set_cookie()`` is called first so
    the cookie header is in place before the request. All remaining
    arguments are forwarded unchanged, in the same positional order, to the
    parent class's ``get_soup_from_url``, whose result is returned.
    """
    if set_cookie:
        self.set_cookie()
    fetch_from_parent = super().get_soup_from_url
    return fetch_from_parent(url, driver, checkbox, afterlogin_string)

def get_page(self, search_url, driver=None, page_no=None):
    """Apply a page number to a formatted search URL and fetch the exposes at that page.

    ``search_url`` is a ``str.format`` template with one positional
    placeholder, which is filled with ``page_no``. The request goes through
    ``self.get_soup_from_url`` using the captcha settings stored on the
    instance (``self.checkbox``, ``self.afterlogin_string``), and the login
    cookie is sent when ``self.set_cookie_enabled`` is truthy.

    Returns whatever ``self.get_soup_from_url`` returns for the page URL.
    """
    page_url = search_url.format(page_no)
    # Each keyword is passed exactly once; passing ``afterlogin_string`` twice
    # would be a syntax error.
    return self.get_soup_from_url(
        page_url,
        driver=driver,
        checkbox=self.checkbox,
        afterlogin_string=self.afterlogin_string,
        set_cookie=self.set_cookie_enabled,
    )

def get_expose_details(self, expose):
"""Loads additional details for an expose by processing the expose detail URL"""
soup = self.get_soup_from_url(expose['url'])
soup = self.get_soup_from_url(expose['url'], self.set_cookie_enabled)
date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
expose['from'] = datetime.datetime.now().strftime("%2d.%2m.%Y")
if date is not None:
Expand Down

0 comments on commit df2d101

Please sign in to comment.