From df2d1016c44f044377f702f5423afc9a68d5f07d Mon Sep 17 00:00:00 2001
From: BarisYazici
Date: Tue, 14 Mar 2023 00:40:23 +0100
Subject: [PATCH] feat set get request header cookie

Add an optional `immoscout_cookie` setting. When it holds a non-empty
value, the immobilienscout crawler sends it as the `reese84` request
cookie for both search result pages and expose detail pages.

---
Review notes: the hunks below were edited by hand after generation
(cookie header format, keyword argument in get_expose_details, non-empty
check, trailing newline in config.yaml.dist). Hunk headers and the
diffstat were recomputed; the post-image blob ids on the `index` lines
are stale.

 config.yaml.dist                    |  7 +++++++
 flathunter/crawl_immobilienscout.py | 26 +++++++++++++++++++++++---
 2 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/config.yaml.dist b/config.yaml.dist
index 0cf29ccb..32a89360 100644
--- a/config.yaml.dist
+++ b/config.yaml.dist
@@ -191,3 +191,10 @@ apprise:
 # For websites like idealista.it, there are anti-crawler measures that can be
 # circumvented using proxies.
 # use_proxy_list: True
+
+# If you are having bot detection issues with immobilienscout24,
+# you can set the cookie that you get from your logged-in account.
+# Go to the immobilienscout24.de website, log in, and then in the developer tools
+# (F12) go to the "Network" tab, then "Cookies" and copy the value of the
+# "reese84" cookie.
+immoscout_cookie: ""
diff --git a/flathunter/crawl_immobilienscout.py b/flathunter/crawl_immobilienscout.py
index 1c98ef2f..711525a9 100644
--- a/flathunter/crawl_immobilienscout.py
+++ b/flathunter/crawl_immobilienscout.py
@@ -31,7 +31,9 @@ def __init__(self, config):
         self.driver = None
         self.checkbox = None
         self.afterlogin_string = None
-
+        # The shipped default is an empty string, so also require a non-empty value.
+        self.set_cookie_enabled = ("immoscout_cookie" in self.config
+                                   and bool(self.config["immoscout_cookie"]))
         if config.captcha_enabled():
             self.checkbox = config.get_captcha_checkbox()
             self.afterlogin_string = config.get_captcha_afterlogin_string()
@@ -147,18 +149,36 @@ def extract_entry_from_javascript(self, entry):
             'rooms': str(entry.get("numberOfRooms", ''))
         }
 
+    def set_cookie(self):
+        """Sets request header cookie parameter to identify as a logged in user"""
+        # Cookie header pairs are written as name=value.
+        self.HEADERS['Cookie'] = f'reese84={self.config["immoscout_cookie"]}'
+
+    # pylint: disable=too-many-arguments
+    def get_soup_from_url(self,
+                          url,
+                          driver=None,
+                          checkbox=None,
+                          afterlogin_string=None,
+                          set_cookie=False):
+        """Fetches a URL, setting the configured cookie header first when set_cookie is true"""
+        if set_cookie:
+            self.set_cookie()
+        return super().get_soup_from_url(url, driver, checkbox, afterlogin_string)
+
     def get_page(self, search_url, driver=None, page_no=None):
         """Applies a page number to a formatted search URL and fetches the exposes at that page"""
         return self.get_soup_from_url(
             search_url.format(page_no),
             driver=driver,
             checkbox=self.checkbox,
-            afterlogin_string=self.afterlogin_string
+            afterlogin_string=self.afterlogin_string,
+            set_cookie=self.set_cookie_enabled
         )
 
     def get_expose_details(self, expose):
         """Loads additional details for an expose by processing the expose detail URL"""
-        soup = self.get_soup_from_url(expose['url'])
+        soup = self.get_soup_from_url(expose['url'], set_cookie=self.set_cookie_enabled)
         date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
         expose['from'] = datetime.datetime.now().strftime("%2d.%2m.%Y")
         if date is not None: