feat set get request header cookie

flathunters · Mar 20, 2023 · df2d101 · df2d101
1 parent 0cda468
commit df2d101
Show file tree

Hide file tree

Showing 2 changed files with 26 additions and 3 deletions.
diff --git a/config.yaml.dist b/config.yaml.dist
@@ -191,3 +191,10 @@ apprise:
 # For websites like idealista.it, there are anti-crawler measures that can be
 # circumvented using proxies.
 # use_proxy_list: True
+
+# If you are having bot detection issues with immobilienscout24,
+# you can set the cookie that you get from your logged-in account.
+# Go to the immobilienscout24.de website, log in, and then in the developer tools
+# (F12) go to the "Network" tab, then "Cookies", and copy the value of the
+# "reese84" cookie.
+immoscout_cookie: ""
diff --git a/flathunter/crawl_immobilienscout.py b/flathunter/crawl_immobilienscout.py
@@ -31,7 +31,7 @@ def __init__(self, config):
         self.driver = None
         self.checkbox = None
         self.afterlogin_string = None
-
+        self.set_cookie_enabled = "immoscout_cookie" in self.config
         if config.captcha_enabled():
             self.checkbox = config.get_captcha_checkbox()
             self.afterlogin_string = config.get_captcha_afterlogin_string()
@@ -147,18 +147,34 @@ def extract_entry_from_javascript(self, entry):
             'rooms': str(entry.get("numberOfRooms", ''))
         }
 
+    def set_cookie(self):
+        """Sets the reese84 request cookie so requests identify as a logged in user"""
+        self.HEADERS['Cookie'] = f'reese84={self.config["immoscout_cookie"]}'  # name=value pair
+
+    # pylint: disable=too-many-arguments
+    def get_soup_from_url(self, url, driver=None, checkbox=None,
+                          afterlogin_string=None, set_cookie=False):
+        """Fetches the soup for a URL, optionally setting the login cookie header first"""
+        if set_cookie:
+            # Set the header before delegating so the parent fetch can use it
+            self.set_cookie()
+        return super().get_soup_from_url(
+            url, driver=driver, checkbox=checkbox,
+            afterlogin_string=afterlogin_string)
+
     def get_page(self, search_url, driver=None, page_no=None):
         """Applies a page number to a formatted search URL and fetches the exposes at that page"""
         return self.get_soup_from_url(
             search_url.format(page_no),
             driver=driver,
             checkbox=self.checkbox,
-            afterlogin_string=self.afterlogin_string
+            afterlogin_string=self.afterlogin_string,
+            set_cookie=self.set_cookie_enabled
         )
 
     def get_expose_details(self, expose):
         """Loads additional details for an expose by processing the expose detail URL"""
-        soup = self.get_soup_from_url(expose['url'])
+        soup = self.get_soup_from_url(expose['url'], set_cookie=self.set_cookie_enabled)
         date = soup.find('dd', {"class": "is24qa-bezugsfrei-ab"})
         expose['from'] = datetime.datetime.now().strftime("%2d.%2m.%Y")
         if date is not None: