Fixes and Features update
Fixes:
If your OS is not Windows, the script will no longer use any Windows-specific functions.

The script will now grab all media unless it is locked behind a paywall or duplicated. Any corrupt media will be exported to archive.json.

All media will download to its proper folder. (No more images in the videos folder and vice versa.)

Features:

metadata/archive.json will now contain valid and invalid posts. (This replaces links.json.)

Config Update:

ignored_keywords -
Any posts containing these words will be ignored.

text_length -
When using {text} in file_name_format, you can set a maximum character length.

boards -
Input any boards you'd like to scrape automatically.
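For reference, this is how the updated 4chan settings block in config.json looks with this commit's defaults, written here as a Python dict purely for illustration (the real file is JSON):

```python
# Defaults added to the 4chan "settings" block in config.json by this commit,
# shown as a Python dict for illustration only.
four_chan_settings = {
    "directory": "",
    "file_name_format": "{file_name}.{ext}",
    "text_length": "",       # cap on {text} in file_name_format; "" falls back to the built-in maximum
    "overwrite_files": False,
    "date_format": "%d-%m-%Y",
    "boards": [],             # e.g. ["s", "gif"] to scrape those boards automatically
    "ignored_keywords": [],   # e.g. ["ignore", "me"] skips matching posts
}
```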
SecretShell committed Oct 20, 2019
1 parent 8a00117 commit 69d83b4
Showing 7 changed files with 417 additions and 254 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -59,6 +59,14 @@ file_name_format:
Example: {date}/{text}-{file_name}.{ext}
Warning: It's important to keep a unique identifier next to .{ext}. By default it's {file_name}, but it can be {date}-{text}.{ext}

text_length:

Default = ""
Ideal = "50"
Max = "259"

When you use {text} in file_name_format, you can limit how many characters it contains by entering a number.
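
As a rough illustration of the cap (mirroring the string slicing used in modules/four_chan.py; the subject below is invented):

```python
# Illustrative only: a text_length of "50" keeps the first 50 characters of {text}.
text_length = 50
subject = "An example thread subject that is much longer than fifty characters in total"
trimmed = subject[:text_length]  # truncated before the path is built
print(trimmed)
```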

auto_site_choice:

Default = ""
@@ -101,6 +109,20 @@ multithreading:
If set to false, you will download files 1 by 1. (If you don't have fast internet, may god help you.)
I'd recommend leaving it set to true.

boards:

Default = []
Example = ["s", "gif"]

Input the names of any boards that you want to scrape automatically.

ignored_keywords:

Default = []
Example = ["ignore", "me"]

The script will ignore any content that contains these keywords.
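
A minimal sketch of how this filter behaves, modelled on the keyword check added in modules/four_chan.py (the titles below are made up):

```python
# Illustrative only: content whose title/comment contains an ignored keyword is skipped.
ignored_keywords = ["ignore", "me"]

def is_ignored(title):
    title = title.lower()
    return any(keyword in title for keyword in ignored_keywords)

print(is_ignored("Please IGNORE this thread"))  # True  -> skipped
print(is_ignored("A perfectly normal thread"))  # False -> scraped
```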



# OPTIONAL ARGUMENTS
18 changes: 12 additions & 6 deletions Start Datascraper.py
@@ -34,6 +34,7 @@
x = int(input())
site_name = site_names[x]
json_auth = json_sites[site_name]["auth"]
json_site_settings = json_sites[site_name]["settings"]
session = ""
x = ""
app_token = ""
@@ -43,22 +44,27 @@
auth_hash = json_auth['auth_hash']
x = onlyfans
session = x.create_session(user_agent, auth_id, auth_hash, app_token)
array = []
elif site_name == "justforfans":
auth_id = json_auth['phpsessid']
auth_hash = json_auth['user_hash2']
x = justforfans
session = x.create_session(user_agent, auth_id, auth_hash)
array = []
elif site_name == "4chan":
x = four_chan
session = x.create_session()
array = json_site_settings["boards"]

if not session[0]:
continue
print('Input a '+site_name+' '+session[1])
input_link = input().strip()
username = helpers.parse_links(site_name, input_link)
start_time = timeit.default_timer()
session = session[0]
result = x.start_datascraper(session, username, site_name, app_token)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
if not array:
array = [input().strip()]
for input_link in array:
username = helpers.parse_links(site_name, input_link)
start_time = timeit.default_timer()
result = x.start_datascraper(session, username, site_name, app_token)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
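
In plain terms, the change above makes the configured boards drive the scraping loop, with manual input only as a fallback. A simplified sketch of that flow (not the actual module code; the board names are examples):

```python
# Simplified sketch of the new loop: boards from config.json are scraped
# automatically; an empty list falls back to prompting the user once.
boards = ["s", "gif"]  # stands in for json_site_settings["boards"]

targets = boards if boards else [input("Input a 4chan board name: ").strip()]
for target in targets:
    # the real script calls helpers.parse_links() and x.start_datascraper() here
    print("Scraping board:", target)
```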
12 changes: 9 additions & 3 deletions config.json
@@ -15,8 +15,10 @@
"settings": {
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
"overwrite_files": true,
"date_format": "%d-%m-%Y"
"date_format": "%d-%m-%Y",
"ignored_keywords": []
}
},
"justforfans": {
@@ -27,18 +29,22 @@
"settings": {
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
"overwrite_files": true,
"date_format": "%d-%m-%Y"
"date_format": "%d-%m-%Y",
"ignored_keywords": []
}
},
"4chan": {
"auth": {},
"settings": {
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
"overwrite_files": false,
"date_format": "%d-%m-%Y",
"ignore_thread_titles": [""]
"boards": [],
"ignored_keywords": []
}
}

156 changes: 123 additions & 33 deletions modules/four_chan.py
@@ -27,6 +27,12 @@
format_path = json_settings['file_name_format']
overwrite_files = json_settings["overwrite_files"]
date_format = json_settings["date_format"]
ignored_keywords = json_settings["ignored_keywords"]
maximum_length = 240
text_length = int(json_settings["text_length"]
) if json_settings["text_length"] else maximum_length
if text_length > maximum_length:
text_length = maximum_length

max_threads = multiprocessing.cpu_count()

@@ -41,6 +47,7 @@ def start_datascraper(session, board_name, site_name, link_type=None):
print(user_id[1])
print("First time? Did you forget to edit your config.json file?")
return [False]
print("Board: " + board_name)
array = scrape_choice(board_name)
link_array = {}
if multithreading:
@@ -50,9 +57,7 @@ def start_datascraper(session, board_name, site_name, link_type=None):
threads = board_scraper(session, array[0], "")
archive_threads = board_scraper(session, array[1], "archive")
threads = threads + archive_threads
print("Scraping Threads")
threads = pool.starmap(thread_scraper,
product(threads, [board_name], [session]))
print("Original Count: "+str(len(threads)))
directory = j_directory
directory += "/"+site_name + "/" + board_name + "/"
if "/sites/" == j_directory:
@@ -62,10 +67,16 @@
else:
directory = directory

print("Scraping Threads")
threads = pool.starmap(thread_scraper,
product(threads, [board_name], [session], [directory]))
threads = [x for x in threads if x is not None]
print("Filtered Count: "+str(len(threads)))
print("Downloading Media")
pool.starmap(download_media,
product(threads, [session], [directory], [board_name]))

results = pool.starmap(download_media,
product(threads, [session], [directory], [board_name]))
count_results = str(len([x for x in threads if x is None]))
print("Valid Count: "+count_results)
# When profile is done scraping, this function will return True
return [True, link_array]

@@ -103,49 +114,128 @@ def board_scraper(session, link, category):
return threads


def thread_scraper(thread_id, board_name, session):
link = "http://a.4cdn.org/" + board_name + "/thread/" + str(
thread_id) + ".json"
def thread_scraper(thread_id, board_name, session, directory):
thread_id = str(thread_id)
link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json"
r = session.get(link)
y = json.loads(r.text)
return y

if r.status_code == 404:
return
try:
thread = json.loads(r.text)
thread_master = thread["posts"][0]
except Exception as e:
print(e, link)
return
if "archived" in thread_master:
location = "Archive"
else:
location = "Catalog"

def download_media(thread, session, directory, board_name):
thread_master = thread["posts"][0]
thread_id = str(thread_master["no"])
if "sub" in thread_master:
title = thread_master["sub"].lower()
if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
print("Removed From "+location+": ", title)
return

if "com" in thread_master:
title = thread_master["com"].lower()
if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
print("Removed From "+location+": ", title)
return
text = ""
if "sub" in thread_master:
text = thread_master["sub"]
text = thread_master["sub"][:text_length]
else:
if "com" in thread_master:
text = thread_master["com"]
text = thread_master["com"][:text_length]
text = BeautifulSoup(text, 'html.parser').get_text().replace(
"\n", " ").strip()
text = re.sub(r'[\\/*?:"<>|]', '', text)
thread["download_path"] = ""
for post in thread["posts"]:
if "name" not in post:
post["name"] = "Anonymous"
if "filename" in post:
filename = str(post["tim"])
ext = post["ext"].replace(".", "")
link = "http://i.4cdn.org/" + board_name + "/" + filename+"."+ext
filename = post["filename"]
new_directory = directory+"/"+text+" - "+thread_id+"/"
if not text:
new_directory = new_directory.replace(" - ", "")

date_object = datetime.fromtimestamp(post["time"])
new_directory = reformat(new_directory, filename, text, ext, date_object, post["name"], format_path,
date_format)
if not overwrite_files:
if os.path.isfile(new_directory):
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(new_directory)):
os.makedirs(os.path.dirname(new_directory))
with open(new_directory, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
print(link, new_directory)
og_filename = filename
download_path = os.path.dirname(reformat(
new_directory, filename, text, ext, date_object, post["name"], format_path, date_format, text_length, maximum_length))
size = len(download_path)
size2 = len(thread["download_path"])
if thread["download_path"]:
if len(download_path) < len(thread["download_path"]):
thread["download_path"] = download_path
else:
thread["download_path"] = download_path
return thread


def download_media(thread, session, directory, board_name):
try:
directory = thread["download_path"]+"/"
valid = False
for post in thread["posts"]:
if "filename" in post:
post["filename"] = re.sub(
r'[\\/*?:"<>|]', '', post["filename"])
ext = post["ext"].replace(".", "")
filename = str(post["tim"])+"."+ext
link = "http://i.4cdn.org/" + board_name + "/" + filename
filename = post["filename"]+"."+ext
download_path = directory+filename
count_string = len(download_path)
if count_string > 259:
num_sum = count_string - 259
post["filename"] = post["filename"][:50]
download_path = directory+post["filename"]+"."+ext

if not overwrite_files:
count = 1
found = False
og_filename = post["filename"]
while True:
if os.path.isfile(download_path):
remote_size = post["fsize"]
local_size = os.path.getsize(download_path)
if remote_size == local_size:
found = True
break
else:
download_path = directory+og_filename + \
" ("+str(count)+")."+ext
count += 1
continue
else:
found = False
break
if found:
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(download_path)):
os.makedirs(os.path.dirname(download_path))
with open(download_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
print(download_path)
valid = True
if valid:
os.makedirs(directory, exist_ok=True)
with open(directory+'archive.json', 'w') as outfile:
json.dump(thread, outfile)
return thread
else:
return
except Exception as e:
print("ERROR", e, directory)
return


def create_session():
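The reworked download_media() above is what prevents both re-downloads and silent overwrites: an existing local file is treated as "already downloaded" only when its size matches the remote fsize; otherwise the new file gets a " (n)" suffix. A condensed sketch of that collision logic (paths and sizes here are illustrative, not the module's exact code):

```python
import os

# Condensed sketch of the collision handling in download_media():
# return None when an identical file is already on disk, otherwise return a
# free path, appending " (n)" to avoid clobbering a different file.
def resolve_download_path(directory, filename, ext, remote_size):
    download_path = directory + filename + "." + ext
    count = 1
    while os.path.isfile(download_path):
        if os.path.getsize(download_path) == remote_size:
            return None  # same size as the remote file -> already downloaded
        download_path = directory + filename + " (" + str(count) + ")." + ext
        count += 1
    return download_path
```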
46 changes: 36 additions & 10 deletions modules/helpers.py
@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup
import re
import os
from bs4 import BeautifulSoup


def parse_links(site_name, input_link):
@@ -19,21 +20,46 @@ def parse_links(site_name, input_link):
return input_link


def reformat(directory, file_name, text, ext, date, username, format_path, date_format):
def reformat(directory, file_name, text, ext, date, username, format_path, date_format, text_length, maximum_length):
path = format_path.replace("{username}", username)
text = BeautifulSoup(text, 'html.parser').get_text().replace("\n", " ").strip()
text = BeautifulSoup(text, 'html.parser').get_text().replace(
"\n", " ").strip()
filtered_text = re.sub(r'[\\/*?:"<>|]', '', text)
path = path.replace("{text}", filtered_text)
date = date.strftime(date_format)
path = path.replace("{date}", date)
path = path.replace("{file_name}", file_name)
path = path.replace("{ext}", ext)
directory += path
count_string = len(directory)
if count_string > 259:
num_sum = count_string - 259
directory = directory.replace(filtered_text, filtered_text[:-num_sum])
return directory

directory2 = directory + path
count_string = len(directory2)
if count_string > maximum_length:
num_sum = count_string - maximum_length
directory2 = directory2.replace(
filtered_text, filtered_text[:text_length])
count_string = len(directory2)
if count_string > maximum_length:
num_sum = count_string - maximum_length
directory2 = directory2.replace(
filtered_text, filtered_text[:-num_sum])
count_string = len(directory2)
if count_string > maximum_length:
directory2 = directory
count_string = len(directory2)
if count_string > maximum_length:
num_sum = count_string - maximum_length
directory2 = directory2.replace(
filtered_text, filtered_text[:50])
count_string = len(directory2)
if count_string > maximum_length:
directory2 = directory
return directory2


def format_media_set(media_set):
x = {}
x["valid"] = []
x["invalid"] = []
for y in media_set:
x["valid"].extend(y[0])
x["invalid"].extend(y[1])
return x
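
The new format_media_set() helper above simply merges per-task (valid, invalid) result pairs into one dict, which appears to be the valid/invalid split that metadata/archive.json records (per the commit notes). A tiny usage sketch with invented data:

```python
# Invented example data: each tuple is (valid_media, invalid_media) from one task.
media_set = [
    (["image1.jpg"], []),
    (["video1.mp4"], ["paywalled_post"]),
]

merged = {"valid": [], "invalid": []}
for valid, invalid in media_set:
    merged["valid"].extend(valid)
    merged["invalid"].extend(invalid)

print(merged)  # {'valid': ['image1.jpg', 'video1.mp4'], 'invalid': ['paywalled_post']}
```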