Fixes and Features update
Fixes:
If your OS is not Windows, the script will no longer use any Windows-specific functions.

The script will now grab all media unless it is locked behind a paywall or duplicated. Any corrupt media will be exported to archive.json.

All media will download to its proper folder. (No more images in the videos folder and vice versa.)

Features:

metadata/archive.json will now contain valid and invalid posts. (This replaces links.json.)

Config Update:

ignored_keywords -
Any posts containing these words will be ignored.

text_length -
When using {text} in file_name_format, you can set a maximum character length.

boards -
Input any boards you'd like to scrape automatically.
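For reference, this is how the updated 4chan settings block in config.json looks with this commit's defaults, written here as a Python dict purely for illustration (the real file is JSON):

```python
# Defaults added to the 4chan "settings" block in config.json by this commit,
# shown as a Python dict for illustration only.
four_chan_settings = {
    "directory": "",
    "file_name_format": "{file_name}.{ext}",
    "text_length": "",       # cap on {text} in file_name_format; "" falls back to the built-in maximum
    "overwrite_files": False,
    "date_format": "%d-%m-%Y",
    "boards": [],             # e.g. ["s", "gif"] to scrape those boards automatically
    "ignored_keywords": [],   # e.g. ["ignore", "me"] skips matching posts
}
```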
SecretShell committed Oct 20, 2019
1 parent 8a00117 commit 69d83b4
Showing 7 changed files with 417 additions and 254 deletions.
22 changes: 22 additions & 0 deletions README.md
@@ -59,6 +59,14 @@ file_name_format:
Example: {date}/{text}-{file_name}.{ext}
Warning: It's important to keep a unique identifier next to .{ext}. By default it's {file_name}, but it can be {date}-{text}.{ext}

text_length:

Default = ""
Ideal = "50"
Max = "259"

When you use {text} in file_name_format, you can limit how many characters it contains by entering a number.
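
As a rough illustration of the cap (mirroring the string slicing used in modules/four_chan.py; the subject below is invented):

```python
# Illustrative only: a text_length of "50" keeps the first 50 characters of {text}.
text_length = 50
subject = "An example thread subject that is much longer than fifty characters in total"
trimmed = subject[:text_length]  # truncated before the path is built
print(trimmed)
```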

auto_site_choice:

Default = ""
@@ -101,6 +109,20 @@ multithreading:
If set to false, you will download files 1 by 1. (If you don't have fast internet, may god help you.)
I'd recommend leaving it set to true.

boards:

Default = []
Example = ["s", "gif"]

Input the names of any boards that you want to scrape automatically.

ignored_keywords:

Default = []
Example = ["ignore", "me"]

The script will ignore any content that contains these keywords.
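
A minimal sketch of how this filter behaves, modelled on the keyword check added in modules/four_chan.py (the titles below are made up):

```python
# Illustrative only: content whose title/comment contains an ignored keyword is skipped.
ignored_keywords = ["ignore", "me"]

def is_ignored(title):
    title = title.lower()
    return any(keyword in title for keyword in ignored_keywords)

print(is_ignored("Please IGNORE this thread"))  # True  -> skipped
print(is_ignored("A perfectly normal thread"))  # False -> scraped
```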



# OPTIONAL ARGUMENTS
18 changes: 12 additions & 6 deletions Start Datascraper.py
@@ -34,6 +34,7 @@
x = int(input())
site_name = site_names[x]
json_auth = json_sites[site_name]["auth"]
json_site_settings = json_sites[site_name]["settings"]
session = ""
x = ""
app_token = ""
@@ -43,22 +44,27 @@
auth_hash = json_auth['auth_hash']
x = onlyfans
session = x.create_session(user_agent, auth_id, auth_hash, app_token)
array = []
elif site_name == "justforfans":
auth_id = json_auth['phpsessid']
auth_hash = json_auth['user_hash2']
x = justforfans
session = x.create_session(user_agent, auth_id, auth_hash)
array = []
elif site_name == "4chan":
x = four_chan
session = x.create_session()
array = json_site_settings["boards"]

if not session[0]:
continue
print('Input a '+site_name+' '+session[1])
input_link = input().strip()
username = helpers.parse_links(site_name, input_link)
start_time = timeit.default_timer()
session = session[0]
result = x.start_datascraper(session, username, site_name, app_token)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
if not array:
array = [input().strip()]
for input_link in array:
username = helpers.parse_links(site_name, input_link)
start_time = timeit.default_timer()
result = x.start_datascraper(session, username, site_name, app_token)
stop_time = str(int(timeit.default_timer() - start_time) / 60)
print('Task Completed in ' + stop_time + ' Minutes')
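
In plain terms, the change above makes the configured boards drive the scraping loop, with manual input only as a fallback. A simplified sketch of that flow (not the actual module code; the board names are examples):

```python
# Simplified sketch of the new loop: boards from config.json are scraped
# automatically; an empty list falls back to prompting the user once.
boards = ["s", "gif"]  # stands in for json_site_settings["boards"]

targets = boards if boards else [input("Input a 4chan board name: ").strip()]
for target in targets:
    # the real script calls helpers.parse_links() and x.start_datascraper() here
    print("Scraping board:", target)
```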
12 changes: 9 additions & 3 deletions config.json
@@ -15,8 +15,10 @@
"settings": {
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
"overwrite_files": true,
"date_format": "%d-%m-%Y"
"date_format": "%d-%m-%Y",
"ignored_keywords": []
}
},
"justforfans": {
@@ -27,18 +29,22 @@
"settings": {
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
"overwrite_files": true,
"date_format": "%d-%m-%Y"
"date_format": "%d-%m-%Y",
"ignored_keywords": []
}
},
"4chan": {
"auth": {},
"settings": {
"directory": "",
"file_name_format": "{file_name}.{ext}",
"text_length": "",
"overwrite_files": false,
"date_format": "%d-%m-%Y",
"ignore_thread_titles": [""]
"boards": [],
"ignored_keywords": []
}
}

156 changes: 123 additions & 33 deletions modules/four_chan.py
@@ -27,6 +27,12 @@
format_path = json_settings['file_name_format']
overwrite_files = json_settings["overwrite_files"]
date_format = json_settings["date_format"]
ignored_keywords = json_settings["ignored_keywords"]
maximum_length = 240
text_length = int(json_settings["text_length"]
) if json_settings["text_length"] else maximum_length
if text_length > maximum_length:
text_length = maximum_length

max_threads = multiprocessing.cpu_count()

@@ -41,6 +47,7 @@ def start_datascraper(session, board_name, site_name, link_type=None):
print(user_id[1])
print("First time? Did you forget to edit your config.json file?")
return [False]
print("Board: " + board_name)
array = scrape_choice(board_name)
link_array = {}
if multithreading:
@@ -50,9 +57,7 @@ def start_datascraper(session, board_name, site_name, link_type=None):
threads = board_scraper(session, array[0], "")
archive_threads = board_scraper(session, array[1], "archive")
threads = threads + archive_threads
print("Scraping Threads")
threads = pool.starmap(thread_scraper,
product(threads, [board_name], [session]))
print("Original Count: "+str(len(threads)))
directory = j_directory
directory += "/"+site_name + "/" + board_name + "/"
if "/sites/" == j_directory:
@@ -62,10 +67,16 @@
else:
directory = directory

print("Scraping Threads")
threads = pool.starmap(thread_scraper,
product(threads, [board_name], [session], [directory]))
threads = [x for x in threads if x is not None]
print("Filtered Count: "+str(len(threads)))
print("Downloading Media")
pool.starmap(download_media,
product(threads, [session], [directory], [board_name]))

results = pool.starmap(download_media,
product(threads, [session], [directory], [board_name]))
count_results = str(len([x for x in threads if x is None]))
print("Valid Count: "+count_results)
# When profile is done scraping, this function will return True
return [True, link_array]

@@ -103,49 +114,128 @@ def board_scraper(session, link, category):
return threads


def thread_scraper(thread_id, board_name, session):
link = "http://a.4cdn.org/" + board_name + "/thread/" + str(
thread_id) + ".json"
def thread_scraper(thread_id, board_name, session, directory):
thread_id = str(thread_id)
link = "http://a.4cdn.org/" + board_name + "/thread/" + thread_id + ".json"
r = session.get(link)
y = json.loads(r.text)
return y

if r.status_code == 404:
return
try:
thread = json.loads(r.text)
thread_master = thread["posts"][0]
except Exception as e:
print(e, link)
return
if "archived" in thread_master:
location = "Archive"
else:
location = "Catalog"

def download_media(thread, session, directory, board_name):
thread_master = thread["posts"][0]
thread_id = str(thread_master["no"])
if "sub" in thread_master:
title = thread_master["sub"].lower()
if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
print("Removed From "+location+": ", title)
return

if "com" in thread_master:
title = thread_master["com"].lower()
if any(ignored_keyword in title for ignored_keyword in ignored_keywords):
print("Removed From "+location+": ", title)
return
text = ""
if "sub" in thread_master:
text = thread_master["sub"]
text = thread_master["sub"][:text_length]
else:
if "com" in thread_master:
text = thread_master["com"]
text = thread_master["com"][:text_length]
text = BeautifulSoup(text, 'html.parser').get_text().replace(
"\n", " ").strip()
text = re.sub(r'[\\/*?:"<>|]', '', text)
thread["download_path"] = ""
for post in thread["posts"]:
if "name" not in post:
post["name"] = "Anonymous"
if "filename" in post:
filename = str(post["tim"])
ext = post["ext"].replace(".", "")
link = "http://i.4cdn.org/" + board_name + "/" + filename+"."+ext
filename = post["filename"]
new_directory = directory+"/"+text+" - "+thread_id+"/"
if not text:
new_directory = new_directory.replace(" - ", "")

date_object = datetime.fromtimestamp(post["time"])
new_directory = reformat(new_directory, filename, text, ext, date_object, post["name"], format_path,
date_format)
if not overwrite_files:
if os.path.isfile(new_directory):
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(new_directory)):
os.makedirs(os.path.dirname(new_directory))
with open(new_directory, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
print(link, new_directory)
og_filename = filename
download_path = os.path.dirname(reformat(
new_directory, filename, text, ext, date_object, post["name"], format_path, date_format, text_length, maximum_length))
size = len(download_path)
size2 = len(thread["download_path"])
if thread["download_path"]:
if len(download_path) < len(thread["download_path"]):
thread["download_path"] = download_path
else:
thread["download_path"] = download_path
return thread


def download_media(thread, session, directory, board_name):
try:
directory = thread["download_path"]+"/"
valid = False
for post in thread["posts"]:
if "filename" in post:
post["filename"] = re.sub(
r'[\\/*?:"<>|]', '', post["filename"])
ext = post["ext"].replace(".", "")
filename = str(post["tim"])+"."+ext
link = "http://i.4cdn.org/" + board_name + "/" + filename
filename = post["filename"]+"."+ext
download_path = directory+filename
count_string = len(download_path)
if count_string > 259:
num_sum = count_string - 259
post["filename"] = post["filename"][:50]
download_path = directory+post["filename"]+"."+ext

if not overwrite_files:
count = 1
found = False
og_filename = post["filename"]
while True:
if os.path.isfile(download_path):
remote_size = post["fsize"]
local_size = os.path.getsize(download_path)
if remote_size == local_size:
found = True
break
else:
download_path = directory+og_filename + \
" ("+str(count)+")."+ext
count += 1
continue
else:
found = False
break
if found:
continue
r = session.get(link, stream=True)
if r.status_code != 404:
if not os.path.exists(os.path.dirname(download_path)):
os.makedirs(os.path.dirname(download_path))
with open(download_path, 'wb') as f:
for chunk in r.iter_content(chunk_size=1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
print(download_path)
valid = True
if valid:
os.makedirs(directory, exist_ok=True)
with open(directory+'archive.json', 'w') as outfile:
json.dump(thread, outfile)
return thread
else:
return
except Exception as e:
print("ERROR", e, directory)
return


def create_session():
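The reworked download_media() above is what prevents both re-downloads and silent overwrites: an existing local file is treated as "already downloaded" only when its size matches the remote fsize; otherwise the new file gets a " (n)" suffix. A condensed sketch of that collision logic (paths and sizes here are illustrative, not the module's exact code):

```python
import os

# Condensed sketch of the collision handling in download_media():
# return None when an identical file is already on disk, otherwise return a
# free path, appending " (n)" to avoid clobbering a different file.
def resolve_download_path(directory, filename, ext, remote_size):
    download_path = directory + filename + "." + ext
    count = 1
    while os.path.isfile(download_path):
        if os.path.getsize(download_path) == remote_size:
            return None  # same size as the remote file -> already downloaded
        download_path = directory + filename + " (" + str(count) + ")." + ext
        count += 1
    return download_path
```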
46 changes: 36 additions & 10 deletions modules/helpers.py
@@ -1,5 +1,6 @@
from bs4 import BeautifulSoup
import re
import os
from bs4 import BeautifulSoup


def parse_links(site_name, input_link):
@@ -19,21 +20,46 @@ def parse_links(site_name, input_link):
return input_link


def reformat(directory, file_name, text, ext, date, username, format_path, date_format):
def reformat(directory, file_name, text, ext, date, username, format_path, date_format, text_length, maximum_length):
path = format_path.replace("{username}", username)
text = BeautifulSoup(text, 'html.parser').get_text().replace("\n", " ").strip()
text = BeautifulSoup(text, 'html.parser').get_text().replace(
"\n", " ").strip()
filtered_text = re.sub(r'[\\/*?:"<>|]', '', text)
path = path.replace("{text}", filtered_text)
date = date.strftime(date_format)
path = path.replace("{date}", date)
path = path.replace("{file_name}", file_name)
path = path.replace("{ext}", ext)
directory += path
count_string = len(directory)
if count_string > 259:
num_sum = count_string - 259
directory = directory.replace(filtered_text, filtered_text[:-num_sum])
return directory

directory2 = directory + path
count_string = len(directory2)
if count_string > maximum_length:
num_sum = count_string - maximum_length
directory2 = directory2.replace(
filtered_text, filtered_text[:text_length])
count_string = len(directory2)
if count_string > maximum_length:
num_sum = count_string - maximum_length
directory2 = directory2.replace(
filtered_text, filtered_text[:-num_sum])
count_string = len(directory2)
if count_string > maximum_length:
directory2 = directory
count_string = len(directory2)
if count_string > maximum_length:
num_sum = count_string - maximum_length
directory2 = directory2.replace(
filtered_text, filtered_text[:50])
count_string = len(directory2)
if count_string > maximum_length:
directory2 = directory
return directory2


def format_media_set(media_set):
x = {}
x["valid"] = []
x["invalid"] = []
for y in media_set:
x["valid"].extend(y[0])
x["invalid"].extend(y[1])
return x
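
The new format_media_set() helper above simply merges per-task (valid, invalid) result pairs into one dict, which appears to be the valid/invalid split that metadata/archive.json records (per the commit notes). A tiny usage sketch with invented data:

```python
# Invented example data: each tuple is (valid_media, invalid_media) from one task.
media_set = [
    (["image1.jpg"], []),
    (["video1.mp4"], ["paywalled_post"]),
]

merged = {"valid": [], "invalid": []}
for valid, invalid in media_set:
    merged["valid"].extend(valid)
    merged["invalid"].extend(invalid)

print(merged)  # {'valid': ['image1.jpg', 'video1.mp4'], 'invalid': ['paywalled_post']}
```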