diff --git a/src/pe_mailer/email_reports.py b/src/pe_mailer/email_reports.py index 95141be7..1b12bcf4 100644 --- a/src/pe_mailer/email_reports.py +++ b/src/pe_mailer/email_reports.py @@ -28,13 +28,16 @@ import os import re import sys +import time from typing import Any, Dict # Third-Party Libraries import boto3 from botocore.exceptions import ClientError import docopt +import pymongo.errors from schema import And, Schema, SchemaError, Use +import yaml # cisagov Libraries import pe_reports @@ -44,7 +47,6 @@ from .pe_message import PEMessage from .stats_message import StatsMessage -# Setup logging LOGGER = logging.getLogger(__name__) MAILER_AWS_PROFILE = "cool-dns-sessendemail-cyber.dhs.gov" MAILER_ARN = os.environ.get("MAILER_ARN") @@ -192,11 +194,11 @@ def send_message(ses_client, message, counter=None): def send_pe_reports(ses_client, pe_report_dir, to): - """ - Send out Posture and Exposure reports. + """Send out Posture and Exposure reports. Parameters ---------- + ses_client : boto3.client The boto3 SES client via which the message is to be sent. @@ -227,22 +229,25 @@ def send_pe_reports(ses_client, pe_report_dir, to): try: # The directory must contain one usable report cyhy_agencies = len(pe_orgs) - LOGGER.info(f"{cyhy_agencies} agencies found in P&E.") + LOGGER.info(f"Running report mailer for {cyhy_agencies} organizations") 1 / cyhy_agencies except ZeroDivisionError: - LOGGER.critical("No report data is found in %s", pe_report_dir) + LOGGER.critical("No report data was found in %s", pe_report_dir) sys.exit(1) staging_conn = connect() # org_contacts = get_orgs_contacts(staging_conn) # old tsql ver. - org_contacts = get_orgs_contacts() # api ver. - + org_contacts = get_orgs_contacts() # api ver. + agencies_emailed_pe_reports = 0 + reports_not_mailed = 0 # Iterate over cyhy_requests, if necessary if pe_report_dir: for org in pe_orgs: id = org[2] if id == "GSEC": + LOGGER.warning(f"The PDF report for {org[2]} was intentionally set to not be mailed") + reports_not_mailed += 1 continue if to is not None: to_emails = to @@ -274,9 +279,10 @@ def send_pe_reports(ses_client, pe_report_dir, to): # At most one Cybex report and CSV should match if len(pe_report_filenames) > 2: - LOGGER.warning("More than two PDF reports found") + LOGGER.warning(f"More than two encrypted PDF reports found for {org[2]}") elif not pe_report_filenames: - LOGGER.error("No PDF report found") + LOGGER.warning(f"No encrypted PDF report found for {org[2]}, no report will be mailed") + reports_not_mailed += 1 continue if pe_report_filenames: @@ -306,10 +312,11 @@ def send_pe_reports(ses_client, pe_report_dir, to): pe_report_filename, pe_asm_filename, report_date, id, to_emails ) - print(to_emails) - print(pe_report_filename) - print(pe_asm_filename) - print(report_date) + print("Recipient: ", to_emails) + print("Report Date: ", report_date) + print("Report File:", pe_report_filename) + print("ASM Summary File", pe_asm_filename, "\n") + try: agencies_emailed_pe_reports = send_message( @@ -325,7 +332,8 @@ def send_pe_reports(ses_client, pe_report_dir, to): # Print out and log some statistics pe_stats_string = f"Out of {cyhy_agencies} agencies with Posture and Exposure reports, {agencies_emailed_pe_reports} ({100.0 * agencies_emailed_pe_reports / cyhy_agencies:.2f}%) were emailed." 
- LOGGER.info(pe_stats_string) + mail_summary_log_string = f"{agencies_emailed_pe_reports}/{cyhy_agencies} reports were mailed, {reports_not_mailed}/{cyhy_agencies} reports were not mailed" + LOGGER.info(mail_summary_log_string) return pe_stats_string @@ -339,19 +347,20 @@ def send_reports(pe_report_dir, summary_to, test_emails): return 1 # Assume role to use mailer - sts_client = boto3.client("sts") - assumed_role_object = sts_client.assume_role( - RoleArn=MAILER_ARN, RoleSessionName="AssumeRoleSession1" + sts_client = boto3.client('sts') + assumed_role_object=sts_client.assume_role( + RoleArn=MAILER_ARN, + RoleSessionName="AssumeRoleSession1" ) - credentials = assumed_role_object["Credentials"] + credentials=assumed_role_object['Credentials'] - ses_client = boto3.client( - "ses", + ses_client = boto3.client("ses", region_name="us-east-1", - aws_access_key_id=credentials["AccessKeyId"], - aws_secret_access_key=credentials["SecretAccessKey"], - aws_session_token=credentials["SessionToken"], + aws_access_key_id=credentials['AccessKeyId'], + aws_secret_access_key=credentials['SecretAccessKey'], + aws_session_token=credentials['SessionToken'] ) + # Email the summary statistics, if necessary if test_emails is not None: @@ -380,6 +389,8 @@ def send_reports(pe_report_dir, summary_to, test_emails): def main(): """Send emails.""" + LOGGER.info("--- PE Report Mailing Starting ---") + start_time = time.time() # Parse command line arguments args: Dict[str, str] = docopt.docopt(__doc__, version=__version__) @@ -416,7 +427,7 @@ def main(): level=log_level.upper(), ) - LOGGER.info("Sending Posture & Exposure Reports, Version : %s", __version__) + LOGGER.info("Posture & Exposure Report Mailer, Version : %s", __version__) send_reports( # TODO: Improve use of schema to validate arguments. 
@@ -426,5 +437,10 @@ def main(): validated_args["--test-emails"], ) + end_time = time.time() + LOGGER.info(f"Execution time for PE report mailing: {str(datetime.timedelta(seconds=(end_time - start_time)))} (H:M:S)") + LOGGER.info("--- PE Report Mailing Complete ---") + # Stop logging and clean up logging.shutdown() + diff --git a/src/pe_reports/asm_generator.py b/src/pe_reports/asm_generator.py index 7cbbd034..43b7c330 100644 --- a/src/pe_reports/asm_generator.py +++ b/src/pe_reports/asm_generator.py @@ -7,24 +7,25 @@ import os # Third-Party Libraries -from PyPDF2 import PdfFileReader, PdfFileWriter import fitz +from PyPDF2 import PdfFileReader, PdfFileWriter +import numpy as np import pandas as pd - -# from reportlab.lib.enums import TA_CENTER from reportlab.lib.pagesizes import letter from reportlab.lib.styles import ParagraphStyle -from reportlab.lib.units import inch from reportlab.pdfbase import pdfmetrics from reportlab.pdfbase.ttfonts import TTFont from reportlab.pdfgen import canvas from reportlab.platypus import Frame, Paragraph +from reportlab.lib.enums import TA_CENTER +from reportlab.lib.units import inch + # cisagov Libraries from pe_reports.data.db_query import ( query_cidrs_by_org, - query_extra_ips, query_foreign_IPs, + query_extra_ips, query_ports_protocols, query_roots, query_software, @@ -35,17 +36,14 @@ LOGGER = logging.getLogger(__name__) BASE_DIR = os.path.abspath(os.path.dirname(__file__)) -IN_FILEPATH = BASE_DIR + "/assets_asm/attack_surface_empty.pdf" ON_PAGE_INDEX = 0 UNDERNEATH = ( False # if True, new content will be placed underneath page (painted first) ) +pdfmetrics.registerFont(TTFont("Frank_Goth", BASE_DIR + "/fonts/FranklinGothic.ttf")) pdfmetrics.registerFont( - TTFont("Frank_Goth", BASE_DIR + "/assets_asm/FranklinGothic.ttf") -) -pdfmetrics.registerFont( - TTFont("Frank_Goth_Book", BASE_DIR + "/assets_asm/Franklin_Gothic_Book_Regular.ttf") + TTFont("Frank_Goth_Book", BASE_DIR + "/fonts/Franklin_Gothic_Book_Regular.ttf") ) @@ -108,7 +106,7 @@ def add_stat_frame(current_value, last_value, x, y, width, height, style, can): def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx): """Create and add JSON attachment.""" - LOGGER.info("Creating attachment") + LOGGER.info("Creating ASM attachments") # Create ASM Excel file asmWriter = pd.ExcelWriter(asm_xlsx, engine="xlsxwriter") @@ -119,16 +117,14 @@ def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx): cidr_dict = cidr_df["network"].to_list() # Extra IPs - LOGGER.info("Getting extra IPs") ip_lst = query_extra_ips(org_uid) ips_df = pd.DataFrame(ip_lst, columns=["ip"]) ips_df.to_excel(asmWriter, sheet_name="Extra IPs", index=False) ips_dict = ips_df["ip"].to_list() - LOGGER.info("Finished extra IPs") # Ports/protocols ports_protocols_df = query_ports_protocols(org_uid) - ports_protocols_df.to_excel(asmWriter, sheet_name="Ports_Protocols", index=False) + ports_protocols_df.to_excel(asmWriter, sheet_name="Ports Protocols", index=False) ports_protocols_dict = ports_protocols_df.to_dict(orient="records") # Root domains @@ -139,9 +135,12 @@ def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx): # Sub-domains sd_df = query_subs(org_uid) - sd_df = sd_df[["sub_domain"]] - sd_df.to_excel(asmWriter, sheet_name="Sub-domains", index=False) - sd_dict = sd_df["sub_domain"].to_list() + # sd_df = sd_df[["sub_domain"]] + #sd_df = sd_df[["sub_domain", "origin_root_domain", "pe_discovered_asset"]] + sd_df = sd_df[["sub_domain", "origin_root_domain"]] + sd_df.to_excel(asmWriter, 
sheet_name="Subdomains", index=False) + # sd_dict = sd_df["sub_domain"].to_list() + sd_dict = sd_df.to_dict(orient="records") # Software soft_df = query_software(org_uid) @@ -205,9 +204,7 @@ def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx): return asm_xlsx -def create_summary( - org_uid, final_output, data_dict, file_name, json_filename, excel_filename -): +def create_summary(org_uid, final_output, data_dict, file_name, json_filename, excel_filename): """Create ASM summary PDF.""" packet = io.BytesIO() @@ -310,8 +307,8 @@ def create_summary( can, ) json_title_frame = Frame( - 6 * inch, 100, 1.5 * inch, 0.5 * inch, id=None, showBoundary=0 - ) + 6 * inch, 100, 1.5 * inch, 0.5 * inch, id=None, showBoundary=0 + ) json_title = Paragraph( "JSON      EXCEL", style=json_excel, @@ -324,7 +321,7 @@ def create_summary( new_pdf = PdfFileReader(packet) # Read existing PDF template - existing_pdf = PdfFileReader(open(BASE_DIR + "/assets_asm/empty_asm.pdf", "rb")) + existing_pdf = PdfFileReader(open(BASE_DIR + "/assets_asm/empty_asm_2024-04-15.pdf", "rb")) output = PdfFileWriter() # Add the "watermark" (which is the new pdf) on the existing page @@ -341,5 +338,5 @@ def create_summary( asm_xlsx = add_attachment( org_uid, final_output, file_name, json_filename, excel_filename ) - - return asm_xlsx + + return asm_xlsx \ No newline at end of file diff --git a/src/pe_reports/assets_asm/empty_asm_2024-04-15.pdf b/src/pe_reports/assets_asm/empty_asm_2024-04-15.pdf new file mode 100644 index 00000000..f5fe4ba1 Binary files /dev/null and b/src/pe_reports/assets_asm/empty_asm_2024-04-15.pdf differ diff --git a/src/pe_reports/data/db_query.py b/src/pe_reports/data/db_query.py index 0eca2127..7ebb0474 100644 --- a/src/pe_reports/data/db_query.py +++ b/src/pe_reports/data/db_query.py @@ -29,8 +29,9 @@ CONN_PARAMS_DIC_STAGING = staging_config() # These need to filled with API key/url path in database.ini -pe_api_key = CONN_PARAMS_DIC_STAGING.get("pe_api_key") -pe_api_url = CONN_PARAMS_DIC_STAGING.get("pe_api_url") +API_DIC = staging_config(section="pe_api") +pe_api_url = API_DIC.get("pe_api_url") +pe_api_key = API_DIC.get("pe_api_key") def task_api_call(task_url, check_url, data={}, retry_time=3): @@ -55,24 +56,36 @@ def task_api_call(task_url, check_url, data={}, retry_time=3): create_task_url, headers=headers, data=data ).json() task_id = create_task_result.get("task_id") - LOGGER.info("Created task for", task_url, "query, task_id: ", task_id) + LOGGER.info("Created task for " + task_url + " query, task_id: " + task_id) check_task_url += task_id while task_status != "Completed" and task_status != "Failed": # Ping task status endpoint and get status - check_task_resp = requests.get(check_task_url, headers=headers).json() + # check_task_resp = requests.get(check_task_url, headers=headers).json() + check_task_resp = requests.get(check_task_url, headers=headers) + #print(check_task_resp) + check_task_resp = check_task_resp.json() task_status = check_task_resp.get("status") - LOGGER.info("\tPinged", check_url, "status endpoint, status:", task_status) + LOGGER.info( + "\tPinged " + check_url + " status endpoint, status: " + task_status + ) time.sleep(retry_time) except requests.exceptions.HTTPError as errh: LOGGER.error(errh) + print(errh) except requests.exceptions.ConnectionError as errc: LOGGER.error(errc) + print(errc) except requests.exceptions.Timeout as errt: LOGGER.error(errt) + print(errt) except requests.exceptions.RequestException as err: LOGGER.error(err) + print(err) except 
json.decoder.JSONDecodeError as err: LOGGER.error(err) + print(err) + except Exception as err: + print(err) # Once task finishes, return result if task_status == "Completed": return check_task_resp.get("result") @@ -2080,7 +2093,6 @@ def query_subs(org_uid): Return: All the subdomains belonging to the specified org as a dataframe """ - start_time = time.time() total_num_pages = 1 page_num = 1 total_data = [] @@ -2100,21 +2112,16 @@ def query_subs(org_uid): page_num += 1 # Once all data has been retrieved, return overall dataframe total_data = pd.DataFrame.from_dict(total_data) - LOGGER.info( - "Total time to retrieve all subdomains for this org: " - + str(time.time() - start_time) - ) # Process data and return total_data.rename( columns={ - "root_domain_uid_id": "root_domain_uid", - "data_source_uid_id": "data_source_uid", - "dns_record_uid_id": "dns_record_uid", + "root_domain_uid__root_domain": "origin_root_domain", + "identified": "pe_discovered_asset", }, inplace=True, ) - total_data["first_seen"] = pd.to_datetime(total_data["first_seen"]).dt.date - total_data["last_seen"] = pd.to_datetime(total_data["last_seen"]).dt.date + # total_data["first_seen"] = pd.to_datetime(total_data["first_seen"]).dt.date + # total_data["last_seen"] = pd.to_datetime(total_data["last_seen"]).dt.date # Return truly empty dataframe if no results if total_data[total_data.columns].isnull().apply(lambda x: all(x), axis=1)[0]: total_data.drop(total_data.index, inplace=True) diff --git a/src/pe_reports/fonts/FranklinGothic.ttf b/src/pe_reports/fonts/FranklinGothic.ttf new file mode 100644 index 00000000..778e5b14 Binary files /dev/null and b/src/pe_reports/fonts/FranklinGothic.ttf differ diff --git a/src/pe_reports/fonts/Franklin_Gothic_Book_Regular.ttf b/src/pe_reports/fonts/Franklin_Gothic_Book_Regular.ttf new file mode 100644 index 00000000..21c4ecfc Binary files /dev/null and b/src/pe_reports/fonts/Franklin_Gothic_Book_Regular.ttf differ diff --git a/src/pe_reports/helpers/encrypt_accessor.py b/src/pe_reports/helpers/encrypt_accessor.py index 93759a5a..e8afda0e 100644 --- a/src/pe_reports/helpers/encrypt_accessor.py +++ b/src/pe_reports/helpers/encrypt_accessor.py @@ -10,20 +10,18 @@ -c --ssh-rsa-file=FILENAME A YAML file containing the Cyber Hygiene database credentials. 
""" -# Standard Python Libraries +import boto3 +from datetime import timedelta +from docopt import docopt +import fitz import logging import os +import time import traceback -# Third-Party Libraries -from docopt import docopt -import fitz - -# cisagov Libraries +from pe_reports.data.db_query import connect, get_orgs, get_orgs_pass from pe_reports.data.config import db_password_key -from pe_reports.data.db_query import connect, get_orgs_pass -# Setup logging LOGGER = logging.getLogger(__name__) ACCESSOR_AWS_PROFILE = "cool-dns-sesmanagesuppressionlist-cyber.dhs.gov" BUCKET_NAME = "cisa-crossfeed-staging-reports" @@ -53,23 +51,27 @@ def encrypt(file, password, encrypted_file): def download_encrypt_reports(report_date, output_dir): """Fetch reports from S3 bucket.""" - # download_count = 0 - # total = len(pe_orgs) - # print(total) - + LOGGER.info("--- PE Report Encryption Starting ---") + start_time = time.time() + download_count = 0 # Encrypt the reports conn = connect() pe_org_pass = get_orgs_pass(conn, PASSWORD) conn.close() encrypted_count = 0 + no_pass_count = 0 + LOGGER.info(f"Encrypting {len(pe_org_pass)} PE reports for the {report_date} report run") for org_pass in pe_org_pass: print(org_pass) password = org_pass[1] - if password is None: - LOGGER.error("NO PASSWORD") + if password == None: + LOGGER.warning(f"No password on file for {org_pass[0]}, no encrypted report will be generated") + no_pass_count += 1 continue # Check if file exists before encrypting - current_file = f"{output_dir}/{org_pass[0]}/Posture_and_Exposure_Report-{org_pass[0]}-{report_date}.pdf" + current_file = ( + f"{output_dir}/{org_pass[0]}/Posture_and_Exposure_Report-{org_pass[0]}-{report_date}.pdf" + ) current_asm_file = f"{output_dir}/{org_pass[0]}/Posture-and-Exposure-ASM-Summary_{org_pass[0]}_{report_date}.pdf" if not os.path.isfile(current_file): LOGGER.error("%s report does not exist.", org_pass[0]) @@ -100,7 +102,10 @@ def download_encrypt_reports(report_date, output_dir): LOGGER.error("%s report failed to encrypt.", org_pass[0]) continue - LOGGER.info("%d/%d were encrypted.", encrypted_count, 134) + end_time = time.time() + LOGGER.info(f"{encrypted_count}/{len(pe_org_pass)} reports were encrypted, {no_pass_count}/{len(pe_org_pass)} reports do not have passwords") + LOGGER.info(f"Execution time for PE report encryption: {str(timedelta(seconds=(end_time - start_time)))} (H:M:S)") + LOGGER.info("--- PE Report Encryption Complete ---") def main(): diff --git a/src/pe_reports/pages.py b/src/pe_reports/pages.py index 48b24b24..e63fdd52 100644 --- a/src/pe_reports/pages.py +++ b/src/pe_reports/pages.py @@ -376,11 +376,11 @@ def init( } # Get ASM values - LOGGER.info("Getting asset counts") + # LOGGER.info("Getting asset counts") asset_dict = get_org_assets_count(org_uid) - LOGGER.info("finished getting asset counts") + # LOGGER.info("finished getting asset counts") asset_dict_past = get_org_assets_count_past(org_uid, start_date - timedelta(days=1)) - LOGGER.info("Past report date: %s", start_date - timedelta(days=1)) + # LOGGER.info("Past report date: %s", start_date - timedelta(days=1)) if asset_dict_past.empty: LOGGER.error("No ASM summary data for the last report period.") diff --git a/src/pe_reports/report_generator.py b/src/pe_reports/report_generator.py index cc24bcc0..ad13b0ae 100644 --- a/src/pe_reports/report_generator.py +++ b/src/pe_reports/report_generator.py @@ -19,11 +19,13 @@ import logging import os import sys +import time from typing import Any, Dict # Third-Party Libraries import boto3 from 
botocore.exceptions import ClientError +from datetime import timedelta import docopt import fitz import pandas as pd @@ -35,14 +37,14 @@ from ._version import __version__ from .asm_generator import create_summary from .data.db_query import connect, get_demo_orgs, get_orgs, refresh_asset_counts_vw + +# from .helpers.generate_score import get_pe_scores from .pages import init from .reportlab_core_generator import core_report_gen from .reportlab_generator import report_gen # from .scorecard_generator import create_scorecard -# from .helpers.generate_pe_score import get_pe_scores -# Setup logging LOGGER = logging.getLogger(__name__) ACCESSOR_AWS_PROFILE = os.getenv("ACCESSOR_PROFILE") @@ -61,13 +63,16 @@ def upload_file_to_s3(file_name, datestring, bucket, excel_org): try: response = s3_client.upload_file(file_name, bucket, object_name) if response is None: - LOGGER.info("Success uploading to S3.") + LOGGER.info(f"Success uploading {file_name.split('/')[-1]} to S3.") else: LOGGER.info(response) except ClientError as e: LOGGER.error(e) +LOGGER = logging.getLogger(__name__) + + def embed( output_directory, org_code, @@ -168,43 +173,34 @@ def generate_reports(datestring, output_directory, soc_med_included=False, demo= generated_reports = 0 # Resfresh ASM counts view - LOGGER.info("Refreshing ASM count view and IPs from cidrs") + LOGGER.info("Refreshing ASM asset count view and IPs from cidrs view") refresh_asset_counts_vw() # set_from_cidr() - LOGGER.info("Finished refreshing ASM count view and IPs from Cidrs") + LOGGER.info("Finished refreshing ASM asset count view and IPs from cidrs view") # Iterate over organizations - if pe_orgs: - LOGGER.info("PE orgs count: %d", len(pe_orgs)) - # Generate PE scores for all stakeholders. - LOGGER.info("Calculating P&E Scores") + # Generate PE scores for all stakeholders WIP + # LOGGER.info("Calculating P&E Scores") # pe_scores_df = get_pe_scores(datestring, 12) - # go = 0 # pe_orgs.reverse() - for org in pe_orgs: + + # Uncomment this to generate reports only for these orgs + # pe_orgs = [x for x in pe_orgs if x[2] in [""]] + + # Uncomment this to generate all reports except for these orgs + # pe_orgs = [x for x in pe_orgs if x[2] not in [""]] + + LOGGER.info(f"Generating PE reports for {len(pe_orgs)} requested organizations") + + for org_idx, org in enumerate(pe_orgs): # Assign organization values org_uid = org[0] org_name = org[1] org_code = org[2] premium = org[8] - # Uncomment this to only run specified orgs - # if org_code not in ["EXIM"]: - # continue - - # Uncomment this to skip specified orgs - # if org_code in ["ABLTYONE","ACHP","ACUS","ADF","AFRH","BGSF","CFA","CFPB","CFTC","CIGIE","CLACWA","CNCS","CPSC","CSHIB","CSOSA","DE","DENALI_AlasConnect","DFC","DHS","DHS_FLETC","DNFSB","DOC","DOC_BIS","DOC_CENSUS","DOC_NIST","DOC_NOAA","DOC_NTIA","DOE","DOI","DOI_BIA","DOI_BSEE-BOEM-ONRR","DOI_FWS","DOI_IBC","DOI_NPS","DOI_OIG","DOI_OS","DOI_OS-OAS","DOJ","DOL","DOL_BLS","DOS","DOT","EAC","ED","EEOC","EOP","EOP_OMB","EPA","EXIM","FAA","FCA","FCC","FERC","FHFA","FLRA","FMSHRC","FRB","FTC","GSA","GSEC","GTA","HHS_FDA","HHS_NIH","MCC","MMC","MSPB","NARA","NASA","NCD","NCPC","NCUA","NEA","NEH","NLRB","NMB","NRC","NSF","NTSB","NWTRB","OGE","ONHIR","OPM","OSC","OSHRC","PBGC","PC","PCLOB","PRC","PT","RRB","SBA","SEC","SSA","SSAB","SSS","STB","TREAS","TVA","UDALL","USAB","USAGM","USAID","USCCR","USDA","USIBWC","USICH","USITC","USTDA","VA"]: - # continue - - # if org_code == "HHS_FDA": - # go = 1 - # continue - # if go != 1: - # continue - # Rapidgator%20 DOI_BIA - - 
LOGGER.info("Running on %s", org_code) + LOGGER.info(f"-- Generating report for {org_code} ({org_idx+1} of {len(pe_orgs)}) --") # Create folders in output directory for dir_name in ("ppt", org_code): @@ -249,7 +245,7 @@ def generate_reports(datestring, output_directory, soc_med_included=False, demo= ) # Create ASM Summary - LOGGER.info("Creating ASM Summary") + LOGGER.info("Creating ASM summary") summary_filename = f"{output_directory}/Posture-and-Exposure-ASM-Summary_{org_code}_{scorecard_dict['end_date'].strftime('%Y-%m-%d')}.pdf" final_summary_output = f"{output_directory}/{org_code}/Posture-and-Exposure-ASM-Summary_{org_code}_{scorecard_dict['end_date'].strftime('%Y-%m-%d')}.pdf" summary_json_filename = f"{output_directory}/{org_code}/ASM_Summary.json" @@ -262,13 +258,13 @@ def generate_reports(datestring, output_directory, soc_med_included=False, demo= summary_json_filename, summary_excel_filename, ) - LOGGER.info("Done") + LOGGER.info("Finished creating ASM summary") # Create scorecard - LOGGER.info("Creating scorecard") + # LOGGER.info("Creating scorecard") # scorecard_filename = f"{output_directory}/{org_code}/Posture-and-Exposure-Scorecard_{org_code}_{scorecard_dict['end_date'].strftime('%Y-%m-%d')}.pdf" # create_scorecard(scorecard_dict, scorecard_filename) - LOGGER.info("Done") + # LOGGER.info("Finished creating scorecard") # Convert to HTML to PDF output_filename = f"{output_directory}/Posture_and_Exposure_Report-{org_code}-{datestring}.pdf" @@ -330,7 +326,8 @@ def generate_reports(datestring, output_directory, soc_med_included=False, demo= "Connection to pe database failed and/or there are 0 organizations stored." ) - LOGGER.info("%s reports generated", generated_reports) + LOGGER.info(f"In total, {generated_reports}/{len(pe_orgs)} reports were generated") + LOGGER.info(f"Generated reports have been output to the directory: {output_directory}") def main(): @@ -370,7 +367,9 @@ def main(): level=log_level.upper(), ) - LOGGER.info("Loading Posture & Exposure Report, Version : %s", __version__) + LOGGER.info("--- PE Report Generation Starting ---") + LOGGER.info("Posture & Exposure Report, Version : %s", __version__) + report_gen_start_time = time.time() # Create output directory if not os.path.exists(validated_args["OUTPUT_DIRECTORY"]): @@ -384,7 +383,10 @@ def main(): validated_args["--demo"], ) - LOGGER.info("%s reports generated", generated_reports) + report_gen_end_time = time.time() + report_gen_exe_time = str(timedelta(seconds=(report_gen_end_time - report_gen_start_time))) + LOGGER.info(f"Execution time for PE report generation: {report_gen_exe_time} (H:M:S)") + LOGGER.info("--- PE Report Generation Complete ---") # Stop logging and clean up logging.shutdown() diff --git a/src/pe_reports/reportlab_generator.py b/src/pe_reports/reportlab_generator.py index 9e4ccf48..7980f3d3 100644 --- a/src/pe_reports/reportlab_generator.py +++ b/src/pe_reports/reportlab_generator.py @@ -169,8 +169,7 @@ def format_table( # Remove emojis from content because the report generator can't display them print(str(cell)) cell = Paragraph( - demoji.replace(str(cell), "").replace("&", "[and]"), - column_style_list[current_cell], + demoji.replace(str(cell), "").replace("&", "[and]"), column_style_list[current_cell] ) current_row.append(cell) diff --git a/src/pe_source/cybersixgill.py b/src/pe_source/cybersixgill.py index 82f3dc93..acc1972b 100644 --- a/src/pe_source/cybersixgill.py +++ b/src/pe_source/cybersixgill.py @@ -3,7 +3,9 @@ # Standard Python Libraries from datetime import date, datetime, 
timedelta import logging +import pandas as pd import sys +import time import traceback from .data.pe_db.db_query_source import ( @@ -42,7 +44,6 @@ START_DATE_TIME = (NOW - DAYS_BACK).strftime("%Y-%m-%d %H:%M:%S") END_DATE_TIME = NOW.strftime("%Y-%m-%d %H:%M:%S") -# Setup logging LOGGER = logging.getLogger(__name__) @@ -101,7 +102,8 @@ def run_cybersixgill(self): for pe_org in pe_orgs_final: list = list + pe_org["cyhy_db_name"] + "," print(list) - for pe_org in pe_orgs_final: + + for org_idx, pe_org in enumerate(pe_orgs_final): org_id = pe_org["cyhy_db_name"] pe_org_uid = pe_org["organizations_uid"] # Only run on specified orgs @@ -111,13 +113,14 @@ def run_cybersixgill(self): try: sixgill_org_id = sixgill_orgs[org_id][0] except KeyError as err: - LOGGER.error("PE org is not listed in Cybersixgill.") - print(err, file=sys.stderr) - failed.append("%s not in sixgill" % org_id) + LOGGER.warning(f"{org_id} is not registered in Cybersixgill") + # print(err, file=sys.stderr) + # failed.append("%s not in sixgill" % org_id) continue # Run alerts if "alerts" in method_list: + LOGGER.info(f"Fetching alert data for {org_id} ({org_idx+1} of {len(pe_orgs_final)})") if ( self.get_alerts( org_id, @@ -131,6 +134,7 @@ def run_cybersixgill(self): failed.append("%s alerts" % org_id) # Run mentions if "mentions" in method_list: + LOGGER.info(f"Fetching mention data for {org_id} ({org_idx+1} of {len(pe_orgs_final)})") if ( self.get_mentions( org_id, @@ -144,6 +148,7 @@ def run_cybersixgill(self): failed.append("%s mentions" % org_id) # Run credentials if "credentials" in method_list: + LOGGER.info(f"Fetching credential data for {org_id} ({org_idx+1} of {len(pe_orgs_final)})") if ( self.get_credentials( org_id, sixgill_org_id, pe_org_uid, source_uid @@ -158,7 +163,6 @@ def get_alerts( self, org_id, sixgill_org_id, pe_org_uid, source_uid, soc_med_included ): """Get alerts.""" - LOGGER.info("Fetching alert data for %s.", org_id) soc_med_platforms = [ "twitter", "Twitter", @@ -179,7 +183,7 @@ def get_alerts( ] # Fetch alert data with sixgill_org_id try: - print(sixgill_org_id) + # print(sixgill_org_id) alerts_df = alerts(sixgill_org_id) if not soc_med_included: alerts_df = alerts_df[~alerts_df["site"].isin(soc_med_platforms)] @@ -190,7 +194,7 @@ def get_alerts( # Rename columns alerts_df = alerts_df.rename(columns={"id": "sixgill_id"}) except Exception as e: - LOGGER.error("Failed fetching alert data for %s", org_id) + LOGGER.error("Failed fetching total alert count for %s", org_id) LOGGER.error(e) print(traceback.format_exc()) return 1 @@ -201,18 +205,18 @@ def get_alerts( # Fetch organization assets org_assets_dict = all_assets_list(sixgill_org_id) for alert_index, alert_row in alerts_df.iterrows(): - print(org_id) + # print(org_id) try: alert_id = alert_row["sixgill_id"] - content_snip, asset_mentioned, asset_type = get_alerts_content( - sixgill_org_id, alert_id, org_assets_dict - ) + # content_snip, asset_mentioned, asset_type = get_alerts_content( + # sixgill_org_id, alert_id, org_assets_dict + # ) alerts_df.at[alert_index, "content_snip"] = content_snip alerts_df.at[alert_index, "asset_mentioned"] = asset_mentioned alerts_df.at[alert_index, "asset_type"] = asset_type - except Exception: + except Exception as e: # LOGGER.error( # "Failed fetching a specific alert content for %s", org_id # ) @@ -241,7 +245,6 @@ def get_mentions( self, org_id, sixgill_org_id, pe_org_uid, source_uid, soc_med_included ): """Get mentions.""" - LOGGER.info("Fetching mention data for %s.", org_id) # Fetch org aliases from 
Cybersixgill try: aliases = alias_organization(sixgill_org_id) @@ -251,6 +254,7 @@ def get_mentions( LOGGER.error(e) return 1 + mentions_df = None # Fetch mention data try: if "dhs" in aliases: @@ -267,7 +271,7 @@ def get_mentions( if "st" in aliases: aliases.remove("st") if "nih" in aliases: - aliases.remove("noh") + aliases.remove("nih") if "blm" in aliases: aliases.remove("blm") if "ed" in aliases: @@ -315,20 +319,24 @@ def get_mentions( if "stb" in aliases: aliases.remove("stb") aliases.append("surface transportation") - try: - mentions_df = mentions(DATE_SPAN, aliases, soc_med_included) - except UnboundLocalError: - return 1 - mentions_df = mentions_df.rename(columns={"id": "sixgill_mention_id"}) - mentions_df["organizations_uid"] = pe_org_uid - # Add data source uid - mentions_df["data_source_uid"] = source_uid + # Retrieve mention data + mentions_df = mentions(DATE_SPAN, aliases, soc_med_included) except Exception as e: LOGGER.error("Failed fetching mentions for %s", org_id) print(traceback.format_exc()) LOGGER.error(e) return 1 + # Catch no mentions found situation: + if mentions_df.empty: + LOGGER.info(f"No mention data found for {org_id}, moving on") + return 0 + + # Format data + mentions_df = mentions_df.rename(columns={"id": "sixgill_mention_id"}) + mentions_df["organizations_uid"] = pe_org_uid + mentions_df["data_source_uid"] = source_uid + # Insert mention data into the PE database try: insert_sixgill_mentions(mentions_df) @@ -341,7 +349,6 @@ def get_mentions( def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid): """Get credentials.""" - LOGGER.info("Fetching credential data for %s.", org_id) # Fetch org root domains from Cybersixgill try: roots = root_domains(sixgill_org_id) @@ -351,24 +358,52 @@ def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid): LOGGER.error(e) return 1 + # Catch no root assets situation + if len(roots) == 0: + LOGGER.warning(f"{org_id} does not have any root domain assets in Cybersixgill") + return 0 + # Fetch credential data - try: - creds_df = creds(roots, START_DATE_TIME, END_DATE_TIME) - LOGGER.info("Found %s credentials.", len(creds_df.index)) - creds_df["organizations_uid"] = pe_org_uid - # Add data source uid - creds_df["data_source_uid"] = source_uid - except Exception as e: - LOGGER.error("Failed fetching credentials for %s", org_id) - LOGGER.error(e) - return 1 + if len(roots) > 100: + # Catch situation where an org has >100 roots in sixgill + LOGGER.info(f"{org_id} has more than 100 root assets in cybersixgill, breaking into chunks of 100...") + root_chunks = [roots[i:i + 100] for i in range(0, len(roots), 100)] + creds_df = pd.DataFrame() + for idx, chunk in enumerate(root_chunks): + try: + LOGGER.info(f"On chunk {idx+1} of {len(root_chunks)} for {org_id} credentials") + chunk_creds_df = creds(chunk, START_DATE_TIME, END_DATE_TIME) + LOGGER.info("Found %s credentials for this chunk", len(chunk_creds_df.index)) + chunk_creds_df["organizations_uid"] = pe_org_uid + chunk_creds_df["data_source_uid"] = source_uid + creds_df = creds_df.append(chunk_creds_df, ignore_index=True) + except Exception as e: + LOGGER.error("Failed fetching credential chunk for %s", org_id) + LOGGER.error(e) + return 1 + LOGGER.info("Found %s credentials in total", len(creds_df.index)) + else: + # Otherwise, just fetch credential data + try: + creds_df = creds(roots, START_DATE_TIME, END_DATE_TIME) + LOGGER.info("Found %s credentials.", len(creds_df.index)) + creds_df["organizations_uid"] = pe_org_uid + 
creds_df["data_source_uid"] = source_uid + except Exception as e: + LOGGER.error("Failed fetching credentials for %s", org_id) + LOGGER.error(e) + return 1 + + # Catch no credentials found situation + if creds_df.empty: + LOGGER.info("No credential findings for this org, moving on") + return 0 # Change empty and ambiguous breach names try: creds_df.loc[ creds_df["breach_name"] == "", "breach_name" ] = "Cybersixgill_" + creds_df["breach_id"].astype(str) - creds_df.loc[ creds_df["breach_name"] == "Automatic leaked credentials detection", "breach_name", @@ -405,7 +440,7 @@ def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid): ) creds_breach_df.drop(columns=["exposed_cred_count"], inplace=True) except Exception as e: - LOGGER.error("Probably no credential breaches for %s", org_id) + LOGGER.error("Error formatting credential breach data for %s", org_id) LOGGER.error(e) return 1 @@ -453,8 +488,7 @@ def get_credentials(self, org_id, sixgill_org_id, pe_org_uid, source_uid): def get_topCVEs(self, source_uid): """Get top CVEs.""" - LOGGER.info("Fetching top CVE data.") - + LOGGER.info(f"Fetching the top ten CVEs for the past report period") # Fetch top CVE data try: top_cve_df = top_cves(10) diff --git a/src/pe_source/data/pe_db/db_query_source.py b/src/pe_source/data/pe_db/db_query_source.py index d26c7091..14192d6a 100644 --- a/src/pe_source/data/pe_db/db_query_source.py +++ b/src/pe_source/data/pe_db/db_query_source.py @@ -2,6 +2,7 @@ # Standard Python Libraries from datetime import datetime +from decimal import Decimal import json import logging import socket @@ -21,14 +22,15 @@ from pe_reports.data.db_query import task_api_call # Setup logging to central file -LOGGER = app.config["LOGGER"] +LOGGER = logging.getLogger(__name__) CONN_PARAMS_DIC = config() CONN_PARAMS_DIC_STAGING = staging_config() # These need to filled with API key/url path in database.ini -pe_api_key = CONN_PARAMS_DIC_STAGING.get("pe_api_key") -pe_api_url = CONN_PARAMS_DIC_STAGING.get("pe_api_url") +API_DIC = staging_config(section="pe_api") +pe_api_key = API_DIC.get("pe_api_key") +pe_api_url = API_DIC.get("pe_api_url") def show_psycopg2_exception(err): @@ -154,6 +156,16 @@ def insert_sixgill_alerts(new_alerts): LOGGER.info( "Working on chunk " + str(chunk_ct) + " of " + str(len(chunked_list)) ) + # Check total content field data size for this chunk + total_content_size = sum([len(entry["content"]) for entry in chunk]) + # If total content field size is too big, trim content field for this chunk + if total_content_size > 400000: + LOGGER.warning("Excessive alert content data for this chunk, trimming...") + for entry in chunk: + over_limit = len(entry["content"]) > 2000 + entry["content"] = entry["content"][:2000] + if over_limit: + entry["content"] += "\n[content has been trimmed for space savings]" # Endpoint info task_url = "alerts_insert" status_url = "alerts_insert/task/" @@ -993,7 +1005,7 @@ def get_data_source_uid(source): source: The name of the specified data source Return: - UID for the specified data source + Data for the specified data source """ # Endpoint info endpoint_url = pe_api_url + "data_source_by_name" @@ -1071,8 +1083,8 @@ def getSubdomain(domain): try: result = requests.post(endpoint_url, headers=headers, data=data).json() # Process data and return - final_result = result[0] - return final_result + tup_result = [tuple(row.values()) for row in result] + return tup_result[0][0] except requests.exceptions.HTTPError as errh: LOGGER.error(errh) except requests.exceptions.ConnectionError as 
errc: @@ -1472,14 +1484,37 @@ def insert_shodan_vulns(dataframe, table, thread, org_name, failed): def get_ips(org_uid): """Get IP data.""" conn = connect() - sql = """SELECT wa.asset as ip_address - FROM web_assets wa - WHERE wa.organizations_uid = %(org_uid)s - and wa.report_on = True - """ - df = pd.read_sql(sql, conn, params={"org_uid": org_uid}) - ips = list(df["ip_address"].values) + sql1 = """SELECT i.ip_hash, i.ip, ct.network FROM ips i + JOIN cidrs ct on ct.cidr_uid = i.origin_cidr + JOIN organizations o on o.organizations_uid = ct.organizations_uid + where o.organizations_uid = %(org_uid)s + and i.origin_cidr is not null + and i.shodan_results is True + and i.current;""" + df1 = pd.read_sql(sql1, conn, params={"org_uid": org_uid}) + ips1 = list(df1["ip"].values) + + sql2 = """select i.ip_hash, i.ip + from ips i + join ips_subs is2 ON i.ip_hash = is2.ip_hash + join sub_domains sd on sd.sub_domain_uid = is2.sub_domain_uid + join root_domains rd on rd.root_domain_uid = sd.root_domain_uid + JOIN organizations o on o.organizations_uid = rd.organizations_uid + where o.organizations_uid = %(org_uid)s + and i.shodan_results is True + and sd.current + and i.current;""" + df2 = pd.read_sql(sql2, conn, params={"org_uid": org_uid}) + ips2 = list(df2["ip"].values) + + in_first = set(ips1) + in_second = set(ips2) + + in_second_but_not_in_first = in_second - in_first + + ips = ips1 + list(in_second_but_not_in_first) conn.close() + return ips @@ -2274,7 +2309,7 @@ def insert_shodan_data(dataframe, table, thread, org_name, failed): tpls, ) conn.commit() - logging.info( + LOGGER.info( "{} Data inserted using execute_values() successfully - {}".format( thread, org_name ) diff --git a/src/pe_source/data/shodan_db/shodan_search.py b/src/pe_source/data/shodan_db/shodan_search.py index 27aaeab3..286f9446 100644 --- a/src/pe_source/data/shodan_db/shodan_search.py +++ b/src/pe_source/data/shodan_db/shodan_search.py @@ -11,23 +11,24 @@ import shodan # cisagov Libraries -from pe_source.data.pe_db.db_query_source import ( +from pe_source.data.pe_db.db_query_source import ( # get_ips_dhs,; get_ips_hhs,; get_ips_nasa, get_data_source_uid, get_ips, insert_shodan_data, ) -# Setup logging LOGGER = logging.getLogger(__name__) def run_shodan_thread(api, org_chunk, thread_name): """Run a Shodan thread.""" failed = [] - for org in org_chunk: + warnings = [] + for org_idx, org in enumerate(org_chunk): org_name = org["cyhy_db_name"] org_uid = org["organizations_uid"] - LOGGER.info("{} Running IPs for {}".format(thread_name, org_name)) + # LOGGER.info("{} Running IPs for {}".format(thread_name, org_name)) + LOGGER.info(f"{thread_name}: Running Shodan on {org_name}") start, end = get_dates() try: ips = get_ips(org_uid) @@ -38,14 +39,17 @@ def run_shodan_thread(api, org_chunk, thread_name): continue if len(ips) == 0: - LOGGER.error("{} No IPs for {}.".format(thread_name, org_name)) - failed.append("{} has 0 IPs".format(org_name)) + LOGGER.warning("{} No IPs for {}.".format(thread_name, org_name)) + warnings.append("{} has 0 IPs".format(org_name)) continue failed = search_shodan( thread_name, ips, api, start, end, org_uid, org_name, failed ) + if len(warnings) > 0: + LOGGER.warning(f"{thread_name} Warnings: {warnings}") + if len(failed) > 0: LOGGER.critical("{} Failures: {}".format(thread_name, failed)) @@ -161,6 +165,9 @@ def search_shodan(thread_name, ips, api, start, end, org_uid, org_name, failed): "timestamp": d["timestamp"], "type": ftype, "is_verified": False, + "cpe": d.get("cpe", None), + "banner": d.get("data", 
None), + "version": d.get("version", None) } ) elif d["_shodan"]["module"] in risky_ports: @@ -202,6 +209,9 @@ def search_shodan(thread_name, ips, api, start, end, org_uid, org_name, failed): "timestamp": d["timestamp"], "type": ftype, "is_verified": False, + "cpe": d.get("cpe", None), + "banner": d.get("data", None), + "version": d.get("version", None) } ) @@ -221,7 +231,7 @@ def search_shodan(thread_name, ips, api, start, end, org_uid, org_name, failed): "tags": r["tags"], "timestamp": d["timestamp"], "country_code": location["country_code"], - "location": str(location), + "location": str(location) } ) diff --git a/src/pe_source/data/sixgill/api.py b/src/pe_source/data/sixgill/api.py index 5a728c1b..c1156e5f 100755 --- a/src/pe_source/data/sixgill/api.py +++ b/src/pe_source/data/sixgill/api.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """Cybersixgill API calls.""" # Standard Python Libraries import json @@ -13,12 +12,12 @@ # cisagov Libraries from pe_source.data.pe_db.config import cybersix_token -# Setup logging LOGGER = logging.getLogger(__name__) def get_sixgill_organizations(): """Get the list of organizations.""" + # Call Cybersixgill's /organization endpoint url = "https://api.cybersixgill.com/multi-tenant/organization" auth = cybersix_token() headers = { @@ -26,15 +25,26 @@ def get_sixgill_organizations(): "Cache-Control": "no-cache", "Authorization": "Bearer " + auth, } - orgs = requests.get(url, headers=headers).json() + orgs = requests.get(url, headers=headers) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while orgs.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /{endpoint_name} endpoint, attmept {retry_count+1} of {max_retries}") + time.sleep(time_delay) + orgs = requests.get(url, headers=headers) + retry_count += 1 + orgs = orgs.json() df_orgs = pd.DataFrame(orgs) df_orgs = df_orgs[["name", "organization_id"]] sixgill_dict = df_orgs.set_index("name").agg(list, axis=1).to_dict() + # Return results return sixgill_dict def org_assets(org_id): """Get organization assets.""" + # Call Cybersixgill's /assets endpoint url = f"https://api.cybersixgill.com/multi-tenant/organization/{org_id}/assets" auth = cybersix_token() headers = { @@ -43,22 +53,23 @@ def org_assets(org_id): "Authorization": "Bearer " + auth, } payload = {"organization_id": org_id} - count = 1 - while count < 7: - try: - resp = requests.get(url, headers=headers, params=payload).json() - break - except Exception: - time.sleep(5) - LOGGER.info("Error. 
Trying query post again...") - count += 1 - continue - resp = requests.get(url, headers=headers, params=payload).json() + resp = requests.get(url, headers=headers, params=payload) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /{endpoint_name} endpoint, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.get(url, headers=headers, params=payload) + retry_count += 1 + resp = resp.json() + # Return result return resp def intel_post(auth, query, frm, scroll, result_size): """Get intel items - advanced variation.""" + # Call Cybersixgill's /intel_items endpoint url = "https://api.cybersixgill.com/intel/intel_items" headers = { "Content-Type": "application/json", @@ -77,12 +88,23 @@ "recent_items": False, "safe_content_size": True, } - resp = requests.post(url, headers=headers, json=payload).json() + resp = requests.post(url, headers=headers, json=payload) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /{endpoint_name} endpoint, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.post(url, headers=headers, json=payload) + retry_count += 1 + resp = resp.json() + # Return result return resp def alerts_list(auth, organization_id, fetch_size, offset): """Get actionable alerts by ID using organization_id with optional filters.""" + # Call Cybersixgill's /actionable-alert endpoint url = "https://api.cybersixgill.com/alerts/actionable-alert" headers = { "Content-Type": "application/json", @@ -95,11 +117,26 @@ "offset": offset, } resp = requests.get(url, headers=headers, params=payload) + # Catch possible 400 error code + if resp.status_code == 400: + LOGGER.error("Received error code 400 from Cybersixgill's /actionable-alert endpoint") + LOGGER.error(resp.content) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /{endpoint_name} endpoint (code {resp.status_code}) for chunk at offset {offset}, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.get(url, headers=headers, params=payload) + retry_count += 1 + resp = resp.json() + # Return result return resp def alerts_count(auth, organization_id): """Get the total read and unread actionable alerts by organization.""" + # Call Cybersixgill's /count endpoint url = "https://api.cybersixgill.com/alerts/actionable_alert/count" headers = { "Content-Type": "application/json", @@ -107,12 +144,23 @@ "Authorization": "Bearer " + auth, } payload = {"organization_id": organization_id} - resp = requests.get(url, headers=headers, params=payload).json() + resp = requests.get(url, headers=headers, params=payload) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying 
Cybersixgill /{endpoint_name} endpoint, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.get(url, headers=headers, params=payload) + retry_count += 1 + resp = resp.json() + # Return result return resp def alerts_content(auth, organization_id, alert_id): """Get total alert content.""" + # Call Cybersixgill's /actionable_alert_content endpoint url = f"https://api.cybersixgill.com/alerts/actionable_alert_content/{alert_id}" headers = { "Content-Type": "application/json", @@ -120,7 +168,16 @@ "Authorization": "Bearer " + auth, } payload = {"organization_id": organization_id, "limit": 10000} - content = requests.get(url, headers=headers, params=payload).json() + content = requests.get(url, headers=headers, params=payload) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while content.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /actionable_alert_content endpoint, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + content = requests.get(url, headers=headers, params=payload) + retry_count += 1 + content = content.json() try: content = content["content"]["items"][0] if "_source" in content: @@ -132,13 +189,14 @@ except Exception as e: LOGGER.error("Failed getting content snip: %s", e) content = "" + # Return result return content def dve_top_cves(): """Retrieve the top 10 CVEs for this report period.""" + # Call Cybersixgill's /enrich endpoint url = "https://api.cybersixgill.com/dve_enrich/enrich" - # Used to be: https://api.cybersixgill.com/dve_enrich/top_cves auth = cybersix_token() headers = { "Content-Type": "application/json", @@ -148,36 +206,33 @@ data = json.dumps( { "filters": { - "sixgill_rating_range": {"from": 8, "to": 10}, + "sixgill_rating_range": {"from": 6, "to": 10}, }, "results_size": 10, "enriched": True, "from_index": 0, } ) - resp = requests.post(url, headers=headers, data=data).json() - - # old version needed sorting: - # sorted_values = sorted( - # resp["objects"], - # key=lambda x: x["score"]["sixgill"]["current"] - # if x["score"]["sixgill"]["current"] is not None - # else float("-inf"), - # reverse=True, - # ) - # top_10_cves = sorted_values[:10] - + resp = requests.post(url, headers=headers, data=data) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /{endpoint_name} endpoint, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.post(url, headers=headers, data=data) + retry_count += 1 + resp = resp.json() + # Sort and clean top CVE data result_list = resp.get("objects") clean_top_10_cves = [] for result in result_list: cve_id = result.get("name") - dynamic_rating = result.get("x_sixgill_info").get("score").get("current") + dynamic_rating = result.get("x_sixgill_info").get("rating").get("current") if result.get("x_sixgill_info").get("nvd").get("v3") is None: nvd_v3_score = None else: - nvd_v3_score = ( - result.get("x_sixgill_info").get("nvd").get("v3").get("current") - ) + nvd_v3_score = result.get("x_sixgill_info").get("nvd").get("v3").get("current") nvd_base_score = "{'v2': None, 'v3': " + str(nvd_v3_score) + "}" clean_cve = { 
"cve_id": cve_id, @@ -185,12 +240,12 @@ def dve_top_cves(): "nvd_base_score": nvd_base_score, } clean_top_10_cves.append(clean_cve) - + # Return result return clean_top_10_cves - def credential_auth(params): """Get data about a specific CVE.""" + # Call Cybersixgill's /leaks endpoint url = "https://api.cybersixgill.com/credentials/leaks" auth = cybersix_token() headers = { @@ -198,7 +253,17 @@ def credential_auth(params): "Cache-Control": "no-cache", "Authorization": "Bearer " + auth, } - resp = requests.get(url, headers=headers, params=params).json() + resp = requests.get(url, headers=headers, params=params) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying Cybersixgill /{endpoint_name} endpoint, attmept {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.get(url, headers=headers, params=params) + retry_count += 1 + resp = resp.json() + # Return result return resp @@ -234,24 +299,29 @@ def setOrganizationUsers(org_id): id_role1 = "610017c216948d7efa077a52" csg_role_id = "role_id" csg_user_id = "user_id" + for user in getUserInfo(): userrole = user[csg_role_id] user_id = user[csg_user_id] + if ( (userrole == role1) and (user_id != id_role1) or userrole == role2 and user_id != id_role1 ): + url = ( f"https://api.cybersixgill.com/multi-tenant/organization/" f"{org_id}/user/{user_id}?role_id={userrole}" ) + headers = { "Content-Type": "application/json", "Cache-Control": "no-cache", "Authorization": f"Bearer {cybersix_token()}", } + response = requests.post(url, headers=headers).json() LOGGER.info("The response is %s", response) @@ -270,11 +340,13 @@ def setOrganizationDetails(org_id, orgAliases, orgDomain, orgIP, orgExecs): } ) url = f"https://api.cybersixgill.com/multi-tenant/" f"organization/{org_id}/assets" + headers = { "Content-Type": "application/json", "Cache-Control": "no-cache", "Authorization": f"Bearer {cybersix_token()}", } + response = requests.put(url, headers=headers, data=newOrganizationDetails).json() LOGGER.info("The response is %s", response) @@ -282,12 +354,15 @@ def setOrganizationDetails(org_id, orgAliases, orgDomain, orgIP, orgExecs): def getUserInfo(): """Get all organization details from Cybersixgill via API.""" url = "https://api.cybersixgill.com/multi-tenant/organization" + headers = { "Content-Type": "application/json", "Cache-Control": "no-cache", "Authorization": f"Bearer {cybersix_token()}", } + response = requests.get(url, headers=headers).json() + userInfo = response[1]["assigned_users"] return userInfo @@ -304,6 +379,7 @@ def get_bulk_cve_resp(cve_list): Raw API response for CVE list """ + # Call Cybersixgill's /enrich endpoint c6g_url = "https://api.cybersixgill.com/dve_enrich/enrich" auth = cybersix_token() headers = { @@ -316,11 +392,15 @@ def get_bulk_cve_resp(cve_list): "results_size": len(cve_list), "from_index": 0, } - # Make API call for specified CVE list - try: - # Attempt API call - resp = requests.post(c6g_url, headers=headers, json=body).json() - # Return response - return resp - except Exception as e: - LOGGER.error("Error making bulk CVE API call: %s", e) + resp = requests.post(c6g_url, headers=headers, json=body) + # Retry clause in case Cybersixgill's API falters + retry_count, max_retries, time_delay = 0, 10, 5 + while resp.status_code != 200 and retry_count < max_retries: + endpoint_name = url.split('/')[-1] + LOGGER.warning(f"Retrying 
Cybersixgill /{endpoint_name} endpoint, attempt {retry_count+1} of {max_retries}") + time.sleep(time_delay) + resp = requests.post(c6g_url, headers=headers, json=body) + retry_count += 1 + resp = resp.json() + # Return results + return resp diff --git a/src/pe_source/data/sixgill/source.py b/src/pe_source/data/sixgill/source.py index d0391232..6146608f 100644 --- a/src/pe_source/data/sixgill/source.py +++ b/src/pe_source/data/sixgill/source.py @@ -22,7 +22,6 @@ org_assets, ) -# Setup logging LOGGER = logging.getLogger(__name__) @@ -59,6 +58,7 @@ def root_domains(org_id): def mentions(date, aliases, soc_media_included=False): """Pull dark web mentions data for an organization.""" token = cybersix_token() + # Build the query using the org's aliases mentions = "" for mention in aliases: @@ -78,19 +78,22 @@ linkedin, Linkedin, discord, forum_discord, raddle, telegram, jabber, ICQ, icq, mastodon)""" ) + # Get the total number of mentions - count = 1 - while count < 7: - try: - LOGGER.info("Total mentions try #%s", count) - resp = intel_post(token, query, frm=0, scroll=False, result_size=1) - break - except Exception: - LOGGER.info("Error. Trying to get mentions count again...") - count += 1 - continue - total_mentions = resp["total_intel_items"] + try: + LOGGER.info("Retrieving total number of mentions") + resp = intel_post(token, query, frm=0, scroll=False, result_size=1) + total_mentions = resp["total_intel_items"] + except Exception as e: + LOGGER.error("Total mentions count retrieval failed") + LOGGER.error(e) + LOGGER.info("Total Mentions: %s", total_mentions) + + # Catch situation where org has 0 mentions + if total_mentions == 0: + return pd.DataFrame() + # Fetch mentions in segments # Recommended segment is 50. The maximum is 400. i = 0 @@ -141,13 +144,13 @@ smaller_segment_count = 1 else: segment_size = 10 - LOGGER.error( + LOGGER.warning( "Failed 3 times. Switching to a segment size of %s", segment_size, ) try_count = 1 continue - LOGGER.error("Try %s/3 failed.", try_count) + LOGGER.warning("Mentions segment retrieval failed, try %s/3", try_count) try_count += 1 return df_all_mentions @@ -159,25 +162,39 @@ def alerts(org_id): LOGGER.info(count) count_total = count["total"] LOGGER.info("Total Alerts: %s", count_total) + # Recommended "fetch_size" is 25. The maximum is 400. 
- fetch_size = 25 + fetch_size = 50 all_alerts = [] - + df_all_alerts = pd.DataFrame() + token_refresh_counter = 1 for offset in range(0, count_total, fetch_size): + # Keep API auth token refreshed (they expire after 30min) + if token_refresh_counter % 100 == 0: + # Set to refresh every 100 chunks + # this needs to be adjusted depending on chunk size + LOGGER.warning("API auth token refreshed due to long alert retrieval time...") + token = cybersix_token() + token_refresh_counter += 1 + # Retrieve alert data for this chunk try: - resp = alerts_list(token, org_id, fetch_size, offset).json() + print(f"Working on alert chunk at offset {offset} out of {count_total}") + resp = alerts_list(token, org_id, fetch_size, offset) df_alerts = pd.DataFrame.from_dict(resp) + df_alerts.drop(columns=["sub_alerts"], inplace=True) # large unused data field all_alerts.append(df_alerts) df_all_alerts = pd.concat(all_alerts).reset_index(drop=True) except Exception as e: - print(e) - print("HAD TO CONTINUE THROUGH ALERT CHUNK") + LOGGER.error(f"Issue fetching alert data chunk at offset: {offset}") + LOGGER.error(e) continue + # Fetch the full content of each alert # for i, r in df_all_alerts.iterrows(): # print(r["id"]) # content = alerts_content(org_id, r["id"]) - # df_all_alerts.at[i, "content"] = content + # df_all_alerts.at[i, "content"] + return df_all_alerts diff --git a/src/pe_source/dnsmonitor.py b/src/pe_source/dnsmonitor.py index bc064cd2..49109586 100644 --- a/src/pe_source/dnsmonitor.py +++ b/src/pe_source/dnsmonitor.py @@ -19,15 +19,14 @@ getSubdomain, ) -# Setup logging -LOGGER = logging.getLogger(__name__) - NOW = datetime.datetime.now() DAYS_BACK = datetime.timedelta(days=20) DAY = datetime.timedelta(days=1) START_DATE = NOW - DAYS_BACK END_DATE = NOW + DAY +LOGGER = logging.getLogger(__name__) + class DNSMonitor: """Fetch DNSMonitor data.""" @@ -70,32 +69,33 @@ def run_dnsMonitor(self): domain_df = get_monitored_domains(token) failed = [] + warnings = [] # Iterate through each org - for org in pe_orgs_final: + for org_idx, org in enumerate(pe_orgs_final): org_name = org["name"] org_uid = org["organizations_uid"] org_code = org["cyhy_db_name"] - LOGGER.info("\nRunning DNSMonitor on %s", org_code) + LOGGER.info(f"Running DNSMonitor on {org_code} ({org_idx+1} of {len(pe_orgs_final)})") # Get respective domain IDs domain_ids = domain_df[domain_df["org"] == org_name] - LOGGER.info("Found %s root domains being monitored.", len(domain_ids)) + LOGGER.info(f"Found {len(domain_ids)} root domains being monitored for {org_code}") domain_ids = str(domain_ids["domainId"].tolist()) # Get Alerts for a specific org based on the list of domain IDs if domain_ids == "[]": - LOGGER.error("Can't match org to any domains...") - failed.append(f"{org_code} - No domains") + LOGGER.warning(f"No domains being monitored by DNSMonitor for {org_code}") + warnings.append(f"{org_code} - No domains being monitored") continue else: alerts_df = get_domain_alerts(token, domain_ids, START_DATE, END_DATE) LOGGER.info("Fetched %s alerts.", len(alerts_df.index)) - # If no alerts, continue - if alerts_df.empty: - LOGGER.error("No alerts for %s", org_code) - failed.append(f"{org_code} - No alerts") - continue + # If no alerts, continue + if alerts_df.empty: + LOGGER.warning(f"No DNSMonitor alerts found for {org_code}") + warnings.append(f"{org_code} - No alerts found") + continue for alert_index, alert_row in alerts_df.iterrows(): # Get subdomain_uid @@ -107,7 +107,7 @@ def run_dnsMonitor(self): root_domain, ) try: - 
addSubdomain(root_domain, org_uid, True) # api ver. + addSubdomain(root_domain, org_uid, True) # api ver. # addSubdomain(conn, root_domain, org_uid, True) # tsql ver. LOGGER.info( "Success adding %s to subdomain table.", root_domain @@ -164,13 +164,13 @@ def run_dnsMonitor(self): subset=["domain_permutation"], keep="last" ) try: - execute_dnsmonitor_data(dom_perm_df) # api ver. + execute_dnsmonitor_data(dom_perm_df) # api ver. # execute_dnsmonitor_data(dom_perm_df, "domain_permutations") # tsql ver. - # LOGGER.info("Success inserting into domain_permutations - %s", org_code) # tsql ver. + # LOGGER.info("Success inserting into domain_permutations - %s", org_code) except Exception as e: LOGGER.error("Failed inserting into domain_permutations - %s", org_code) LOGGER.error(e) - failed.append(f"{org_code} - Failed inserting into dom_perms") + failed.append(f"{org_code} - Failed inserting into domain_permutations") # Format dataframe and insert into domain_alerts table alerts_df = alerts_df.rename(columns={"date_observed": "date"}) @@ -187,14 +187,28 @@ def run_dnsMonitor(self): ] ] try: - execute_dnsmonitor_alert_data(domain_alerts) # api ver. + execute_dnsmonitor_alert_data(domain_alerts) # api ver. # execute_dnsmonitor_alert_data(domain_alerts, "domain_alerts") # tsql ver. - # LOGGER.info("Success inserting into domain_alerts - %s", org_code) # tsql ver. + # LOGGER.info("Success inserting into domain_alerts - %s", org_code) except Exception as e: LOGGER.error("Failed inserting into domain_alerts - %s", org_code) LOGGER.error(e) - failed.append(f"{org_code} - Failed inserting into dom_alerts") + failed.append(f"{org_code} - Failed inserting into domain_alerts") + + # Output any warnings + if len(warnings) > 0: + LOGGER.warning("Warnings: %s", warnings) # Output any failures if len(failed) > 0: LOGGER.error("Failures: %s", failed) + + # Output summary stats + num_no_domain_monitor = sum('No domains being monitored' in s for s in warnings) + num_no_alerts = sum('No alerts found' in s for s in warnings) + num_success = len(pe_orgs_final) - num_no_domain_monitor - num_no_alerts - len(failed) + num_fail = len(failed) + LOGGER.info(f"{num_no_domain_monitor}/{len(pe_orgs_final)} orgs do not have domains being monitored by DNSMonitor") + LOGGER.info(f"{num_no_alerts}/{len(pe_orgs_final)} orgs have domains being monitored, but didn't have any new alerts") + LOGGER.info(f"{num_success}/{len(pe_orgs_final)} orgs had new DNSMonitor findings and successfully added them to the database") + LOGGER.info(f"{num_fail}/{len(pe_orgs_final)} orgs had a significant failure during the DNSMonitor scan") diff --git a/src/pe_source/dnstwistscript.py b/src/pe_source/dnstwistscript.py index 717cb6a7..b7ae80ca 100644 --- a/src/pe_source/dnstwistscript.py +++ b/src/pe_source/dnstwistscript.py @@ -5,6 +5,7 @@ import json import logging import pathlib +import time import traceback # Third-Party Libraries @@ -22,9 +23,8 @@ org_root_domains, ) -# Setup logging -LOGGER = logging.getLogger(__name__) date = datetime.datetime.now().strftime("%Y-%m-%d") +LOGGER = logging.getLogger(__name__) def checkBlocklist(dom, sub_domain_uid, source_uid, pe_org_uid, perm_list): @@ -150,7 +150,7 @@ def execute_dnstwist(root_domain, test=0): if test == 1: return dnstwist_result finalorglist = dnstwist_result + [] - if root_domain.split(".")[-1] == "gov": + if root_domain.split(".")[-1] == "gov": for dom in dnstwist_result: if ( ("tld-swap" not in dom["fuzzer"]) @@ -161,7 +161,7 @@ def execute_dnstwist(root_domain, test=0): and ("insertion" not in 
dom["fuzzer"]) and ("transposition" not in dom["fuzzer"]) ): - LOGGER.info("Running again on %s", dom["domain"]) + LOGGER.info("\tRunning again on %s", dom["domain"]) secondlist = dnstwist.run( registered=True, tld=pathtoDict, @@ -201,31 +201,45 @@ def run_dnstwist(orgs_list): continue failures = [] - for org in pe_orgs_final: + # total_dnstwist_time = datetime.timedelta(seconds=0) # exe time testing + # total_blocklist_time = datetime.timedelta(seconds=0) # exe time testing + # total_insertion_time = datetime.timedelta(seconds=0) # exe time testing + for org_idx, org in enumerate(pe_orgs_final): pe_org_uid = org["organizations_uid"] org_name = org["name"] pe_org_id = org["cyhy_db_name"] # Only run on orgs in the org list if pe_org_id in orgs_list or orgs_list == "all" or orgs_list == "DEMO": - LOGGER.info("Running DNSTwist on %s", pe_org_id) + LOGGER.info(f"Running DNSTwist on {pe_org_id} ({org_idx+1} of {len(pe_orgs_final)})") """Collect DNSTwist data from Crossfeed""" try: # Get root domains - root_dict = org_root_domains(PE_conn, pe_org_uid) + # root_dict = org_root_domains(PE_conn, pe_org_uid) # TSQL ver. + root_dict = org_root_domains(pe_org_uid) # API ver. + + # Convert to deduped list of root domains + list_of_roots = [d['root_domain'] for d in root_dict] + list_of_roots = [s.strip() for s in list_of_roots] + list_of_roots = list(set(list_of_roots)) + LOGGER.info(f"{len(list_of_roots)} roots found for {pe_org_id}") + domain_list = [] perm_list = [] - for root in root_dict: - root_domain = root["root_domain"] + for root in list_of_roots: + root_domain = root if root_domain == "Null_Root": continue - LOGGER.info("\tRunning on root domain: %s", root["root_domain"]) - + LOGGER.info("Running DNSTwist on root domain: %s", root) + # t1_1 = time.time() # exe time testing with open( "dnstwist_output.txt", "w" ) as f, contextlib.redirect_stdout(f): finalorglist = execute_dnstwist(root_domain) + # t1_2 = time.time() # exe time testing + LOGGER.info(f"\tFinished running DNSTwist on root domain: {root}") + # total_dnstwist_time += datetime.timedelta(seconds=(t1_2 - t1_1)) # exe time testing # Get subdomain uid sub_domain = root_domain @@ -235,17 +249,26 @@ def run_dnstwist(orgs_list): # TODO: Create custom exceptions. # Issue 265: https://github.com/cisagov/pe-reports/issues/265 # Add and then get it - addSubdomain(sub_domain, pe_org_uid, True) # api ver. + addSubdomain(sub_domain, pe_org_uid, True) # api ver. # addSubdomain(PE_conn, sub_domain, pe_org_uid, True) # tsql ver. sub_domain_uid = getSubdomain(sub_domain) # Check Blocklist - for dom in finalorglist: + LOGGER.info(f"\tRunning blocklist check on the DNSTwist results from root domain: {root}") + # t2_1 = time.time() # exe time testing + for dom_idx, dom in enumerate(finalorglist): + domain_name = dom.get("domain") + print(f"Running blocklist check on {domain_name} ({dom_idx+1}/{len(finalorglist)})") domain_dict, perm_list = checkBlocklist( dom, sub_domain_uid, source_uid, pe_org_uid, perm_list ) if domain_dict is not None: domain_list.append(domain_dict) + # t2_2 = time.time() # exe time testing + LOGGER.info(f"\tFinished running blocklist check on the DNSTwist results from root domain: {root}") + # total_blocklist_time += datetime.timedelta(seconds=(t2_2 - t2_1)) # exe time testing + # print(f"\tCurrent dnstwist time total: {total_dnstwist_time}") # exe time testing + # print(f"\tCurrent blocklist time total: {total_blocklist_time}") # exe time testing except Exception: # TODO: Create custom exceptions. 
# Issue 265: https://github.com/cisagov/pe-reports/issues/265 @@ -254,6 +277,7 @@ def run_dnstwist(orgs_list): LOGGER.info(traceback.format_exc()) """Insert cleaned data into PE database.""" + # t3_1 = time.time() # exe time testing try: cursor = PE_conn.cursor() try: @@ -291,11 +315,20 @@ def run_dnstwist(orgs_list): LOGGER.info("Failure inserting data into database.") failures.append(org_name) LOGGER.info(traceback.format_exc()) + # t3_2 = time.time() # exe time testing + # total_insertion_time = datetime.timedelta(seconds=(t3_2 - t3_1)) # exe time testing + + # print(f"DNSTwist summary for {pe_org_id}") # exe time testing + # print(f"DNSTwist time: {str(total_dnstwist_time)} (H:M:S)") # exe time testing + # print(f"Blocklist time: {str(total_blocklist_time)} (H:M:S)") # exe time testing + # print(f"DB insert time: {str(total_insertion_time)} (H:M:S)") # exe time testing + + LOGGER.info(f"{len(pe_orgs_final) - len(failures)}/{len(pe_orgs_final)} orgs successfully underwent the DNSTwist scan") + LOGGER.info(f"{len(failures)}/{len(pe_orgs_final)} orgs had a significant failure during the DNSTwist scan") PE_conn.close() if failures != []: - LOGGER.error("These orgs failed:") - LOGGER.error(failures) + LOGGER.error("These orgs failed: %s", failures) if __name__ == "__main__": diff --git a/src/pe_source/intelx_identity.py b/src/pe_source/intelx_identity.py index 537837c8..5462f29b 100644 --- a/src/pe_source/intelx_identity.py +++ b/src/pe_source/intelx_identity.py @@ -28,11 +28,11 @@ END_DATE = TODAY.strftime("%Y-%m-%d %H:%M:%S") # Get data source uid SOURCE_UID = get_data_source_uid("IntelX") + section = "intelx" params = get_params(section) api_key = params[0][1] -# Setup logging LOGGER = logging.getLogger(__name__) @@ -69,19 +69,26 @@ def run_intelx(self): else: continue - for pe_org in pe_orgs_final: + success = 0 + failed = 0 + for org_idx, pe_org in enumerate(pe_orgs_final): cyhy_org_id = pe_org["cyhy_db_name"] pe_org_uid = pe_org["organizations_uid"] # Verify the org is in the list of orgs to scan if cyhy_org_id in orgs_list or orgs_list == "all" or orgs_list == "DEMO": + LOGGER.info(f"Running IntelX on {cyhy_org_id} ({org_idx+1} of {len(pe_orgs_final)})") if self.get_credentials(cyhy_org_id, pe_org_uid) == 1: LOGGER.error("Failed to get credentials for %s", cyhy_org_id) + failed += 1 + else: + success += 1 + + LOGGER.info(f"IntelX scan ran successfully for {success}/{len(pe_orgs_final)} organizations") + + def get_credentials(self, cyhy_org_id, pe_org_uid): """Get credentials for a provided org.""" - LOGGER.info("Running IntelX on %s", cyhy_org_id) - try: conn = connect() roots_df = get_root_domains(conn, pe_org_uid) @@ -307,4 +314,4 @@ def process_leaks_results(self, leaks_json, org_uid): ] ] - return creds_df, breaches_df + return creds_df, breaches_df \ No newline at end of file diff --git a/src/pe_source/nist_update.py b/src/pe_source/nist_update.py index 5317622f..b50b370a 100644 --- a/src/pe_source/nist_update.py +++ b/src/pe_source/nist_update.py @@ -7,7 +7,12 @@ import time # Third-Party Libraries -from data.pe_db.db_query_source import api_cve_insert, get_cve_and_products +from data.pe_db.db_query_source import ( + query_all_cves, + api_cve_insert, + get_cve_and_products ) + from nested_lookup import nested_lookup import pytz import requests diff --git a/src/pe_source/pe_scripts.py b/src/pe_source/pe_scripts.py index 39499b3f..eb5a98c8 100644 --- a/src/pe_source/pe_scripts.py +++ b/src/pe_source/pe_scripts.py @@ -28,9 +28,11 @@ # Standard Python Libraries import logging
import sys +import time from typing import Any, Dict # Third-Party Libraries +from datetime import timedelta import docopt from schema import And, Schema, SchemaError, Use @@ -45,7 +47,6 @@ from .pshtt_wrapper import launch_pe_pshtt from .shodan_wrapper import Get_shodan -# Setup logging LOGGER = logging.getLogger(__name__) @@ -55,27 +56,60 @@ def run_pe_script(source, orgs_list, cybersix_methods, soc_med_included): if orgs_list != "all" and orgs_list != "DEMO": orgs_list = orgs_list.split(",") # If not "all", separate Cybersixgill methods string into a list + sixgill_scan_name = cybersix_methods.title() if cybersix_methods == "all": cybersix_methods = ["alerts", "mentions", "credentials", "topCVEs"] else: cybersix_methods = cybersix_methods.split(",") - LOGGER.info("Running %s on these orgs: %s", source, orgs_list) + # LOGGER.info("Running %s on these orgs: %s", source, orgs_list) if source == "cybersixgill": + if sixgill_scan_name == "Topcves": + sixgill_scan_name = "Top CVEs" + LOGGER.info(f"--- Cybersixgill {sixgill_scan_name} Scan Starting ---") + LOGGER.info(f"Running Cybersixgill {sixgill_scan_name} on these orgs: {orgs_list}") + sixgill_start_time = time.time() cybersix = Cybersixgill(orgs_list, cybersix_methods, soc_med_included) cybersix.run_cybersixgill() + sixgill_end_time = time.time() + LOGGER.info(f"Execution time for Cybersixgill {sixgill_scan_name} scan: {str(timedelta(seconds=(sixgill_end_time - sixgill_start_time)))} (H:M:S)") + LOGGER.info(f"--- Cybersixgill {sixgill_scan_name} Scan Complete ---") elif source == "shodan": + LOGGER.info("--- Shodan Scan Starting ---") + LOGGER.info(f"Running Shodan on these orgs: {orgs_list}") + shodan_start_time = time.time() shodan = Get_shodan(orgs_list) shodan.run_shodan() + shodan_end_time = time.time() + LOGGER.info(f"Execution time for Shodan scan: {str(timedelta(seconds=(shodan_end_time - shodan_start_time)))} (H:M:S)") + LOGGER.info("--- Shodan Scan Complete ---") elif source == "dnsmonitor": + LOGGER.info("--- DNSMonitor Scan Starting ---") + LOGGER.info(f"Running DNSMonitor on these orgs: {orgs_list}") + dnsmonitor_start_time = time.time() dnsMonitor = DNSMonitor(orgs_list) dnsMonitor.run_dnsMonitor() + dnsmonitor_end_time = time.time() + LOGGER.info(f"Execution time for DNSMonitor scan: {str(timedelta(seconds=(dnsmonitor_end_time - dnsmonitor_start_time)))} (H:M:S)") + LOGGER.info("--- DNSMonitor Scan Complete ---") elif source == "dnstwist": + LOGGER.info("--- DNSTwist Scan Starting ---") + LOGGER.info(f"Running DNSTwist on these orgs: {orgs_list}") + dnstwist_start_time = time.time() run_dnstwist(orgs_list) + dnstwist_end_time = time.time() + LOGGER.info(f"Execution time for DNSTwist scan: {str(timedelta(seconds=(dnstwist_end_time - dnstwist_start_time)))} (H:M:S)") + LOGGER.info("--- DNSTwist Scan Complete ---") elif source == "intelx": + LOGGER.info("--- IntelX Scan Starting ---") + LOGGER.info(f"Running IntelX on these orgs: {orgs_list}") + intelx_start_time = time.time() intelx = IntelX(orgs_list) intelx.run_intelx() + intelx_end_time = time.time() + LOGGER.info(f"Execution time for IntelX scan: {str(timedelta(seconds=(intelx_end_time - intelx_start_time)))} (H:M:S)") + LOGGER.info("--- IntelX Scan Complete ---") elif source == "pshtt": launch_pe_pshtt() else: diff --git a/src/pe_source/pshtt_wrapper.py b/src/pe_source/pshtt_wrapper.py index 54f0456b..441f0a71 100644 --- a/src/pe_source/pshtt_wrapper.py +++ b/src/pe_source/pshtt_wrapper.py @@ -6,7 +6,7 @@ import threading # Third-Party Libraries -import data.pshtt.utils 
as utils +from .data.pshtt import utils import numpy as np from .data.pe_db.db_query_source import api_pshtt_domains_to_run, api_pshtt_insert diff --git a/src/pe_source/shodan_wrapper.py b/src/pe_source/shodan_wrapper.py index 498c3582..d2e441e6 100644 --- a/src/pe_source/shodan_wrapper.py +++ b/src/pe_source/shodan_wrapper.py @@ -1,15 +1,20 @@ """Collect Shodan data.""" # Standard Python Libraries +import logging import threading +import time # Third-Party Libraries +from datetime import timedelta import numpy from .data.pe_db.config import shodan_api_init from .data.pe_db.db_query_source import get_orgs from .data.shodan_db.shodan_search import run_shodan_thread +# Logging +LOGGER = logging.getLogger(__name__) class Get_shodan: """Fetch Shodan data.""" @@ -68,4 +73,4 @@ def run_shodan(self): # Wait until all threads finish to continue for thread in thread_list: - thread.join() + thread.join() \ No newline at end of file
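# Illustrative sketch, not part of the patch above: the fixed-delay retry pattern used by the
# new Cybersixgill request wrapper, pulled out into a standalone helper. The helper name
# `get_with_retries` and its default values are assumptions for illustration only.
import time

import requests


def get_with_retries(url, headers=None, params=None, max_retries=10, time_delay=5):
    """GET a URL, retrying on non-200 responses with a fixed delay between attempts."""
    resp = requests.get(url, headers=headers, params=params)
    retry_count = 0
    while resp.status_code != 200 and retry_count < max_retries:
        retry_count += 1
        time.sleep(time_delay)  # give a briefly unavailable endpoint time to recover
        resp = requests.get(url, headers=headers, params=params)
    return resp.json()  # caller handles the case where every attempt still failed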
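# Illustrative sketch, not part of the patch above: the offset-based paging with periodic
# auth-token refresh used in the reworked alerts() retrieval loop. `fetch_chunk` and
# `refresh_token` are hypothetical stand-ins for the real Cybersixgill API calls.
import logging

import pandas as pd

LOGGER = logging.getLogger(__name__)


def collect_in_chunks(fetch_chunk, refresh_token, total, fetch_size=50, refresh_every=100):
    """Fetch `total` records in chunks, refreshing the token every `refresh_every` chunks."""
    token = refresh_token()
    frames = []
    for chunk_idx, offset in enumerate(range(0, total, fetch_size), start=1):
        if chunk_idx % refresh_every == 0:
            # Long retrievals outlive the token's lifetime, so refresh it periodically
            token = refresh_token()
        try:
            frames.append(pd.DataFrame(fetch_chunk(token, fetch_size, offset)))
        except Exception as err:
            LOGGER.error("Issue fetching chunk at offset %s: %s", offset, err)
            continue
    return pd.concat(frames).reset_index(drop=True) if frames else pd.DataFrame()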
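# Illustrative sketch, not part of the patch above: the root-domain clean-up performed before
# the DNSTwist run -- strip whitespace, drop duplicates, and skip the "Null_Root" placeholder.
# The function name `clean_root_domains` is assumed for illustration only.
def clean_root_domains(root_dict):
    """Return a deduplicated, sorted list of root-domain strings from API records."""
    roots = {record["root_domain"].strip() for record in root_dict}
    roots.discard("Null_Root")  # placeholder used when an org has no usable root domain
    return sorted(roots)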
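# Illustrative sketch, not part of the patch above: the elapsed-time logging added around each
# source scan in pe_scripts.py, expressed as a small context manager so the
# time.time()/timedelta bookkeeping is not repeated per scan. `timed_scan` is an assumed name.
import logging
import time
from contextlib import contextmanager
from datetime import timedelta

LOGGER = logging.getLogger(__name__)


@contextmanager
def timed_scan(scan_name):
    """Log start, completion, and H:M:S duration of a scan."""
    LOGGER.info("--- %s Scan Starting ---", scan_name)
    start = time.time()
    try:
        yield
    finally:
        elapsed = timedelta(seconds=(time.time() - start))
        LOGGER.info("Execution time for %s scan: %s (H:M:S)", scan_name, str(elapsed))
        LOGGER.info("--- %s Scan Complete ---", scan_name)


# Hypothetical usage:
# with timed_scan("Shodan"):
#     Get_shodan(orgs_list).run_shodan()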