Various logging improvements, ASM summary tweaks, and scan fixes
aloftus23 committed Jun 10, 2024
1 parent b402cb0 commit 9e8166a
Showing 22 changed files with 589 additions and 289 deletions.
64 changes: 40 additions & 24 deletions src/pe_mailer/email_reports.py
@@ -28,13 +28,16 @@
import os
import re
import sys
import time
from typing import Any, Dict

# Third-Party Libraries
import boto3
from botocore.exceptions import ClientError
import docopt
import pymongo.errors
from schema import And, Schema, SchemaError, Use
import yaml

# cisagov Libraries
import pe_reports
@@ -44,7 +47,6 @@
from .pe_message import PEMessage
from .stats_message import StatsMessage

# Setup logging
LOGGER = logging.getLogger(__name__)
MAILER_AWS_PROFILE = "cool-dns-sessendemail-cyber.dhs.gov"
MAILER_ARN = os.environ.get("MAILER_ARN")
@@ -192,11 +194,11 @@ def send_message(ses_client, message, counter=None):


def send_pe_reports(ses_client, pe_report_dir, to):
"""
Send out Posture and Exposure reports.
"""Send out Posture and Exposure reports.
Parameters
----------
ses_client : boto3.client
The boto3 SES client via which the message is to be sent.
@@ -227,22 +229,25 @@ def send_pe_reports(ses_client, pe_report_dir, to):
try:
# The directory must contain one usable report
cyhy_agencies = len(pe_orgs)
LOGGER.info(f"{cyhy_agencies} agencies found in P&E.")
LOGGER.info(f"Running report mailer for {cyhy_agencies} organizations")
1 / cyhy_agencies
except ZeroDivisionError:
LOGGER.critical("No report data is found in %s", pe_report_dir)
LOGGER.critical("No report data was found in %s", pe_report_dir)
sys.exit(1)
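
Note: the bare `1 / cyhy_agencies` expression above is a deliberate guard — it raises ZeroDivisionError when no organizations came back, which the except block turns into a clean exit. A more explicit sketch of the same guard (editorial, not part of this commit):

import logging
import sys

LOGGER = logging.getLogger(__name__)

def require_report_data(pe_orgs, pe_report_dir):
    """Exit cleanly when no organizations, and therefore no reports, were found."""
    if not pe_orgs:
        LOGGER.critical("No report data was found in %s", pe_report_dir)
        sys.exit(1)
    return len(pe_orgs)
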

staging_conn = connect()
# org_contacts = get_orgs_contacts(staging_conn) # old tsql ver.
org_contacts = get_orgs_contacts() # api ver.

agencies_emailed_pe_reports = 0
reports_not_mailed = 0
# Iterate over cyhy_requests, if necessary
if pe_report_dir:
for org in pe_orgs:
id = org[2]
if id == "GSEC":
LOGGER.warning(f"The PDF report for {org[2]} was intentionally set to not be mailed")
reports_not_mailed += 1
continue
if to is not None:
to_emails = to
@@ -274,9 +279,10 @@ def send_pe_reports(ses_client, pe_report_dir, to):

# At most one Cybex report and CSV should match
if len(pe_report_filenames) > 2:
LOGGER.warning("More than two PDF reports found")
LOGGER.warning(f"More than two encrypted PDF reports found for {org[2]}")
elif not pe_report_filenames:
LOGGER.error("No PDF report found")
LOGGER.warning(f"No encrypted PDF report found for {org[2]}, no report will be mailed")
reports_not_mailed += 1
continue

if pe_report_filenames:
@@ -306,10 +312,11 @@ def send_pe_reports(ses_client, pe_report_dir, to):
pe_report_filename, pe_asm_filename, report_date, id, to_emails
)

print(to_emails)
print(pe_report_filename)
print(pe_asm_filename)
print(report_date)
print("Recipient: ", to_emails)
print("Report Date: ", report_date)
print("Report File:", pe_report_filename)
print("ASM Summary File", pe_asm_filename, "\n")


try:
agencies_emailed_pe_reports = send_message(
@@ -325,7 +332,8 @@ def send_pe_reports(ses_client, pe_report_dir, to):

# Print out and log some statistics
pe_stats_string = f"Out of {cyhy_agencies} agencies with Posture and Exposure reports, {agencies_emailed_pe_reports} ({100.0 * agencies_emailed_pe_reports / cyhy_agencies:.2f}%) were emailed."
LOGGER.info(pe_stats_string)
mail_summary_log_string = f"{agencies_emailed_pe_reports}/{cyhy_agencies} reports were mailed, {reports_not_mailed}/{cyhy_agencies} reports were not mailed"
LOGGER.info(mail_summary_log_string)

return pe_stats_string

@@ -339,19 +347,20 @@ def send_reports(pe_report_dir, summary_to, test_emails):
return 1

# Assume role to use mailer
sts_client = boto3.client("sts")
assumed_role_object = sts_client.assume_role(
RoleArn=MAILER_ARN, RoleSessionName="AssumeRoleSession1"
sts_client = boto3.client('sts')
assumed_role_object=sts_client.assume_role(
RoleArn=MAILER_ARN,
RoleSessionName="AssumeRoleSession1"
)
credentials = assumed_role_object["Credentials"]
credentials=assumed_role_object['Credentials']

ses_client = boto3.client(
"ses",
ses_client = boto3.client("ses",
region_name="us-east-1",
aws_access_key_id=credentials["AccessKeyId"],
aws_secret_access_key=credentials["SecretAccessKey"],
aws_session_token=credentials["SessionToken"],
aws_access_key_id=credentials['AccessKeyId'],
aws_secret_access_key=credentials['SecretAccessKey'],
aws_session_token=credentials['SessionToken']
)
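
Both versions of this hunk follow the standard STS assume-role pattern: trade a role ARN for temporary credentials, then build the SES client from them. A self-contained sketch of that pattern (editorial; role ARN, session name, and region are placeholders):

import boto3

def get_ses_client(role_arn, region="us-east-1"):
    """Assume an IAM role and return an SES client using its temporary credentials."""
    sts = boto3.client("sts")
    creds = sts.assume_role(
        RoleArn=role_arn, RoleSessionName="MailerSession"
    )["Credentials"]
    return boto3.client(
        "ses",
        region_name=region,
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )
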


# Email the summary statistics, if necessary
if test_emails is not None:
@@ -380,6 +389,8 @@ def send_reports(pe_report_dir, summary_to, test_emails):

def main():
"""Send emails."""
LOGGER.info("--- PE Report Mailing Starting ---")
start_time = time.time()
# Parse command line arguments
args: Dict[str, str] = docopt.docopt(__doc__, version=__version__)

@@ -416,7 +427,7 @@ def main():
level=log_level.upper(),
)

LOGGER.info("Sending Posture & Exposure Reports, Version : %s", __version__)
LOGGER.info("Posture & Exposure Report Mailer, Version : %s", __version__)

send_reports(
# TODO: Improve use of schema to validate arguments.
@@ -426,5 +437,10 @@ def main():
validated_args["--test-emails"],
)

end_time = time.time()
LOGGER.info(f"Execution time for PE report mailing: {str(datetime.timedelta(seconds=(end_time - start_time)))} (H:M:S)")
LOGGER.info("--- PE Report Mailing Complete ---")

# Stop logging and clean up
logging.shutdown()
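
The new timing lines format elapsed seconds with `datetime.timedelta`; the imports hunk above only adds `import time`, so `datetime` presumably comes from an import outside this diff. A standalone sketch of the pattern (editorial, not part of this commit):

import datetime
import time

start_time = time.time()
time.sleep(1.5)  # stand-in for the report-mailing work
elapsed = datetime.timedelta(seconds=(time.time() - start_time))
print(f"Execution time for PE report mailing: {elapsed} (H:M:S)")
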

47 changes: 22 additions & 25 deletions src/pe_reports/asm_generator.py
@@ -7,24 +7,25 @@
import os

# Third-Party Libraries
from PyPDF2 import PdfFileReader, PdfFileWriter
import fitz
from PyPDF2 import PdfFileReader, PdfFileWriter
import numpy as np
import pandas as pd

# from reportlab.lib.enums import TA_CENTER
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.units import inch
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas
from reportlab.platypus import Frame, Paragraph
from reportlab.lib.enums import TA_CENTER
from reportlab.lib.units import inch


# cisagov Libraries
from pe_reports.data.db_query import (
query_cidrs_by_org,
query_extra_ips,
query_foreign_IPs,
query_extra_ips,
query_ports_protocols,
query_roots,
query_software,
@@ -35,17 +36,14 @@
LOGGER = logging.getLogger(__name__)

BASE_DIR = os.path.abspath(os.path.dirname(__file__))
IN_FILEPATH = BASE_DIR + "/assets_asm/attack_surface_empty.pdf"
ON_PAGE_INDEX = 0
UNDERNEATH = (
False # if True, new content will be placed underneath page (painted first)
)

pdfmetrics.registerFont(TTFont("Frank_Goth", BASE_DIR + "/fonts/FranklinGothic.ttf"))
pdfmetrics.registerFont(
TTFont("Frank_Goth", BASE_DIR + "/assets_asm/FranklinGothic.ttf")
)
pdfmetrics.registerFont(
TTFont("Frank_Goth_Book", BASE_DIR + "/assets_asm/Franklin_Gothic_Book_Regular.ttf")
TTFont("Frank_Goth_Book", BASE_DIR + "/fonts/Franklin_Gothic_Book_Regular.ttf")
)
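
For context, registering a TTF with reportlab and drawing with it looks like the following sketch (editorial; font name, file paths, and coordinates are placeholders):

from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.pdfgen import canvas

# Any TTF on disk works; the registered name is what setFont() refers to.
pdfmetrics.registerFont(TTFont("Frank_Goth", "fonts/FranklinGothic.ttf"))
can = canvas.Canvas("out.pdf")
can.setFont("Frank_Goth", 12)
can.drawString(72, 720, "Attack Surface Summary")
can.save()
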


@@ -108,7 +106,7 @@ def add_stat_frame(current_value, last_value, x, y, width, height, style, can):

def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx):
"""Create and add JSON attachment."""
LOGGER.info("Creating attachment")
LOGGER.info("Creating ASM attachments")
# Create ASM Excel file
asmWriter = pd.ExcelWriter(asm_xlsx, engine="xlsxwriter")

@@ -119,16 +117,14 @@ def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx):
cidr_dict = cidr_df["network"].to_list()

# Extra IPs
LOGGER.info("Getting extra IPs")
ip_lst = query_extra_ips(org_uid)
ips_df = pd.DataFrame(ip_lst, columns=["ip"])
ips_df.to_excel(asmWriter, sheet_name="Extra IPs", index=False)
ips_dict = ips_df["ip"].to_list()
LOGGER.info("Finished extra IPs")

# Ports/protocols
ports_protocols_df = query_ports_protocols(org_uid)
ports_protocols_df.to_excel(asmWriter, sheet_name="Ports_Protocols", index=False)
ports_protocols_df.to_excel(asmWriter, sheet_name="Ports Protocols", index=False)
ports_protocols_dict = ports_protocols_df.to_dict(orient="records")

# Root domains
@@ -139,9 +135,12 @@ def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx):

# Sub-domains
sd_df = query_subs(org_uid)
sd_df = sd_df[["sub_domain"]]
sd_df.to_excel(asmWriter, sheet_name="Sub-domains", index=False)
sd_dict = sd_df["sub_domain"].to_list()
# sd_df = sd_df[["sub_domain"]]
#sd_df = sd_df[["sub_domain", "origin_root_domain", "pe_discovered_asset"]]
sd_df = sd_df[["sub_domain", "origin_root_domain"]]
sd_df.to_excel(asmWriter, sheet_name="Subdomains", index=False)
# sd_dict = sd_df["sub_domain"].to_list()
sd_dict = sd_df.to_dict(orient="records")
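
Each block in add_attachment writes one query result per worksheet through a shared pandas ExcelWriter; a minimal sketch of that pattern with stand-in frames (editorial, not part of this commit):

import pandas as pd

# Hypothetical frames standing in for the per-org query results.
cidrs_df = pd.DataFrame({"network": ["10.0.0.0/24", "192.0.2.0/24"]})
subs_df = pd.DataFrame(
    {"sub_domain": ["a.example.gov"], "origin_root_domain": ["example.gov"]}
)

with pd.ExcelWriter("asm_summary.xlsx", engine="xlsxwriter") as writer:
    cidrs_df.to_excel(writer, sheet_name="CIDRs", index=False)
    subs_df.to_excel(writer, sheet_name="Subdomains", index=False)
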

# Software
soft_df = query_software(org_uid)
@@ -205,9 +204,7 @@ def add_attachment(org_uid, final_output, pdf_file, asm_json, asm_xlsx):
return asm_xlsx


def create_summary(
org_uid, final_output, data_dict, file_name, json_filename, excel_filename
):
def create_summary(org_uid, final_output, data_dict, file_name, json_filename, excel_filename):
"""Create ASM summary PDF."""
packet = io.BytesIO()

@@ -310,8 +307,8 @@ def create_summary(
can,
)
json_title_frame = Frame(
6 * inch, 100, 1.5 * inch, 0.5 * inch, id=None, showBoundary=0
)
6 * inch, 100, 1.5 * inch, 0.5 * inch, id=None, showBoundary=0
)
json_title = Paragraph(
"JSON      EXCEL",
style=json_excel,
@@ -324,7 +321,7 @@ def create_summary(
new_pdf = PdfFileReader(packet)

# Read existing PDF template
existing_pdf = PdfFileReader(open(BASE_DIR + "/assets_asm/empty_asm.pdf", "rb"))
existing_pdf = PdfFileReader(open(BASE_DIR + "/assets_asm/empty_asm_2024-04-15.pdf", "rb"))
output = PdfFileWriter()

# Add the "watermark" (which is the new pdf) on the existing page
@@ -341,5 +338,5 @@ def create_summary(
asm_xlsx = add_attachment(
org_uid, final_output, file_name, json_filename, excel_filename
)

return asm_xlsx
return asm_xlsx
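
create_summary draws the dynamic content into an in-memory PDF with reportlab and stamps it onto the template page via PyPDF2's 1.x API, which this file still uses. A condensed sketch (editorial; file paths are placeholders):

import io

from PyPDF2 import PdfFileReader, PdfFileWriter
from reportlab.pdfgen import canvas

# Draw the dynamic content into an in-memory PDF.
packet = io.BytesIO()
can = canvas.Canvas(packet)
can.drawString(72, 720, "Overlay content")
can.save()
packet.seek(0)

overlay = PdfFileReader(packet)
template = PdfFileReader(open("empty_asm.pdf", "rb"))
page = template.getPage(0)
page.mergePage(overlay.getPage(0))  # stamp the overlay onto the template page

output = PdfFileWriter()
output.addPage(page)
with open("asm_summary.pdf", "wb") as out_file:
    output.write(out_file)
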
37 changes: 22 additions & 15 deletions src/pe_reports/data/db_query.py
@@ -29,8 +29,9 @@
CONN_PARAMS_DIC_STAGING = staging_config()

# These need to be filled with API key/url path in database.ini
pe_api_key = CONN_PARAMS_DIC_STAGING.get("pe_api_key")
pe_api_url = CONN_PARAMS_DIC_STAGING.get("pe_api_url")
API_DIC = staging_config(section="pe_api")
pe_api_url = API_DIC.get("pe_api_url")
pe_api_key = API_DIC.get("pe_api_key")


def task_api_call(task_url, check_url, data={}, retry_time=3):
Expand All @@ -55,24 +56,36 @@ def task_api_call(task_url, check_url, data={}, retry_time=3):
create_task_url, headers=headers, data=data
).json()
task_id = create_task_result.get("task_id")
LOGGER.info("Created task for", task_url, "query, task_id: ", task_id)
LOGGER.info("Created task for " + task_url + " query, task_id: " + task_id)
check_task_url += task_id
while task_status != "Completed" and task_status != "Failed":
# Ping task status endpoint and get status
check_task_resp = requests.get(check_task_url, headers=headers).json()
# check_task_resp = requests.get(check_task_url, headers=headers).json()
check_task_resp = requests.get(check_task_url, headers=headers)
#print(check_task_resp)
check_task_resp = check_task_resp.json()
task_status = check_task_resp.get("status")
LOGGER.info("\tPinged", check_url, "status endpoint, status:", task_status)
LOGGER.info(
"\tPinged " + check_url + " status endpoint, status: " + task_status
)
time.sleep(retry_time)
except requests.exceptions.HTTPError as errh:
LOGGER.error(errh)
print(errh)
except requests.exceptions.ConnectionError as errc:
LOGGER.error(errc)
print(errc)
except requests.exceptions.Timeout as errt:
LOGGER.error(errt)
print(errt)
except requests.exceptions.RequestException as err:
LOGGER.error(err)
print(err)
except json.decoder.JSONDecodeError as err:
LOGGER.error(err)
print(err)
except Exception as err:
print(err)
# Once task finishes, return result
if task_status == "Completed":
return check_task_resp.get("result")
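
task_api_call implements a create-then-poll workflow against the P&E API; stripped to its core, the pattern looks like this sketch (editorial; endpoint and response field names are assumptions):

import time

import requests

def poll_task(create_url, status_url, headers, retry_time=3):
    """Create an API task, then poll its status endpoint until it finishes."""
    task_id = requests.post(create_url, headers=headers).json()["task_id"]
    status, resp = None, {}
    while status not in ("Completed", "Failed"):
        resp = requests.get(status_url + task_id, headers=headers).json()
        status = resp.get("status")
        time.sleep(retry_time)
    return resp.get("result") if status == "Completed" else None
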
@@ -2080,7 +2093,6 @@ def query_subs(org_uid):
Return:
All the subdomains belonging to the specified org as a dataframe
"""
start_time = time.time()
total_num_pages = 1
page_num = 1
total_data = []
@@ -2100,21 +2112,16 @@ def query_subs(org_uid):
page_num += 1
# Once all data has been retrieved, return overall dataframe
total_data = pd.DataFrame.from_dict(total_data)
LOGGER.info(
"Total time to retrieve all subdomains for this org: "
+ str(time.time() - start_time)
)
# Process data and return
total_data.rename(
columns={
"root_domain_uid_id": "root_domain_uid",
"data_source_uid_id": "data_source_uid",
"dns_record_uid_id": "dns_record_uid",
"root_domain_uid__root_domain": "origin_root_domain",
"identified": "pe_discovered_asset",
},
inplace=True,
)
total_data["first_seen"] = pd.to_datetime(total_data["first_seen"]).dt.date
total_data["last_seen"] = pd.to_datetime(total_data["last_seen"]).dt.date
# total_data["first_seen"] = pd.to_datetime(total_data["first_seen"]).dt.date
# total_data["last_seen"] = pd.to_datetime(total_data["last_seen"]).dt.date
# Return truly empty dataframe if no results
if total_data[total_data.columns].isnull().apply(lambda x: all(x), axis=1)[0]:
total_data.drop(total_data.index, inplace=True)
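
query_subs pages through the API until total_num_pages is exhausted and collects the rows into one DataFrame; a compact sketch of the pagination loop (editorial; request and response fields are assumptions):

import pandas as pd
import requests

def fetch_all_pages(url, headers, org_uid, page_size=500):
    """Collect every page of a paginated endpoint into one DataFrame."""
    page, total_pages, rows = 1, 1, []
    while page <= total_pages:
        payload = {"org_uid": org_uid, "page": page, "per_page": page_size}
        resp = requests.post(url, headers=headers, json=payload).json()
        rows.extend(resp.get("data", []))
        total_pages = resp.get("total_pages", 1)
        page += 1
    return pd.DataFrame.from_dict(rows)
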
Binary file added src/pe_reports/fonts/FranklinGothic.ttf
