# osintanalyzers/malwareanalyzer.py

from .osintbase import *
from analyzerbase import *


class MalwareAnalyzer(OSINTAnalyzerBase):
    """Class for analyzing malware samples with OSINT sources"""

    # Names of the OSINT sources handled by this class. Each name has matching
    # check_<source>, reduce_<source> and count_<source> methods below, and the
    # same five names are the default `sources` / `max_errors` keys in __init__.
    SOURCES = ["malwarebazaar", 
                "urlhaus",
                "threatfox",
                "malpedia",
                "exploitdb",
               ]
    
    # Tags that are skipped when looking up Malpedia names: this list is the
    # default `ignore_tags` of get_malpedia_names_from_tags(), which subtracts
    # it from the incoming tags before querying ThreatFox. The entries are
    # generic platform / architecture / file-type / category labels rather than
    # family names. Matching is an exact, case-sensitive set difference, which
    # is why some entries appear in more than one capitalisation.
    IGNORE_TAGS = [
        "c2", 'elf', 'win', '32', '64', "win32", "win64", '32-bit', '64-bit', '32bit', '64bit',
        "linux", "linux64", "windows", "mac", "macos", "macosx", "macosx64", "macosx32", "macos64", "macos32",
        "android", "arm", "exe", "dll", "elf32", "elf64", "elf32le", "elf32be", "elf64le", "elf64be", "apk",
        "censys", "jar", "RAT", "rat", "trojan", "Trojan", "backdoor", "Backdoor", "loader", "Loader", 
        "downloader", "Downloader", "dropper", "Dropper", "SELECTL", "Supershell", "bash", "ssh", "sh", "shell"
        ]
    

    def __init__(self,
                 db_path=Path("tests/mwdb"),
                 selenium_webdriver_type="chrome",
                 webdriver_path="/Users/lucasfaudman/Documents/SANS/internship/chromedriver",
                 sources=None,
                 max_errors=None) -> None:
        """
        Configure the analyzer by forwarding every argument to the base class.

        db_path, selenium_webdriver_type, webdriver_path:
            Passed to the base class initializer unchanged.
            NOTE(review): the webdriver_path default is an absolute path on one
            developer's machine; callers on other machines presumably need to
            pass their own value.
        sources:
            Names of the sources to use. When omitted, a fresh copy of SOURCES
            is used.
        max_errors:
            Dict mapping a source name to an integer limit (its exact meaning
            is defined by the base class). When omitted, every name in SOURCES
            is mapped to 2.
        """

        # The defaults are built per call instead of living in the signature:
        # a list/dict default is created once and shared by every instance, so
        # any in-place change made by the base class would leak between
        # analyzers.
        if sources is None:
            sources = list(self.SOURCES)
        if max_errors is None:
            max_errors = {source: 2 for source in self.SOURCES}

        super().__init__(db_path, selenium_webdriver_type, webdriver_path, sources, max_errors)


    def get_malpedia_names_from_tags(self, tags, ignore_tags=IGNORE_TAGS):
        """
        Get Malpedia names from tags via ThreatFox tag search. 
        This is needed for AI to query Malpedia for malware info about a given result
        but is not present in Abuse.ch API results other than a Tag Search
        """

        malpedia_names = []
        query_tags = set(tags) - set(ignore_tags)
        tags_data = self.get_data(query_tags, arg_type="tag", sources=["threatfox"])
        
        for tag in query_tags:
            for tag_info in tags_data.get(tag, {})["threatfox"].get("results",{}).get("data",[]):
                if tag_info.get("malware_malpedia"):
                    malpedia_names.append(tag_info.get("malware_malpedia", "").split("/")[-1])
 
        return list(set(malpedia_names)) 


    def check_malpedia(self, malpedia_name, arg_type="malpedia_name", timeout=30):
        """
        Get data for malware with malpedia_name from Malpedia.

        Fetches the Malpedia details page for the name and reads the content of
        its <meta name="description"> tag.

        malpedia_name: Malpedia family identifier used in the details URL.
        arg_type:      unused here; kept so the signature matches the other
                       check_<source> methods.
        timeout:       seconds passed to requests.get so a stalled connection
                       cannot block forever.

        Returns the output template for the URL with results["description"]
        set on success, or "error" set when the page could not be fetched or
        has no description.
        """

        malpedia_url = f"https://malpedia.caad.fkie.fraunhofer.de/details/{malpedia_name}"
        response = requests.get(malpedia_url, timeout=timeout)

        output = self.get_output_template(malpedia_url)

        # Do not parse an HTTP error page as if it described the family.
        if not response.ok:
            output["error"] = f"ERROR: HTTP {response.status_code}"
            return output

        soup = BeautifulSoup(response.text, "html.parser")
        description = soup.find("meta", {"name": "description"})
        content = description.get("content") if description else None

        if content:
            output["results"]["description"] = content
        else:
            output["error"] = "ERROR: No description found"

        return output


    def check_exploitdb(self, arg, arg_type="search_text"):
        
        headers = {
            "User-Agent": "XMLHttpRequest",
            "X-Requested-With": "XMLHttpRequest",
        } 
        
        
        if arg_type.startswith("search"):
            search_param = arg_type.split("_")[-1]
            url = f"https://www.exploit-db.com/search?{search_param}={arg}"
            
            response = requests.get(url, headers=headers)
            data = response.json().get('data', [])
            
            output = self.get_output_template(url, default_results=[])
            for entry in data:
                output["results"].append({
                    "exploit_id": entry.get("id"),
                    "title": entry.get("description", {})[1].replace("&#039;", "'"),
                    "type": entry.get("type", {}).get("name"),
                    "platform": entry.get("platform", {}).get("platform"),
                    "date_published": entry.get("date_published"),
                    "url": f"https://www.exploit-db.com/exploits/{entry.get('id')}",
                })

            return output
            
        elif arg_type == "exploitdb_id":
            url = f"https://www.exploit-db.com/exploits/{arg}"

            response = requests.get(url, headers=headers)
            soup = BeautifulSoup(response.text, "html.parser")

            output = self.get_output_template(url)
            code = soup.find("code")
            if code:
                output["results"] = code.text
            else:
                output["error"] = "ERROR: No code found"

            return output
        

    def _check_abusech(self, api_baseurl="", api_endpoint="", query_type="", query_key="", query_value="", body_format="data", timeout=30):
        """
        Handler for all abuse.ch API calls.

        POSTs {query_key: query_value} (plus "query": query_type when
        query_type is non-empty) to https://<api_baseurl><api_endpoint>.

        body_format: name of the requests.post keyword that carries the
                     payload; callers in this class use "data" and "json".
        timeout:     seconds passed to requests.post.

        Returns the output template with "results" set to the decoded JSON
        response (minus its "query_status" field) when query_status is "ok".
        Otherwise "error" is set to "ERROR: <query_status>".

        NOTE(review): abuse.ch has announced that its APIs require an Auth-Key
        header; this handler sends none. Confirm whether unauthenticated
        requests are still accepted.
        """

        apiurl = 'https://' + api_baseurl + api_endpoint
        data = {query_key: query_value}
        if query_type:
            data["query"] = query_type

        response = requests.post(apiurl, timeout=timeout, **{body_format: data})
        response_json = response.json()

        output = self.get_output_template()

        # A response without "query_status" is reported as an error instead of
        # raising KeyError.
        query_status = response_json.pop("query_status", None)
        if query_status != "ok":
            reason = "missing query_status" if query_status is None else query_status
            output["error"] = f"ERROR: {reason}"
            return output

        output["results"] = response_json
        return output


    def check_malwarebazaar(self, fhash, arg_type="hash"):
        """Get data for malware hash from MalwareBazaar"""

        if "hash" not in arg_type:
            raise ValueError(f"Invalid arg_type: {arg_type}")

        output = self._check_abusech(
            api_baseurl="mb-api.abuse.ch/api/v1/", 
            api_endpoint="", 
            query_type="get_info", 
            query_key="hash", 
            query_value=fhash,
            body_format="data"
            )

        if output.get("results"):
            output["results"] = output["results"]["data"][0]
            output["sharing_link"] = f"https://bazaar.abuse.ch/sample/{fhash}"

            # Add malpedia_names from tags to results
            if output["results"].get("tags"):
                output["results"]["malpedia_names"] = self.get_malpedia_names_from_tags(output["results"]["tags"])

        return output


    def check_urlhaus(self, arg, arg_type="host"):
        """Get data for url or host from URLhaus"""

        if arg_type == "url":
            api_endpoint="url"
            query_key="url"
        
        elif arg_type in ["host", "ip", "domain"]:
            api_endpoint="host"
            query_key="host"

        else:
            raise ValueError(f"Invalid arg_type: {arg_type}")

        output = self._check_abusech(
            api_baseurl="urlhaus-api.abuse.ch/v1/",
            api_endpoint=api_endpoint,
            query_type="",
            query_key=query_key,
            query_value=arg,
            body_format="data")
        
        if output.get("results"):
            output["sharing_link"] = output["results"].pop("urlhaus_reference", "")                    

            
            if output["results"].get("tags"):
                # Add malpedia_names from tags to results
                output["results"]["malpedia_names"] = self.get_malpedia_names_from_tags(output["results"]["tags"])

            
            for url_item in output["results"].get("urls", []):
                if url_item.get("tags"):
                    # Add malpedia_names from tags to to each url subitem in results with tags
                    url_item["malpedia_names"] = self.get_malpedia_names_from_tags(url_item["tags"])

        return output
    

    def check_threatfox(self, arg, arg_type="hash"):
        """Get data for hash, ip, domain, url or tag from ThreatFox"""

        if "hash" in arg_type:
            query_type = "search_hash"
            query_key = "hash"
            url_key = "ioc"
        
        elif arg_type in ["tag", "malware", "malware_name", "malware_family"]:
            arg_type = arg_type.split("_")[0]
            query_type = arg_type + "info"
            query_key = arg_type
            url_key = arg_type

        elif arg_type in ["ioc", "ip", "domain", "url", "host", "ip:port"]:
            query_type = "search_ioc"
            query_key = "search_term"
            url_key = "ioc"

        else:
            raise ValueError(f"Invalid arg_type: {arg_type}")

        output = self._check_abusech(
            api_baseurl="threatfox-api.abuse.ch/api/v1/",
            api_endpoint="",
            query_type=query_type,
            query_key=query_key,
            query_value=arg,
            body_format="json")
        
        if output.get("results"):
            output["sharing_link"] = f"https://threatfox.abuse.ch/browse.php?search={url_key}%3A{arg}"

            
            if output["results"].get("tags"):
                # Add malpedia_names from tags to results
                output["results"]["malpedia_names"] = self.get_malpedia_names_from_tags(output["results"]["tags"])                   

        return output
    

    def download_remote_file(self, url, timeout=30):
        """
        Download file from remote URL and return as bytes.

        url:     address to fetch with an HTTP GET.
        timeout: seconds passed to requests.get; without it a stalled server
                 would block the caller indefinitely.

        Returns the raw response body. The HTTP status is not checked, so an
        error page's body is returned like any other content.
        """
        print(f"Fetching {url}")
        response = requests.get(url, timeout=timeout)
        return response.content
    
    
    def get_urlhaus_download(self, url):
        """Get file from URLhaus download link for url"""
        urlhaus_results = self.get_data(args=[url],arg_type="url", sources=['urlhaus'])
        urlhaus_download_url = urlhaus_results.get(url, {}).get("urlhaus", {}).get("results", {}) \
                            .get("payloads", [{}])[0].get("urlhaus_download", "")

        if urlhaus_download_url:
            content = self.download_remote_file(urlhaus_download_url)
            if b'"query_status": "not_found"' in content:
                print("ERROR: URLhaus download link query status not_found")
            else:
                return content
        
        return None
    
    
    def reduce_malwarebazaar(self, results):
        """
        Reduce malwarebazaar results to only relevant fields to reduce tokens before passing to AI model.
        Also renames fields to be more verbose to improve AI comprehension.
        Based on analyzerbase.utils.recursive_pop()

        results: MalwareBazaar results structure to reduce.
        Returns whatever recursive_pop() returns for the remove/replace key
        sets below.
        """
        # Keys dropped from the results: hashes, reporter details, fuzzy
        # hashes, signing info and links.
        remove_keys = ["sha256_hash", "sha3_384_hash", "sha1_hash", "md5_hash",
                       "reporter", "origin_country", "anonymous",
                       "imphash", "tlsh", "telfhash", "gimphash",
                       "ssdeep", "dhash_icon", "archive_pw",
                       # The comma after "reference" was missing, which fused it
                       # with "twitter_handle" so that neither key was removed.
                       "code_sign", "author", "reference",
                       "twitter_handle", "display_name",
                       "link", "analysis_url", "report_link",
                       ]

        # Old key -> more descriptive key.
        replace_keys = {
            "file_information": "file_context_info",
            "signature": "malware_family",
            "clamav": "clamav_signatures",
            "uploads": "times_uploaded_to_malwarebazaar",
            "downloads": "times_downloaded_from_malwarebazaar",
        }

        reduced_results = recursive_pop(results, remove_keys=remove_keys, replace_keys=replace_keys)
        return reduced_results
    

    def reduce_urlhaus(self, results):
        """
        Reduce urlhaus results to only relevant fields to reduce tokens before passing to AI model.
        Also renames fields to be more verbose to improve AI comprehension.
        Based on analyzerbase.utils.recursive_pop()

        results: URLhaus results structure to reduce.
        Returns whatever recursive_pop() returns for the remove/replace key
        sets and the value replacements below.
        """

        # Keys dropped from the results: hashes, reporter details, fuzzy
        # hashes, signing info, links and URLhaus-specific references.
        remove_keys = ["sha256_hash", "sha3_384_hash", "sha1_hash", "md5_hash",
                       "reporter", "origin_country", "anonymous",
                       "imphash", "tlsh", "telfhash", "gimphash",
                       "ssdeep", "dhash_icon", "archive_pw",
                       # The comma after "reference" was missing, which fused it
                       # with "twitter_handle" so that neither key was removed.
                       "code_sign", "author", "reference",
                       "twitter_handle", "display_name",
                       "link", "analysis_url", "report_link",
                       "urlhaus_reference", "urlhaus_download",
                       "response_md5", "response_sha256",
                       ]

        # Old key -> more descriptive key.
        replace_keys = {
            "firstseen": "first_seen",
            "lastseen": "last_seen",
            "larted": "url_reported_to_hosting_provider",
            "signature": "malware_family",
            "blacklists": "blacklist_statuses",
        }

        # Short status values -> full sentences.
        url_status_vals = {
            "online": "URL is online and currently serving a payload",
            "offline": "URL is offline and no longer serving a payload",
        }

        spamhaus_vals = {
            "spammer_domain": "URL is a known spammer domain",
            "phishing_domain": "URL is a known phishing domain",
            "botnet_cc_domain": "URL is a known botnet C&C domain",
            "abused_legit_spam": "URL is a known compromised website used for spammer hosting",
            "abused_legit_malware": "URL is a known compromised website used for malware distribution",
            "abused_legit_phishing": "URL is a known compromised website used for phishing hosting",
            "abused_legit_botnetcc": "URL is a known botnet C&C hosting",
            "abused_redirector": "URL is a known abused redirector or URL shortener",
            "not listed": None,
        }

        reduced_results = recursive_pop(results,
                                        remove_keys=remove_keys,
                                        replace_keys=replace_keys,
                                        replace_values={**spamhaus_vals, **url_status_vals}
                                        )

        return reduced_results

        
    def reduce_threatfox(self, results):
        """
        Reduce threatfox results to only relevant fields to reduce tokens before passing to AI model.
        Also renames fields to be more verbose to improve AI comprehension.
        Based on analyzerbase.utils.recursive_pop()

        results: ThreatFox results structure to reduce.
        Returns whatever recursive_pop() returns for the remove/replace key
        sets below.

        NOTE(review): get_malpedia_names_from_tags() reads "malware_malpedia"
        from get_data() output. Confirm that get_data() does not hand back
        results already passed through this method, since that key is now
        actually removed here.
        """
        # The comma after "reference" was missing, which fused it with
        # "malware_malpedia" so that neither key was removed.
        remove_keys = ["md5_hash", "reporter", "credits",
                       "malware_bazaar", "reference",
                       "malware_malpedia"]

        replace_keys = {"malware": "malpedia_name"}

        reduced_results = recursive_pop(results, remove_keys=remove_keys, replace_keys=replace_keys)
        return reduced_results
    

    def reduce_malpedia(self, results):
        """
        Reduce malpedia results before they are passed to the AI model.

        No keys are removed or renamed for this source: results goes through
        recursive_pop() with that helper's default arguments and the outcome
        is returned.
        """

        return recursive_pop(results)


    def reduce_exploitdb(self, results):
        """
        Reduce exploitdb results before they are passed to the AI model.

        The only key removed is "url"; nothing is renamed. The work is done
        by recursive_pop(), whose return value is passed back unchanged.
        """

        keys_to_drop = ["url"]
        return recursive_pop(results, remove_keys=keys_to_drop)

    
    def count_malwarebazaar(self, data, malware):
        """
        Placeholder that keeps the check/count/reduce_{source} interface complete.

        Counting is not implemented yet: data is returned unchanged and
        malware is not used.
        """
        return data
    def count_urlhaus(self, data, malware):
        """
        Placeholder that keeps the check/count/reduce_{source} interface complete.

        Counting is not implemented yet: data is returned unchanged and
        malware is not used.
        """
        return data
    def count_threatfox(self, data, malware):
        """
        Placeholder that keeps the check/count/reduce_{source} interface complete.

        Counting is not implemented yet: data is returned unchanged and
        malware is not used.
        """
        return data
    def count_malpedia(self, data, malware):
        """
        Placeholder that keeps the check/count/reduce_{source} interface complete.

        Counting is not implemented yet: data is returned unchanged and
        malware is not used.
        """
        return data
    def count_exploitdb(self, data, malware):
        """
        Placeholder that keeps the check/count/reduce_{source} interface complete.

        Counting is not implemented yet: data is returned unchanged and
        malware is not used.
        """
        return data
        

# Running this file directly does nothing; the guard is a placeholder.
if __name__ == '__main__':
    pass