adding mistToJson

marcoramilli · Apr 21, 2019 · 96d6974 · 96d6974
1 parent 6b30900
commit 96d6974
Show file tree

Hide file tree

Showing 3 changed files with 229 additions and 11 deletions.
diff --git a/README.md b/README.md
@@ -2,8 +2,7 @@
 Please check it out: https://marcoramilli.com/2016/12/16/malware-training-sets-a-machine-learning-dataset-for-everyone/
 
 **Cite The DataSet**  
-**  
-** If you find those results useful please cite them :
+If you find these results useful, please cite them:
 
 
     @misc{ MR,
@@ -14,4 +13,9 @@ Please check it out: https://marcoramilli.com/2016/12/16/malware-training-sets-a
        note = "[Online; December 2016]"
      }
 
-
+*UPDATE*
+Many people have asked me about the scripts I used to generate the MIST-modified JSON, so here they are! (Take a look at the scripts section.)
+You can use `mist_json.py` as a reporting module in CuckooSandbox, and the script `fromMongoToARFF.py` to generate ARFF files suitable for WEKA.
+
+If you create new datasets by running your local CuckooSandbox with the `mist_json.py` module and you want to share them, please feel free to open pull requests!
+
diff --git a/scripts/fromMongoToARFF.py b/scripts/fromMongoToARFF.py
@@ -1,7 +1,7 @@
 # IMPORTANT: This Script is not suitable for production environments !
 # IMPORTANT: This script is just a MOCKUP and it is not performant at all!
 # IMPORTANT: I am not accepting criticism of this piece of code, since it was written in a hurry just to make it work
-# If you want to know more about what this script does please visit: http://marcoramilli.blogspot.com
+# If you want to know more about what this script does please visit: http://marcoramilli.com
 
 # HOW to use it:
 # Step 1: imports JSON representation of MIST format into mongodb server. You might want to use a simple bash script such as:
@@ -31,7 +31,7 @@
 total_collections = collection.find().count()
 for o, item in enumerate(collection.find(no_cursor_timeout=True)):
     print "|-> Working on Item number: " + str(o) + " on totals: " + str(total_collections)
-    for key in item['properties']: 
+    for key in item['properties']:
         if key == "label":
             print "|--> Found Label: " + str(key)
             if item['properties'][key] not in labels:
@@ -52,7 +52,7 @@
                 if key not in key_list:
                     print "|--> Adding properties: " + str(key)
                     key_list.append(key)
-                
+
 
 #writing header
 out.write("@RELATION maware \n")
@@ -68,11 +68,11 @@
                 out.write("'" + l + "'}\n")
             else:
                 out.write("'" + l + "',")
-            
+
     else:
         #Not the last one
         out.write("@ATTRIBUTE '" + k + "' numeric \n")
-        
+
 
 def write_data(f, t):
     #writing data
@@ -86,12 +86,12 @@ def write_data(f, t):
                     property_name = key.split('!')[0]
                     print "**index: " + str(index) + " name: " + str(property_name)
                     value = item['properties'][property_name].split(' ')[int(index)]
-                    # interesting ridiculous approach ! :D 
+                    # interesting ridiculous approach ! :D
                     value = str( int(value.encode('hex'),16) )
                     print "|---> Value: " + str(value)
                 else:
                     value = item['properties'][k]
-                    # interesting ridicoulous approach ! :D 
+                    # interesting ridiculous approach ! :D
                     value = str( int(value.encode('hex'),16) )
             except Exception as e:
                 print "Exception: " + str(e)
@@ -100,7 +100,7 @@ def write_data(f, t):
                 out.write(value + "," +  item['properties']['label'] + "\n")
             else:
                 out.write(value + ",")
-    
+
 
 out.write("@DATA \n")
 f = 0

diff --git a/scripts/mist_json.py b/scripts/mist_json.py
@@ -0,0 +1,214 @@
+# IMPORTANT: This Script is not suitable for production environments !
+# IMPORTANT: This script is just a MOCKUP and it is not performant at all!
+# IMPORTANT: I am not accepting criticism of this piece of code, since it was written in a hurry just to make it work
+
+# If you want to know more about what this script does please visit: http://marcoramilli.com
+# You can use this script on a CuckooSandbox machine to generate a
+# MIST (modified) report and feed it to your AI engine
+
+import os
+import subprocess
+import hashlib
+import urllib
+import random
+import string
+import glob
+import threading
+import json
+import gzip
+import sys
+import time
+import logging
+
+from lib.cuckoo.common.abstracts import Report
+from lib.cuckoo.common.exceptions import CuckooReportError
+import traceback
+
+log = logging.getLogger()
+
+class MistJson(Report):
+    """Converts reports on MIST JSON to produce features for Prediction IO"""
+
+    def sanitize_file(self, filename):
+        normals = filename.lower().replace('\\', ' ').replace('.', ' ').split(' ')
+        hashed_components = [
+            hashlib.md5(normal).hexdigest()[:8] for normal in normals[-3:]]
+        return hashed_components
+
+
+    def sanitize_reg(self, keyname):
+        normals = keyname.lower().replace('\\', ' ').split(' ')
+        hashed_components = [
+            hashlib.md5(normal).hexdigest()[:8] for normal in normals[-2:]]
+        return hashed_components
+
+
+    def sanitize_cmd(self, cmd):
+        normals = cmd.lower().replace('"', '').replace(
+            '\\', ' ').replace('.', ' ').split(' ')
+        hashed_components = [
+            hashlib.md5(normal).hexdigest()[:8] for normal in normals]
+        return hashed_components
+
+
+    def sanitize_generic(self, value):
+        return [hashlib.md5(value.lower()).hexdigest()[:8]]
+
+
+    def sanitize_domain(self, domain):
+        components = domain.lower().split('.')
+        hashed_components = [
+            hashlib.md5(comp).hexdigest()[:8] for comp in components]
+        return hashed_components
+
+
+    def sanitize_ip(self, ipaddr):
+        components = ipaddr.split('.')
+        class_c = components[:3]
+        return [hashlib.md5('.'.join(class_c)).hexdigest()[:8],
+                hashlib.md5(ipaddr).hexdigest()[:8]]
+
+
+    def sanitize_url(self, url):
+        # normalize URL according to CIF specification
+        uri = url
+        if ":" in url:
+            uri = url[url.index(':') + 1:]
+	if not isinstance(uri, list):
+            uri = uri.strip("/")
+	else:
+	    uri_2 = ",".join(uri)
+	    uri = uri_2
+
+        quoted = urllib.quote(uri.encode('utf8')).lower()
+        return [hashlib.md5(quoted).hexdigest()[:8]]
+
+
+    def insert_into_json(self, json_result, key, values):
+        new_set = set(values)
+        if key not in json_result:
+            json_result[key] = new_set
+        else:
+            json_result[key] |= new_set
+
+
+    def try_iterate(self, results, *paths):
+        try:
+            for entry in paths:
+                results = results[entry]
+            for el in results:
+                yield el
+        except:
+            return
+
+
+    def mist_convert(self, results):
+        """ Performs conversion of analysis results to MIST format """
+        json_result = {}
+        analysis_id = results["info"]["id"]
+        log.info("[+] Working on converting id= " + str(analysis_id))
+
+        for entry in self.try_iterate(results, "behavior", "summary", "files"):
+           self.insert_into_json(json_result, "file_access", self.sanitize_file(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "write_files"):
+            self.insert_into_json(json_result, "file_write", self.sanitize_file(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "delete_files"):
+            self.insert_into_json(json_result, "file_delete", self.sanitize_file(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "read_files"):
+            self.insert_into_json(json_result, "file_read", self.sanitize_file(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "keys"):
+            self.insert_into_json(json_result, "reg_access", self.sanitize_reg(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "read_keys"):
+            self.insert_into_json(json_result, "reg_read", self.sanitize_reg(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "write_keys"):
+            self.insert_into_json(json_result, "reg_write", self.sanitize_reg(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "delete_keys"):
+            self.insert_into_json(json_result, "reg_delete", self.sanitize_reg(entry))
+        for entry in self.try_iterate(results, "behavior", "summary",
+                                 "executed_commands"):
+            self.insert_into_json(json_result, "cmd_exec", self.sanitize_cmd(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "resolved_apis"):
+            self.insert_into_json(json_result, "api_resolv", self.sanitize_generic(entry))
+        for entry in self.try_iterate(results, "behavior", "summary", "mutexes"):
+            self.insert_into_json(json_result, "mutex_access", self.sanitize_generic(entry))
+        for entry in self.try_iterate(results, "behavior", "summary",
+                                 "created_services"):
+            self.insert_into_json(json_result, "service_create",
+                             self.sanitize_generic(entry))
+        for entry in self.try_iterate(results, "behavior", "summary",
+                                 "started_services"):
+            self.insert_into_json(json_result, "service_start", self.sanitize_generic(entry))
+
+        for entry in self.try_iterate(results, "signatures"):
+            if "virustotal" in entry["name"]:
+                continue
+            signame = "sig_" + entry["name"].lower().replace(' ', '_')
+            for res in self.try_iterate(entry, "data"):
+                try:
+                    for key, value in res.items():
+                        if isinstance(value, basestring):
+                            lowerval = value.lower()
+                            sanitized = None
+                            if lowerval.startswith("hkey"):
+                                sanitized = self.sanitize_reg(value)
+                            elif lowerval.startswith("c:"):
+                                sanitized = self.sanitize_file(value)
+                            else:
+                                sanitized = self.sanitize_generic(value)
+                            self.insert_into_json(json_result, signame, sanitized)
+                except:
+                    pass
+
+        for host in self.try_iterate(results, "network", "hosts"):
+            if "country_name" in host:
+                self.insert_into_json(
+                    json_result, "net_con", self.sanitize_generic(host["country_name"]))
+            if "ip" in host:
+                self.insert_into_json(json_result, "net_con", self.sanitize_ip(host["ip"]))
+
+        for domain in self.try_iterate(results, "network", "domains"):
+            self.insert_into_json(
+                json_result, "net_dns", self.sanitize_domain(domain["domain"]))
+
+        for req in self.try_iterate(results, "network", "http"):
+            self.insert_into_json(json_result, "net_http", self.sanitize_url(req["uri"]))
+
+        for req in self.try_iterate(results, "network", "mitm", "requests"):
+            self.insert_into_json(json_result, "net_mitm", self.sanitize_url(req["url"]))
+
+        for dropped in self.try_iterate(results, "dropped"):
+            if "size" in dropped and "type" in dropped:
+                self.insert_into_json(json_result, "file_drop",
+                                 list(map(
+                                     lambda el:
+                                     "%08x_%s" % (int(dropped["size"]) &
+                                                  0xfffffc00, el),
+                                     self.sanitize_generic(dropped["type"]))))
+
+        for key, value in json_result.items():
+          json_result[key] = ' '.join(value)
+        return json_result
+
+
+    def run(self, results):
+	""""
+            Writes features:
+    	    @param results: results dictionary.
+	        @raise CuckooReportError: if it fails to parse results.
+  	"""
+
+        mistjson = self.options.get("enabled", True)
+        analyses_home = self.options.get("analyses_home", "/analyses/")
+
+  	if mistjson:
+	    mts_id = results["info"]["id"]
+            path       = os.path.join(analyses_home, str(mts_id), "reports", "mist.json.gzip")
+	    try:
+                mist_json = self.mist_convert(results)
+                if len(mist_json) > 0:
+            	    log.info( "[+] Saving mist report for %s " % mts_id)
+    		    with gzip.open(path, 'wb') as outfile:
+    		        json.dump(mist_json, outfile, sort_keys=True,indent=2, ensure_ascii=False)
+	    except Exception as e:
+	        log.error("Error in feature extraction for %s, error is: %s" % (mts_id, e))
+