Skip to content

Commit

Permalink
adding mistToJson
Browse files Browse the repository at this point in the history
  • Loading branch information
Marco Ramilli committed Apr 21, 2019
1 parent 6b30900 commit 96d6974
Show file tree
Hide file tree
Showing 3 changed files with 229 additions and 11 deletions.
10 changes: 7 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,7 @@
Please check it out: https://marcoramilli.com/2016/12/16/malware-training-sets-a-machine-learning-dataset-for-everyone/

**Cite The DataSet**
**
** If you find those results useful please cite them :
If you find those results useful please cite them :


@misc{ MR,
Expand All @@ -14,4 +13,9 @@ Please check it out: https://marcoramilli.com/2016/12/16/malware-training-sets-a
note = "[Online; December 2016]"
}


*UPDATE*
Many people have asked me about the scripts I used to generate the MIST-Modified JSON, so here they are! (Take a look at the scripts section.)
You can use `mist_json.py` as a CuckooSandbox reporting module, and the `fromMongoToARFF.py` script to generate ARFF files suitable for WEKA.

If you create new datasets by running your local CuckooSandbox with the `mist_json.py` module and you want to share them, please feel free to open pull requests!

16 changes: 8 additions & 8 deletions scripts/fromMongoToARFF.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# IMPORTANT: This Script is not suitable for production environments !
# IMPORTANT: This script is just a MOCKUP and it is not performant at all!
# IMPORTANT: I am not accepting criticism on this piece of code, since it was written in a hurry just to make it work.
# If you want to know more about what this script does please visit: http://marcoramilli.blogspot.com
# If you want to know more about what this script does please visit: http://marcoramilli.com

# HOW to use it:
# Step 1: imports JSON representation of MIST format into mongodb server. You might want to use a simple bash script such as:
Expand Down Expand Up @@ -31,7 +31,7 @@
total_collections = collection.find().count()
for o, item in enumerate(collection.find(no_cursor_timeout=True)):
print "|-> Working on Item number: " + str(o) + " on totals: " + str(total_collections)
for key in item['properties']:
for key in item['properties']:
if key == "label":
print "|--> Found Label: " + str(key)
if item['properties'][key] not in labels:
Expand All @@ -52,7 +52,7 @@
if key not in key_list:
print "|--> Adding properties: " + str(key)
key_list.append(key)


#writing header
out.write("@RELATION maware \n")
Expand All @@ -68,11 +68,11 @@
out.write("'" + l + "'}\n")
else:
out.write("'" + l + "',")

else:
#Not the last one
out.write("@ATTRIBUTE '" + k + "' numeric \n")


def write_data(f, t):
#writing data
Expand All @@ -86,12 +86,12 @@ def write_data(f, t):
property_name = key.split('!')[0]
print "**index: " + str(index) + " name: " + str(property_name)
value = item['properties'][property_name].split(' ')[int(index)]
# interesting ridiculous approach ! :D
# interesting ridiculous approach ! :D
value = str( int(value.encode('hex'),16) )
print "|---> Value: " + str(value)
else:
value = item['properties'][k]
# interesting ridiculous approach ! :D
# interesting ridiculous approach ! :D
value = str( int(value.encode('hex'),16) )
except Exception as e:
print "Exception: " + str(e)
Expand All @@ -100,7 +100,7 @@ def write_data(f, t):
out.write(value + "," + item['properties']['label'] + "\n")
else:
out.write(value + ",")


out.write("@DATA \n")
f = 0
Expand Down
214 changes: 214 additions & 0 deletions scripts/mist_json.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
# IMPORTANT: This script is not suitable for production environments!
# IMPORTANT: This script is just a MOCKUP and it is not performant at all!
# IMPORTANT: I am not accepting criticism on this piece of code, since it was written in a hurry just to make it work.

# If you want to know more about what this script does please visit: http://marcoramilli.com
# You might decide to use this script to generate, from a CuckooSandbox machine, a
# MIST (modified) report in order to feed it to your AI engine

import os
import subprocess
import hashlib
import urllib
import random
import string
import glob
import threading
import json
import gzip
import sys
import time
import logging

from lib.cuckoo.common.abstracts import Report
from lib.cuckoo.common.exceptions import CuckooReportError
import traceback

log = logging.getLogger()

class MistJson(Report):
    """Reporting module that converts analysis results to a MIST-style JSON.

    Every observed artifact (file path, registry key, command line, domain,
    IP address, URL, ...) is normalized and replaced by the first 8 hex
    characters of its MD5 digest.  The digests are grouped per feature name
    and written, gzip-compressed, next to the other reports of the analysis.
    """

    # (key under results["behavior"]["summary"], output feature name,
    #  name of the sanitizer method applied to every entry of that list)
    _SUMMARY_FEATURES = (
        ("files", "file_access", "sanitize_file"),
        ("write_files", "file_write", "sanitize_file"),
        ("delete_files", "file_delete", "sanitize_file"),
        ("read_files", "file_read", "sanitize_file"),
        ("keys", "reg_access", "sanitize_reg"),
        ("read_keys", "reg_read", "sanitize_reg"),
        ("write_keys", "reg_write", "sanitize_reg"),
        ("delete_keys", "reg_delete", "sanitize_reg"),
        ("executed_commands", "cmd_exec", "sanitize_cmd"),
        ("resolved_apis", "api_resolv", "sanitize_generic"),
        ("mutexes", "mutex_access", "sanitize_generic"),
        ("created_services", "service_create", "sanitize_generic"),
        ("started_services", "service_start", "sanitize_generic"),
    )

    def _short_md5(self, text):
        """Return the first 8 hex characters of the MD5 digest of *text*.

        Text that is not already a byte string is encoded as UTF-8 first.
        Passing a unicode object straight to ``hashlib.md5`` makes Python 2
        encode it with the ASCII codec, which raises ``UnicodeEncodeError``
        on any non-ASCII character (e.g. in a file name).  For pure-ASCII
        input the digest is identical to the one produced before.
        """
        if not isinstance(text, bytes):
            text = text.encode("utf-8")
        return hashlib.md5(text).hexdigest()[:8]

    def sanitize_file(self, filename):
        """Hash the last three components of a file path.

        The path is lower-cased and split on backslashes, dots and spaces.

        @param filename: file path string.
        @return: list of up to three 8-character hex digests.
        """
        tokens = filename.lower().replace('\\', ' ').replace('.', ' ').split(' ')
        return [self._short_md5(token) for token in tokens[-3:]]

    def sanitize_reg(self, keyname):
        """Hash the last two components of a registry key name.

        The name is lower-cased and split on backslashes and spaces.

        @param keyname: registry key string.
        @return: list of up to two 8-character hex digests.
        """
        tokens = keyname.lower().replace('\\', ' ').split(' ')
        return [self._short_md5(token) for token in tokens[-2:]]

    def sanitize_cmd(self, cmd):
        """Hash every component of a command line.

        The command is lower-cased, double quotes are dropped, and the
        result is split on backslashes, dots and spaces.

        @param cmd: command line string.
        @return: list of 8-character hex digests, one per component.
        """
        tokens = cmd.lower().replace('"', '').replace(
            '\\', ' ').replace('.', ' ').split(' ')
        return [self._short_md5(token) for token in tokens]

    def sanitize_generic(self, value):
        """Hash a lower-cased value as a whole.

        @param value: string to hash.
        @return: single-element list with the 8-character hex digest.
        """
        return [self._short_md5(value.lower())]

    def sanitize_domain(self, domain):
        """Hash every dot-separated label of a lower-cased domain name.

        @param domain: domain name string.
        @return: list of 8-character hex digests, one per label.
        """
        return [self._short_md5(label) for label in domain.lower().split('.')]

    def sanitize_ip(self, ipaddr):
        """Hash an IP address and its first three dot-separated octets.

        @param ipaddr: dotted IP address string.
        @return: two-element list: digest of the first three octets joined
            by dots, then digest of the full address.
        """
        prefix = '.'.join(ipaddr.split('.')[:3])
        return [self._short_md5(prefix), self._short_md5(ipaddr)]

    def sanitize_url(self, url):
        """Hash a normalized URL.

        Everything up to and including the first ':' is dropped, surrounding
        slashes are stripped, and the rest is percent-quoted and lower-cased
        before hashing.

        @param url: URL string (a list is joined with commas instead of
            being stripped).
        @return: single-element list with the 8-character hex digest.
        """
        # normalize URL according to CIF specification (original author's note)
        uri = url
        if ":" in url:
            uri = url[url.index(':') + 1:]
        if isinstance(uri, list):
            uri = ",".join(uri)
        else:
            uri = uri.strip("/")

        # Only encode real text: calling .encode() on a byte string makes
        # Python 2 decode it as ASCII first, which fails on non-ASCII bytes.
        if not isinstance(uri, bytes):
            uri = uri.encode('utf8')
        quoted = urllib.quote(uri).lower()
        return [self._short_md5(quoted)]

    def insert_into_json(self, json_result, key, values):
        """Merge *values* into the set stored under *key*.

        @param json_result: dict mapping feature names to sets (modified
            in place).
        @param key: feature name.
        @param values: iterable of hashable values to add.
        """
        json_result.setdefault(key, set()).update(values)

    def try_iterate(self, results, *paths):
        """Yield the elements found by following *paths* into *results*.

        Each entry of *paths* is used as a subscript on the previous level.
        If a level is missing or the final value cannot be iterated, the
        generator simply ends without yielding anything more.

        @param results: nested container to walk.
        @param paths: successive keys/indexes to follow.
        """
        try:
            for entry in paths:
                results = results[entry]
            for el in results:
                yield el
        except Exception:
            # Best effort: absent sections are expected.  Unlike a bare
            # "except:", this lets KeyboardInterrupt/SystemExit through.
            return

    def _sanitize_signature_value(self, value):
        """Pick a sanitizer for a signature data string based on its prefix."""
        lowered = value.lower()
        if lowered.startswith("hkey"):
            return self.sanitize_reg(value)
        if lowered.startswith("c:"):
            return self.sanitize_file(value)
        return self.sanitize_generic(value)

    def mist_convert(self, results):
        """Convert analysis results to the MIST JSON structure.

        @param results: results dictionary; ``results["info"]["id"]`` must
            exist, every other section is optional.
        @return: dict mapping each feature name to a space-separated string
            of hashed tokens.
        """
        json_result = {}
        analysis_id = results["info"]["id"]
        log.info("[+] Working on converting id= " + str(analysis_id))

        # Behavior summary lists: files, registry keys, commands, ...
        for summary_key, feature, sanitizer_name in self._SUMMARY_FEATURES:
            sanitizer = getattr(self, sanitizer_name)
            for entry in self.try_iterate(
                    results, "behavior", "summary", summary_key):
                self.insert_into_json(json_result, feature, sanitizer(entry))

        # Signatures: hash every string value of every data entry.
        for entry in self.try_iterate(results, "signatures"):
            if "virustotal" in entry["name"]:
                continue
            signame = "sig_" + entry["name"].lower().replace(' ', '_')
            for res in self.try_iterate(entry, "data"):
                try:
                    for key, value in res.items():
                        if isinstance(value, basestring):
                            self.insert_into_json(
                                json_result, signame,
                                self._sanitize_signature_value(value))
                except Exception:
                    # Malformed data entries are skipped, but no longer
                    # silently.
                    log.debug("Skipping data entry of signature %s",
                              signame, exc_info=True)

        # Network activity.
        for host in self.try_iterate(results, "network", "hosts"):
            if "country_name" in host:
                self.insert_into_json(
                    json_result, "net_con",
                    self.sanitize_generic(host["country_name"]))
            if "ip" in host:
                self.insert_into_json(
                    json_result, "net_con", self.sanitize_ip(host["ip"]))

        for domain in self.try_iterate(results, "network", "domains"):
            self.insert_into_json(
                json_result, "net_dns", self.sanitize_domain(domain["domain"]))

        for req in self.try_iterate(results, "network", "http"):
            self.insert_into_json(
                json_result, "net_http", self.sanitize_url(req["uri"]))

        for req in self.try_iterate(results, "network", "mitm", "requests"):
            self.insert_into_json(
                json_result, "net_mitm", self.sanitize_url(req["url"]))

        # Dropped files: size with the low 10 bits cleared + hashed type.
        for dropped in self.try_iterate(results, "dropped"):
            if "size" in dropped and "type" in dropped:
                size_bucket = int(dropped["size"]) & 0xfffffc00
                self.insert_into_json(
                    json_result, "file_drop",
                    ["%08x_%s" % (size_bucket, type_hash)
                     for type_hash in self.sanitize_generic(dropped["type"])])

        # NOTE(review): tokens come out in set iteration order, as before;
        # confirm whether consumers rely on token positions before sorting.
        return dict(
            (feature, ' '.join(tokens))
            for feature, tokens in json_result.items())

    def run(self, results):
        """Write the MIST JSON report of an analysis as a gzip file.

        The report goes to
        ``<analyses_home>/<analysis id>/reports/mist.json.gzip``; nothing is
        written when the module is disabled or the conversion is empty.
        Conversion and write errors are logged, not raised.

        @param results: results dictionary.
        """
        if not self.options.get("enabled", True):
            return

        analyses_home = self.options.get("analyses_home", "/analyses/")
        mts_id = results["info"]["id"]
        path = os.path.join(
            analyses_home, str(mts_id), "reports", "mist.json.gzip")
        try:
            mist_json = self.mist_convert(results)
            if mist_json:
                log.info("[+] Saving mist report for %s " % mts_id)
                with gzip.open(path, 'wb') as outfile:
                    json.dump(mist_json, outfile, sort_keys=True,
                              indent=2, ensure_ascii=False)
        except Exception as e:
            log.error("Error in feature extraction for %s, error is: %s"
                      % (mts_id, e))

0 comments on commit 96d6974

Please sign in to comment.