From c8c6533e2426e530a9bcc976f337b5e92db8af79 Mon Sep 17 00:00:00 2001
From: Hariharan Devarajan
Date: Fri, 4 Oct 2024 17:54:58 -0700
Subject: [PATCH] Enhancement: add hostname to metadata events (#215)

* add hostnames to metadata.

* added pid, tid, and hhash columns to metadata
---
 dfanalyzer/main.py                    | 28 ++++++++++++++++------------
 src/dftracer/writer/chrome_writer.cpp |  4 ++--
 2 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/dfanalyzer/main.py b/dfanalyzer/main.py
index e62e600..8e69cee 100644
--- a/dfanalyzer/main.py
+++ b/dfanalyzer/main.py
@@ -170,6 +170,12 @@ def load_objects(line, fn, time_granularity, time_approximate, condition_fn, loa
                 d["name"] = val["name"]
             if "cat" in val:
                 d["cat"] = val["cat"]
+            if "pid" in val:
+                d["pid"] = val["pid"]
+            if "tid" in val:
+                d["tid"] = val["tid"]
+            if "args" in val and "hhash" in val["args"]:
+                d["hhash"] = val["args"]["hhash"]
             if "M" == val["ph"]:
                 if d["name"] == "FH":
                     d["type"] = 1  # 1-> file hash
@@ -186,6 +192,11 @@ def load_objects(line, fn, time_granularity, time_approximate, condition_fn, loa
                     if "args" in val and "name" in val["args"] and "value" in val["args"]:
                         d["name"] = val["args"]["name"]
                         d["hash"] = val["args"]["value"]
+                elif d["name"] == "PR":
+                    d["type"] = 5  # 5-> process metadata
+                    if "args" in val and "name" in val["args"] and "value" in val["args"]:
+                        d["name"] = val["args"]["name"]
+                        d["hash"] = val["args"]["value"]
                 else:
                     d["type"] = 4  # 4-> others
                     if "args" in val and "name" in val["args"] and "value" in val["args"]:
@@ -193,10 +204,6 @@ def load_objects(line, fn, time_granularity, time_approximate, condition_fn, loa
                         d["name"] = val["args"]["name"]
                         d["value"] = str(val["args"]["value"])
             else:
                 d["type"] = 0  # 0->regular event
-                if "pid" in val:
-                    d["pid"] = val["pid"]
-                if "tid" in val:
-                    d["tid"] = val["tid"]
                 if "dur" in val:
                     val["dur"] = int(val["dur"])
                     val["ts"] = int(val["ts"])
@@ -261,8 +268,6 @@ def io_function(json_object, current_dict, time_approximate,condition_fn):
     if "args" in json_object:
         if "fhash" in json_object["args"]:
             d["fhash"] = json_object["args"]["fhash"]
-        if "hhash" in json_object["args"]:
-            d["hhash"] = json_object["args"]["hhash"]
 
     if "POSIX" == json_object["cat"] and "ret" in json_object["args"]:
         size = int(json_object["args"]["ret"])
@@ -281,7 +286,6 @@
 def io_columns():
     conf = get_dft_configuration()
     return {
-        'hhash': "uint64[pyarrow]",
         'compute_time': "string[pyarrow]" if not conf.time_approximate else "uint64[pyarrow]",
         'io_time': "string[pyarrow]" if not conf.time_approximate else "uint64[pyarrow]",
         'app_io_time': "string[pyarrow]" if not conf.time_approximate else "uint64[pyarrow]",
@@ -453,10 +457,10 @@ def __init__(self, file_pattern, load_fn=None, load_cols={}, load_data = {}, met
                    'tinterval': "string[pyarrow]" if not self.conf.time_approximate else "uint64[pyarrow]",
                    'trange': "uint64[pyarrow]"}
         columns.update(io_columns())
         columns.update(load_cols)
-        file_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]"}
-        hostname_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]"}
-        string_hash_columns = {'name': "string[pyarrow]", 'hash':"uint64[pyarrow]"}
-        other_metadata_columns = { 'name':"string[pyarrow]" ,'value':"string[pyarrow]" }
+        file_hash_columns = {'name': "string[pyarrow]", 'hash': "uint64[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"}
+        hostname_hash_columns = {'name': "string[pyarrow]", 'hash': "uint64[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"}
+        string_hash_columns = {'name': "string[pyarrow]", 'hash': "uint64[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"}
+        other_metadata_columns = {'name': "string[pyarrow]", 'value': "string[pyarrow]", 'pid': "uint64[pyarrow]", 'tid': "uint64[pyarrow]", 'hhash': "uint64[pyarrow]"}
         if "FH" in metadata_cols:
             file_hash_columns.update(metadata_cols["FH"])
         if "HH" in metadata_cols:
@@ -475,7 +479,7 @@ def __init__(self, file_pattern, load_fn=None, load_cols={}, load_data = {}, met
         self.file_hash = self.all_events.query("type == 1")[list(file_hash_columns.keys())].groupby('hash').first().persist()
         self.host_hash = self.all_events.query("type == 2")[list(hostname_hash_columns.keys())].groupby('hash').first().persist()
         self.string_hash = self.all_events.query("type == 3")[list(string_hash_columns.keys())].groupby('hash').first().persist()
-        self.metadata = self.all_events.query("type == 4")[list(other_metadata_columns.keys())].persist()
+        self.metadata = self.all_events.query("type == 4")[list(other_metadata_columns.keys())].persist() 
         self.n_partition = math.ceil(total_size.compute() / (128 * 1024 ** 2))
         logging.debug(f"Number of partitions used are {self.n_partition}")
         self.events = events.repartition(npartitions=self.n_partition).persist()
diff --git a/src/dftracer/writer/chrome_writer.cpp b/src/dftracer/writer/chrome_writer.cpp
index 72f3f01..9be3a65 100644
--- a/src/dftracer/writer/chrome_writer.cpp
+++ b/src/dftracer/writer/chrome_writer.cpp
@@ -255,12 +255,12 @@ void dftracer::ChromeWriter::convert_json_metadata(
   if (is_string) {
     written_size = sprintf(
         buffer.data() + current_index,
-        R"(%s{"id":%d,"name":"%s","cat":"dftracer","pid":%lu,"tid":%lu,"ph":"M","args":{"name":"%s","value":"%s"}})",
+        R"(%s{"id":%d,"name":"%s","cat":"dftracer","pid":%lu,"tid":%lu,"ph":"M","args":{"hhash":%d,"name":"%s","value":"%s"}})",
         is_first_char, index, ph, process_id, thread_id, name, value);
   } else {
     written_size = sprintf(
         buffer.data() + current_index,
-        R"(%s{"id":%d,"name":"%s","cat":"dftracer","pid":%lu,"tid":%lu,"ph":"M","args":{"name":"%s","value":%s}})",
+        R"(%s{"id":%d,"name":"%s","cat":"dftracer","pid":%lu,"tid":%lu,"ph":"M","args":{"hhash":%d,"name":"%s","value":%s}})",
         is_first_char, index, ph, process_id, thread_id, name, value);
   }
   current_index += written_size;