From c5d1f5beec9a24eaf304626ebea4a231d7c84b5b Mon Sep 17 00:00:00 2001
From: Wil Roberts <47739563+robertswh@users.noreply.github.com>
Date: Thu, 13 Jun 2024 10:57:30 +0100
Subject: [PATCH] read in json from hdfs (#27)

* initial commit for reading in json from hdfs

* code is in a function and pydoop added to requirements.txt

* moved read into main.py

* comment out r and d

* remove old file
---
 main.py          | 13 +++++++++++++
 requirements.txt |  1 +
 2 files changed, 14 insertions(+)
 create mode 100755 main.py

diff --git a/main.py b/main.py
new file mode 100755
index 00000000..4f993056
--- /dev/null
+++ b/main.py
@@ -0,0 +1,13 @@
+import pandas as pd
+
+from src.utils.hdfs_mods import hdfs_load_json as read_json
+
+# TODO: read from config
+folder_path = "/dapsen/workspace_zone/mbs-results/"
+file_name = "snapshot-202212-002-2156d36b-e61f-42f1-a0f1-61d1f8568b8e.json"
+file_path = folder_path + file_name
+
+snapshot = read_json(file_path)
+
+contributors = pd.DataFrame(snapshot["contributors"])
+responses = pd.DataFrame(snapshot["responses"])
diff --git a/requirements.txt b/requirements.txt
index e26789b2..baecf735 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,6 +9,7 @@ black
 isort
 nbstripout
 nbqa
+#research_and_development==1.0.0
 pre_commit_hooks
 flake8
 pandas==1.1.5