-
Notifications
You must be signed in to change notification settings - Fork 4
/
build_dataset.sh
159 lines (131 loc) · 6.92 KB
/
build_dataset.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/bin/bash
# Capture the start time so the total runtime can be reported at the end
start_time=$(date +%s)

# Read username and password (for PhysioNet).
#   -r  keep backslashes literal; without it `read` treats "\" as an escape
#       character and silently drops it from the typed credential
#   -s  do not echo the password while it is typed
read -r -p "Username: " USERNAME
read -r -s -p "Password: " PASSWORD
# With -s the user's Enter key is not echoed, so finish the prompt line here
# (only when a terminal is attached; the prompt is not shown otherwise).
if [[ -t 0 ]]; then
  printf '\n' >&2
fi

# Remote locations of the PhysioNet datasets
MIMIC_CXR="https://physionet.org/files/mimic-cxr-jpg/2.0.0"
CHEST_IMAGENOME_BASE="https://physionet.org/files/chest-imagenome/1.0.0"
CHEST_IMAGENOME_SILVER="${CHEST_IMAGENOME_BASE}/silver_dataset"
CHEST_IMAGENOME_GOLD="${CHEST_IMAGENOME_BASE}/gold_dataset"
CHEST_IMAGENOME_UTILS="${CHEST_IMAGENOME_BASE}/utils/scene_postprocessing"
CHEST_IMAGENOME_SEMANTICS="${CHEST_IMAGENOME_BASE}/semantics"
MIMIC_IV="https://physionet.org/files/mimiciv/2.2"

# wget options shared by every download.
# NOTE: this is a plain string that download_and_extract expands unquoted, so
# it is split on whitespace: credentials containing spaces or glob characters
# are not supported. The password is also passed on wget's command line, where
# it is visible in the process list while a download is running.
WGET_PARAMS="-r -N -c -np --user $USERNAME --password $PASSWORD"
#######################################
# Download one file with wget and unpack it when it is an archive.
# Globals:
#   WGET_PARAMS (read) - whitespace-separated wget options
# Arguments:
#   $1 - URL of the file to download
#   $2 - local directory in which the downloaded file is expected to be
#        found (callers pass the host/path mirror of the URL)
# Outputs:
#   A diagnostic on stderr when the download fails
# Returns:
#   wget's exit status if the download fails; otherwise the status of the
#   extraction tool, or 0 when the file needs no extraction
#######################################
download_and_extract() {
  local file_url=$1
  local destination_dir=$2
  # Declare and assign separately so `local` cannot mask a failing command
  local file_name
  file_name=$(basename "$file_url")
  local status

  # Download the file. WGET_PARAMS is a plain option string, so it has to be
  # word-split here on purpose.
  # shellcheck disable=SC2086
  wget $WGET_PARAMS "$file_url" || {
    status=$?
    printf 'download_and_extract: wget failed (exit %d) for %s\n' \
      "$status" "$file_url" >&2
    # Do not try to unpack a missing or partially downloaded file
    return "$status"
  }

  case "$file_name" in
    *.zip)
      # -o: overwrite files from a previous run without prompting
      unzip -o "$destination_dir/$file_name" -d "$destination_dir"
      ;;
    *.gz)
      # -f: overwrite the decompressed file left by a previous run; without
      # it gzip prompts (terminal) or refuses (no terminal) on a re-run
      gzip -df "$destination_dir/$file_name"
      ;;
  esac
}
# MIMIC-CXR-JPG: metadata table
download_and_extract \
  "${MIMIC_CXR}/mimic-cxr-2.0.0-metadata.csv.gz" \
  "physionet.org/files/mimic-cxr-jpg/2.0.0"

# Chest ImaGenome: files are fetched directory by directory, in the order
# silver_dataset, gold_dataset, utils/scene_postprocessing, semantics.
imagenome_mirror="physionet.org/files/chest-imagenome/1.0.0"

download_and_extract \
  "${CHEST_IMAGENOME_SILVER}/scene_graph.zip" \
  "${imagenome_mirror}/silver_dataset"

for gold_file in \
  "gold_attributes_relations_500pts_500studies1st.txt" \
  "gold_bbox_coordinate_annotations_1000images.csv"; do
  download_and_extract \
    "${CHEST_IMAGENOME_GOLD}/${gold_file}" \
    "${imagenome_mirror}/gold_dataset"
done

download_and_extract \
  "${CHEST_IMAGENOME_UTILS}/scenegraph_postprocessing.py" \
  "${imagenome_mirror}/utils/scene_postprocessing"

for semantics_file in \
  "attribute_relations_v1.txt" \
  "label_to_UMLS_mapping.json" \
  "objects_extracted_from_reports_v1.txt"; do
  download_and_extract \
    "${CHEST_IMAGENOME_SEMANTICS}/${semantics_file}" \
    "${imagenome_mirror}/semantics"
done
# MIMIC-IV hosp module: every table is a gzip-compressed CSV in the same
# remote directory, so only the table names vary.
hosp_mirror="physionet.org/files/mimiciv/2.2/hosp"
for hosp_table in \
  admissions \
  diagnoses_icd \
  d_icd_diagnoses \
  d_icd_procedures \
  d_labitems \
  labevents \
  microbiologyevents \
  patients \
  prescriptions \
  procedures_icd \
  transfers; do
  download_and_extract "${MIMIC_IV}/hosp/${hosp_table}.csv.gz" "$hosp_mirror"
done
# MIMIC-IV icu module: same layout as hosp, one gzip-compressed CSV per table
icu_mirror="physionet.org/files/mimiciv/2.2/icu"
for icu_table in chartevents d_items icustays inputevents outputevents; do
  download_and_extract "${MIMIC_IV}/icu/${icu_table}.csv.gz" "$icu_mirror"
done
# Save the current directory so it can be restored after post-processing
orig_dir=$(pwd)

# Convert the silver scene graphs to tabular files, unless both output files
# already exist from a previous run.
scene_tabular_dir="physionet.org/files/chest-imagenome/1.0.0/silver_dataset/scene_tabular"
if [[ ! -f "${scene_tabular_dir}/attribute_relations_tabular.txt" \
  || ! -f "${scene_tabular_dir}/bbox_objects_tabular.txt" ]]; then
  # The settings file below uses paths relative to the script's directory, so
  # the script must be run from there. Abort if the directory is missing:
  # otherwise the settings file would be written to, and python started in,
  # the wrong place.
  cd "physionet.org/files/chest-imagenome/1.0.0/utils/scene_postprocessing" || {
    echo "Cannot enter the scene_postprocessing directory; was the download successful?" >&2
    exit 1
  }

  # Quoted delimiter: the JSON is written literally, with no shell expansion
  cat > scenegraph_postprocessing_settings.json <<'EOF'
{
"SCENE_DIR": "../../silver_dataset/scene_graph",
"OUTPUT_DIR": "../../silver_dataset/scene_tabular",
"OUTPUT_TYPE": ["attributes", "objects"],
"RDF_LEVEL": "study_id",
"RESOURCE": "../../semantics/label_to_UMLS_mapping.json",
"AGGREGATION": "last",
"INCLUDE_SECTIONS": "all"
}
EOF

  # Only report success when the script actually succeeded
  if python scenegraph_postprocessing.py; then
    echo "Done with scene postprocessing"
  else
    postprocessing_status=$?
    echo "Scene postprocessing failed (exit ${postprocessing_status})" >&2
    exit "$postprocessing_status"
  fi
fi

# Return to the original directory; the remaining steps use relative paths
cd "$orig_dir" || {
  echo "Cannot return to ${orig_dir}" >&2
  exit 1
}
# Preprocessing: run every preprocessing script for each split whose
# "<split>_dataset.csv" is not yet present in SAVE_DIR.
SAVE_DIR="dataset_builder/preprocessed_data/"
PREPROCESS_SCRIPTS=("preprocess_cohort.py" "preprocess_label.py")
SPLITS=("train" "valid" "test")
mkdir -p "$SAVE_DIR"

# Options shared by all preprocessing scripts
preprocess_args=(
  --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
  --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
  --save_dir "$SAVE_DIR"
)

for split in "${SPLITS[@]}"; do
  # Nothing to do when this split's CSV already exists
  if [[ -f "${SAVE_DIR}/${split}_dataset.csv" ]]; then
    continue
  fi
  for preprocess_script in "${PREPROCESS_SCRIPTS[@]}"; do
    python "dataset_builder/${preprocess_script}" "${preprocess_args[@]}"
  done
done
# DB preprocessing: build one database per split listed below.
# Maps each split name to the number of patients passed as --num_patient.
declare -A splits=( ["test"]=400 ["train"]=800 )

# Options that are identical for every split
db_args=(
  --mimic_iv_dir "physionet.org/files/mimiciv/2.2/"
  --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
  --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
  --db_name mimic_iv_cxr
  --out_dir "./database"
  --deid
  --timeshift
  --current_time "2105-12-31 23:59:00"
  --start_year 2100
  --time_span 5
  --cur_patient_ratio 0.1
)

for split in "${!splits[@]}"; do
  num_patient=${splits[$split]}
  printf 'Processing %s split with %s patients...\n' "$split" "$num_patient"
  python dataset_builder/preprocess_db.py \
    --split "$split" \
    "${db_args[@]}" \
    --num_patient "$num_patient"
done
printf '%s\n' "Database preprocessing complete."
# Answer generation: for every split, read the "_<split>.json" file and the
# split's database, and write "<split>.json" next to the input file.
answer_dataset_dir="dataset/mimic_iv_cxr"
for split in "${SPLITS[@]}"; do
  answer_args=(
    --mimic_iv_dir "physionet.org/files/mimiciv/2.2/"
    --mimic_cxr_jpg_dir "physionet.org/files/mimic-cxr-jpg/2.0.0/"
    --chest_imagenome_dir "physionet.org/files/chest-imagenome/1.0.0/"
    --json_file_path "${answer_dataset_dir}/_${split}.json"
    --db_file_path "database/mimic_iv_cxr/${split}/mimic_iv_cxr.db"
    --output_path "${answer_dataset_dir}/${split}.json"
  )
  python dataset_builder/generate_answer.py "${answer_args[@]}"
done
# Report how long the whole script took, in seconds since start_time
end_time="$(date +%s)"
(( runtime = end_time - start_time ))
printf 'Script runtime: %s seconds\n' "$runtime"