-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsciCrunchConverter.py
executable file
·167 lines (138 loc) · 5.76 KB
/
sciCrunchConverter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/bin/env python3
import argparse
import sys
import os
import json
import xml.etree.ElementTree as ET
def main(args):
    """Convert a downloaded SciCrunch file into per-dataset dbGaP XML files.

    Reads args.inputFile, converts every dataset found in it, and writes one
    ``<study_id>.xml`` file per dataset into args.outputDir (created if it
    does not already exist).
    """
    print(args.inputFile)
    json_blob = scicrunch_raw_to_json(args.inputFile)
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(args.outputDir, exist_ok=True)
    for dataset in json_blob:
        # Transform to dbGaP XML
        dataset_xml = json_to_dbgap_xml(dataset)
        # Pretty-print the tree in place (ET.indent requires Python 3.9+).
        ET.indent(dataset_xml)
        # '/' is not legal in a filename, so replace it in the study id.
        study_id = dataset['study_id'].replace("/", "-")
        filename = os.path.join(args.outputDir, f"{study_id}.xml")
        dataset_xml.write(filename, encoding="UTF-8", xml_declaration=True)
def scicrunch_raw_to_json(inputFile):
    '''
    Transforms the downloaded scicrunch data for a single collection (initially SPARK
    with multiple datasets) and returns a single JSON blob of roughly the following format:
    [
        { One of these for every dataset in the file
        "study_id": , from hits[i]._id
        "dataset_id": , from hits[i]._id
        "dataset_name": , from hits[i].name
        "variables": [
            {
            "variable_id": , dataset_id.v1
            "variable_name": , from the organ, species and keyword fields
            "variable_description": same as the variable_name
            },
        ]
        },
        ...
    ]

    NOTE(review): the original used the loop index ``j`` to number variables,
    which raised an uncaught NameError when the first hit had no keywords and
    leaked stale indices across hits.  An explicit per-hit counter fixes both
    while preserving the original numbering (an index is consumed for the
    organ slot even when the organ lookup fails, as before).
    '''
    json_blob = []
    # Load the whole file once; no need to keep the handle open afterwards.
    with open(inputFile, "r") as fileToRead:
        inputJson = json.load(fileToRead)
    hitList = inputJson["hits"]["hits"]
    for hit in hitList:
        print(hit["_id"])
        dataset = {}
        # First set up the dataset metadata
        dataset['study_id'] = str(hit["_id"])
        dataset['dataset_id'] = str(hit["_id"])
        # Look for the study name in 2 different places.
        try:
            # This seems to be used for papers
            dataset['study_name'] = str(hit["_source"]["item"]["name"])
        except KeyError:
            try:
                # This seems to be used for datasets
                dataset['study_name'] = str(hit["_source"]["objects"][0]["name"])
            except (KeyError, IndexError):
                print("No name for this dataset")
        # Now for the variables.  The variable data we want is in 3 places:
        # 1) an array of keywords, 2) the organ field, 3) the species field.
        dataset['variables'] = []
        next_v = 1  # 1-based suffix for the next ".vN" variable id
        # The keywords
        try:
            for keyword_entry in hit["_source"]["item"]["keywords"]:
                keyword = str(keyword_entry["keyword"])
                print(keyword)
                dataset['variables'].append({
                    'variable_id': f"{hit['_id']}.v{next_v}",
                    'variable_name': keyword,
                    'variable_description': keyword,
                })
                next_v += 1
        except KeyError:
            print("No keywords in this dataset")
        # The organ
        try:
            # Reserve the id slot before the lookup so numbering matches the
            # original behaviour even when the organ field is absent.
            organ_id = f"{hit['_id']}.v{next_v}"
            next_v += 1
            organ_curie = str(hit["_source"]["anatomy"]["organ"][0]["curie"])
            dataset['variables'].append({
                'variable_id': organ_id,
                'variable_name': organ_curie,
                'variable_description': organ_curie,
            })
        except (KeyError, IndexError):
            print("No organ in this dataset")
        # The species
        try:
            species_id = f"{hit['_id']}.v{next_v}"
            next_v += 1
            species_curie = str(hit["_source"]["organisms"]["subject"][0]["species"]["curie"])
            dataset['variables'].append({
                'variable_id': species_id,
                'variable_name': species_curie,
                'variable_description': species_curie,
            })
        except (KeyError, IndexError):
            print("No species in this dataset")
        print(dataset)
        # Append dataset
        json_blob.append(dataset)
    return json_blob
def json_to_dbgap_xml(dataset):
    '''
    Build a dbGaP-style XML tree from one dataset dict, e.g.:
    <data_table id="pht000700.v1" study_id="phs000166.v2" participant_set="1"
                date_created="Thu Sep 3 15:21:50 2009">
        <variable id="phv00070931.v1">
            <name>SUBJ_ID</name>
            <description>Deidentified Subject ID</description>
            <type>integer</type>
        </variable>

    Returns an xml.etree.ElementTree.ElementTree rooted at <data_table>.
    '''
    # Root element carries the dataset/study identifiers as attributes.
    data_table = ET.Element("data_table")
    data_table.set("id", dataset["dataset_id"])
    data_table.set("study_id", dataset["study_id"])
    if "study_name" in dataset:
        data_table.set("study_name", dataset["study_name"])
    else:
        print("No study_name in this dataset", dataset)
    # One <variable> child (with <name> and <description>) per entry.
    for entry in dataset['variables']:
        var_el = ET.SubElement(data_table, "variable", {"id": entry['variable_id']})
        ET.SubElement(var_el, "name").text = entry.get('variable_name', "")
        ET.SubElement(var_el, "description").text = entry.get('variable_description', "")
    return ET.ElementTree(data_table)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Transform SciCrunch download to dbGaP XML format")
parser.add_argument('--inputFile', action="store", help= "Specify the file to convert")
parser.add_argument('--outputDir', action="store", help ="Specify absolute path for outputs")
args = parser.parse_args()
main(args)