-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_dataset.py
executable file
·165 lines (131 loc) · 5.3 KB
/
parse_dataset.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
#!/usr/bin/python
# This file parses a plain text dataset, and creates a sqlite db
# Created by Denis Lantsman
import sys
import os
import time
import csv
import xml.dom.minidom as minidom
import sqlite3
import re
import argparse
parser = argparse.ArgumentParser(description = 'Parse an EDM dataset.')
parser.add_argument('path', type=str, help='transactions file')
parser.add_argument('moddir', type=str, help='module directory')
parser.add_argument('objpath', type=str, help='objective path')
parser.add_argument('currpath', type=str, help='curriculum path')
parser.add_argument('--dbpath', type=str, default='rm.db',
help='Target file (rm.db)')
args = parser.parse_args()
conn = sqlite3.connect(args.dbpath)
c = conn.cursor()
colsin = ['#userid', 'problemid', 'metaid', 'problemuid', 'first_time',
'submit_time', 'hint_used', 'answercorrect']
colsout = ['user_id', 'problem_num', 'meta_id', 'problem_id', 'start',
'duration', 'hint', 'correct']
colstr = ""
for col in colsout:
colstr += col + ","
colstr = colstr[:-1]
# Create transaction table
print 'parsing transacitons'
c.execute("DROP TABLE IF EXISTS transactions;")
c.execute("CREATE TABLE transactions ({});".format(colstr))
timeformat = '%d.%m.%Y %H:%M:%S'
with open(args.path, 'rb') as f:
dr = csv.DictReader(f)
for r in dr:
# skip if data missing
if r['first_time'] == 'N' and r['submit_time'] == 'N':
continue
# convert times from strings to seconds
r['first_time'] = time.mktime(time.strptime(r['first_time'], timeformat))
r['submit_time'] = time.mktime(time.strptime(r['submit_time'], timeformat))
r['submit_time'] = r['submit_time'] - r['first_time']
# skip if duration too long (>15 minutes)
if r['submit_time'] > 900 or r['submit_time'] < 0:
continue
c.execute("INSERT INTO transactions ({}) VALUES (?, ?, ?, ?, ?, ?, ?, ?)".format(colstr),
[r[col] for col in colsin])
conn.commit()
# Create curriculum table
print 'creating curriculum table'
curriculum_dom = minidom.parse(args.currpath)
curriculum = [x.childNodes[0].data for x in curriculum_dom.getElementsByTagName('uid')]
objectives_dom = minidom.parse(args.objpath)
colsobj = ['obj_id', 'obj_idx', 'name', 'short_name']
colstr = ""
for col in colsobj:
colstr += col + ","
colstr = colstr[:-1]
c.execute("DROP TABLE IF EXISTS objectives;")
c.execute("CREATE TABLE objectives ({});".format(colstr))
for objective_idx, objective_uid in enumerate(curriculum):
for objective_dom in objectives_dom.getElementsByTagName('objective'):
if objective_dom.getAttribute('uid') == objective_uid:
name = objective_dom.getElementsByTagName('name')[0].childNodes[0].data
short_name = objective_dom.getElementsByTagName('shortName')[0].childNodes[0].data
c.execute("INSERT INTO objectives ({}) VALUES (?, ?, ?, ?)".format(colstr),
(objective_uid, objective_idx, name, short_name))
conn.commit()
print 'creating module and problem tables'
colsmodule = ['module_id', 'obj_id', 'module_idx', 'name']
mod_colstr = ""
for col in colsmodule:
mod_colstr += col + ","
mod_colstr = mod_colstr[:-1]
c.execute("DROP TABLE IF EXISTS modules;")
c.execute("CREATE TABLE modules ({});".format(mod_colstr))
colsprob = ['meta_id', 'module_id', 'meta_idx']
prob_colstr = ""
for col in colsprob:
prob_colstr += col + ","
prob_colstr = prob_colstr[:-1]
c.execute("DROP TABLE IF EXISTS problems;")
c.execute("CREATE TABLE problems ({});".format(prob_colstr))
for root, dirs, files in os.walk(args.moddir):
for filename in files:
print 'traversing file {}'.format(filename)
module_dom = minidom.parse(os.path.join(root, filename))
name = module_dom.getElementsByTagName('name')[0].childNodes[0].data
uid = module_dom.getElementsByTagName('UID')[0].childNodes[0].data
obj_uid = module_dom.getElementsByTagName('targetObjective')
if len(obj_uid) == 0:
continue
obj_uid = obj_uid[0].childNodes[0].data
# order the objectives
idx = -1
if '-IR-' in uid:
idx = 1
if '-Th-' in uid:
idx = 2
if '-A-Pr-' in uid:
idx = 3
if '-B-Pr-' in uid:
idx = 4
if idx == -1:
continue
c.execute("INSERT INTO modules ({}) VALUES (?, ?, ?, ?)".format(mod_colstr),
(uid, obj_uid, idx, name))
# regular expression for finding metaids
count = 1
for entry in module_dom.getElementsByTagName('entries'):
if entry.getElementsByTagName('type')[0].childNodes[0].data == '1':
content = entry.getElementsByTagName('content')[0].childNodes[0].data
matches = re.findall(r'metaID=[\w|-]+', content)
for match in matches:
metaid = match.split('=')[1]
c.execute("INSERT INTO problems ({}) VALUES (?, ?, ?)".format(prob_colstr),
(metaid, uid, count))
count+=1
print 'pruning extraneous entries from db.'
c.execute('''delete from objectives where obj_id not in (select obj_id from modules)''')
c.execute('''delete from modules where module_id not in (select module_id from problems)''')
c.execute('''delete from transactions where meta_id not in (select meta_id from problems)''')
print 'creating indexes into transactions table'
c.execute("CREATE INDEX metaid_index ON transactions (meta_id)")
c.execute("CREATE INDEX uid_index ON transactions (user_id)")
print "done"
conn.commit()
conn.close()