sad-process-db.py (forked from weecology/sad-comparison)
""" Code to put results from sad-comparisons.py into database."""
from __future__ import division
import csv
import sys
import multiprocessing
import itertools
import os
import numpy as np
from math import log, exp
from scipy import stats
import sqlite3 as dbapi
# Function to import the AICc results.
def import_results(datafile):
"""Imports raw result .csv files in the form: site, S, N, logseries_untruncated, pln, negbin, zipf."""
raw_results = np.genfromtxt(datafile, dtype = "S15, i8, i8, f8, f8, f8, f8", skip_header = 1,
names = ['site', 'S', 'N', 'logseries', 'pln', 'negbin', 'AICc_zipf'], delimiter = ",", missing_values = '', filling_values = '')
return raw_results
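# Illustrative sketch only: the raw .csv files are assumed to look like the
# hypothetical rows below, one site per line, with the four model columns
# holding AICc weights (or likelihoods / relative likelihoods, depending on
# which file is being imported).
#
#   site,S,N,logseries,pln,negbin,zipf
#   site_1,45,1200,0.12,0.55,0.23,0.10
#   site_2,30,800,0.40,0.20,0.25,0.15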
# Function to determine the winning model for each site.
def winning_model(data_dir, dataset_name, results):
    # Open output file ('w' with newline='' replaces the old Python 2 'wb' mode)
    output_processed = csv.writer(open(data_dir + dataset_name + '_processed_results.csv', 'w', newline=''))
    # Insert comment line
    output_processed.writerow(["# 0 = Logseries, 1 = Poisson lognormal, 2 = Negative binomial, 3 = Zipf distribution"])
    # Insert header
    output_processed.writerow(['dataset', 'site', 'S', 'N', "model_code", "model_name", "AICc_weight"])
    # Create dictionary of models and their corresponding codes
    models = {0: 'Logseries', 1: 'Poisson lognormal', 2: 'Negative binomial', 3: 'Zipf distribution'}
    for site in results:
        site_results = site.tolist()
        site_ID = site_results[0]
        S = site_results[1]
        N = site_results[2]
        AICc_weights = site_results[3:]
        # The winning model is the one with the highest AICc weight,
        # and its index position among the weights is its model code.
        AICc_max_weight = max(AICc_weights)
        model_code = AICc_weights.index(AICc_max_weight)
        model_name = models[model_code]
        # Format results for output
        processed_results = [[dataset_name] + [site_ID] + [S] + [N] + [model_code] + [model_name] + [AICc_max_weight]]
        # Save results to a csv file:
        output_processed.writerows(processed_results)
        # Save results to the sqlite database
        cur.execute("""CREATE TABLE IF NOT EXISTS ResultsWin
                       (dataset_code TEXT,
                        site TEXT,
                        S INTEGER,
                        N INTEGER,
                        model_code INTEGER,
                        model_name TEXT,
                        AICc_weight_model FLOAT)""")
        cur.executemany("""INSERT INTO ResultsWin VALUES(?,?,?,?,?,?,?)""", processed_results)
        con.commit()
    return processed_results
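# Worked example with hypothetical numbers: for AICc weights (0.12, 0.55, 0.23, 0.10),
# max() returns 0.55 and .index() returns 1, so the site's winner is recorded as
# model_code 1, models[1] == 'Poisson lognormal', with 0.55 as AICc_weight_model.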
# Function to store each model's value (AICc weight, likelihood, etc.) for each site.
def process_results(data_dir, dataset_name, results, value_type):
    # Dictionary of models and their corresponding codes
    models = {0: 'Logseries', 1: 'Poisson lognormal', 2: 'Negative binomial', 3: 'Zipf distribution'}
    for site in results:
        site_results = site.tolist()
        site_ID = site_results[0]
        S = site_results[1]
        N = site_results[2]
        values = site_results[3:]
        for index, value in enumerate(values):
            model_name = models[index]
            processed_results = [[dataset_name] + [site_ID] + [S] + [N] + [index] + [model_name] + [value_type] + [value]]
            # Save results to the sqlite database
            cur.execute("""CREATE TABLE IF NOT EXISTS RawResults
                           (dataset_code TEXT,
                            site TEXT,
                            S INTEGER,
                            N INTEGER,
                            model_code INTEGER,
                            model_name TEXT,
                            value_type TEXT,
                            value FLOAT)""")
            cur.executemany("""INSERT INTO RawResults VALUES(?,?,?,?,?,?,?,?)""", processed_results)
            con.commit()
    return processed_results
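# Sketch of the long format written to RawResults (hypothetical values): one
# site row such as ('site_1', 45, 1200) with AICc weights (0.12, 0.55, 0.23, 0.10)
# becomes four database rows, e.g.
#   ('bbs', 'site_1', 45, 1200, 1, 'Poisson lognormal', 'AICc weight', 0.55)
# with one row per model code.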
if __name__ == '__main__':
    # Set up analysis parameters
    results_ext = '_dist_test.csv'       # Extension for raw model AICc results files
    likelihood_ext = '_likelihoods.csv'  # Extension for raw model likelihood files
    relative_ll_ext = '_relative_L.csv'  # Extension for raw model relative likelihood files
    database_name = input("Please provide the name of the output database. ")
    data_dir = input("Please provide the path to the data directory. ")
    if not data_dir:
        data_dir = './sad-data/chapter1/'  # Default path to data directory
    # input() returns a string, so a comma-separated reply is split into ID codes.
    datasets = input("Please provide a comma-separated list of dataset ID codes. ")
    if not datasets:
        datasets = ['bbs', 'cbc', 'fia', 'gentry', 'mcdb', 'naba', 'Reptilia', 'Coleoptera', 'Arachnida', 'Amphibia', 'Actinopterygii']  # Dataset ID codes
    else:
        datasets = [code.strip() for code in datasets.split(',')]
    # Toggle variable so the setup does not have to be rerun if the data are already processed.
    needs_processing = input("Data needs to be processed into an sqlite database, True or False? ")
    # Starts actual processing for each set of results in turn.
    # input() returns a string, so compare against the literal text 'True' rather than the boolean.
    if needs_processing == 'True':
        # Set up database connection and cursor for querying
        database = data_dir + database_name
        con = dbapi.connect(database)
        cur = con.cursor()
        # Return text columns as Python strings
        con.text_factory = str
        drop_tables = input("Database needs to be rebuilt, True or False? ")
        if drop_tables == 'True':
            cur.execute("""DROP TABLE IF EXISTS ResultsWin""")
            cur.execute("""DROP TABLE IF EXISTS RawResults""")
            con.commit()
        for dataset in datasets:
            datafile = data_dir + dataset + results_ext
            datafile2 = data_dir + dataset + likelihood_ext
            datafile3 = data_dir + dataset + relative_ll_ext
            raw_results = import_results(datafile)               # Import AICc weight data
            raw_results_likelihood = import_results(datafile2)   # Import log-likelihood data
            raw_results_relative_ll = import_results(datafile3)  # Import relative likelihood data
            winning_model(data_dir, dataset, raw_results)        # Finds the winning model for each site
            process_results(data_dir, dataset, raw_results, 'AICc weight')                      # Stores the raw AICc weights in the database
            process_results(data_dir, dataset, raw_results_likelihood, 'likelihood')            # Stores the raw likelihoods in the database
            process_results(data_dir, dataset, raw_results_relative_ll, 'relative likelihood')  # Stores the relative likelihoods in the database
        # Close connection to database
        con.close()
        print("Database complete.")