
Commit

Merge pull request #46 from MiBiPreT/45-update-data-handling

Restructure and clean up data handling
AlrauneZ authored Sep 20, 2024
2 parents aa4fa9d + ff5c145 commit 7934501
Showing 19 changed files with 1,664 additions and 1,032 deletions.
28 changes: 15 additions & 13 deletions mibipret/analysis/reduction/ordination.py
@@ -9,7 +9,7 @@
 import pandas as pd
 import skbio.stats.ordination as sciord
 from sklearn import decomposition
-from mibipret.data.names import name_sample
+from mibipret.data.names_data import name_sample


 def pca(data_frame,
@@ -293,21 +293,23 @@ def constrained_ordination(data_frame,
         (data_independent_variables.shape[0] < data_independent_variables.shape[1]):
         raise ValueError("Ordination method {} not possible with more variables than samples.".format(method))

-    try:
-        # Performing constrained ordination using function from scikit-bio.
-        if method == 'cca':
-            sci_ordination = sciord.cca(data_dependent_variables, data_independent_variables, scaling = n_comp)
-        elif method == 'rda':
-            sci_ordination = sciord.rda(data_dependent_variables, data_independent_variables, scaling = n_comp)
-        else:
-            raise ValueError("Ordination method {} not a valid option.".format(method))
-
-        loadings_independent = sci_ordination.biplot_scores.to_numpy()[:,0:n_comp]
-        loadings_dependent = sci_ordination.features.to_numpy()[:,0:n_comp]
-        scores = sci_ordination.samples.to_numpy()[:,0:n_comp]
-
-    except(TypeError):
-        raise TypeError("Not all column values are numeric values. Consider standardizing data first.")
+    # Performing constrained ordination using function from scikit-bio.
+    if method == 'cca':
+        try:
+            sci_ordination = sciord.cca(data_dependent_variables, data_independent_variables, scaling = n_comp)
+        except(TypeError,ValueError):
+            raise TypeError("Not all column values are numeric values. Consider standardizing data first.")
+    elif method == 'rda':
+        try:
+            sci_ordination = sciord.rda(data_dependent_variables, data_independent_variables, scaling = n_comp)
+        except(TypeError,ValueError):
+            raise TypeError("Not all column values are numeric values. Consider standardizing data first.")
+    else:
+        raise ValueError("Ordination method {} not a valid option.".format(method))
+
+    loadings_independent = sci_ordination.biplot_scores.to_numpy()[:,0:n_comp]
+    loadings_dependent = sci_ordination.features.to_numpy()[:,0:n_comp]
+    scores = sci_ordination.samples.to_numpy()[:,0:n_comp]

     if loadings_independent.shape[1]<n_comp:
         raise ValueError("Number of dependent variables too small.")
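Taken together, the two hunks above switch ordination.py to the renamed vocabulary module mibipret.data.names_data and replace the single outer try/except with one around each scikit-bio call, so a ValueError raised by scikit-bio on non-numeric input now gets the same standardization hint as a TypeError, while an invalid method name still raises its own ValueError. A minimal sketch of how the restructured branching reads after this commit (the wrapper run_constrained_ordination is a hypothetical stand-in for the relevant part of constrained_ordination; skbio and numeric, standardized data frames are assumed):

    import skbio.stats.ordination as sciord

    def run_constrained_ordination(dependent, independent, method='cca', n_comp=2):
        # Each branch wraps only the scikit-bio call, so non-numeric columns
        # are reported with the standardization hint for that specific method.
        if method == 'cca':
            try:
                sci_ordination = sciord.cca(dependent, independent, scaling=n_comp)
            except (TypeError, ValueError):
                raise TypeError("Not all column values are numeric values. Consider standardizing data first.")
        elif method == 'rda':
            try:
                sci_ordination = sciord.rda(dependent, independent, scaling=n_comp)
            except (TypeError, ValueError):
                raise TypeError("Not all column values are numeric values. Consider standardizing data first.")
        else:
            raise ValueError("Ordination method {} not a valid option.".format(method))

        # Loadings and scores are extracted outside the try blocks.
        loadings_independent = sci_ordination.biplot_scores.to_numpy()[:, 0:n_comp]
        loadings_dependent = sci_ordination.features.to_numpy()[:, 0:n_comp]
        scores = sci_ordination.samples.to_numpy()[:, 0:n_comp]
        return loadings_independent, loadings_dependent, scores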
4 changes: 2 additions & 2 deletions mibipret/analysis/reduction/stable_isotope_regression.py
@@ -5,8 +5,8 @@
 @author: Alraune Zech
 """
 import numpy as np
-from mibipret.data.names import names_contaminants
-from mibipret.data.names import names_isotopes
+from mibipret.data.names_data import names_contaminants
+from mibipret.data.names_data import names_isotopes


 def Lambda_regression(delta_C,
2 changes: 1 addition & 1 deletion mibipret/analysis/sample/properties.py
@@ -8,7 +8,7 @@
 @author: A. Zech
 """

-import mibipret.data.names as names
+import mibipret.data.names_data as names

 #name_phosphate,name_nitrite
 #name_sulfide,name_ammonium,name_methane,name_manganese,
72 changes: 36 additions & 36 deletions mibipret/analysis/sample/screening_NA.py
@@ -7,7 +7,7 @@

 import numpy as np
 import pandas as pd
-import mibipret.data.names as ean
+import mibipret.data.names_data as names
 from .properties import properties


@@ -48,7 +48,7 @@ def reductors(
     cols= check_data(data)

     try:
-        for ea in ean.electron_acceptors[ea_group]:
+        for ea in names.electron_acceptors[ea_group]:
             if ea in cols:
                 tot_reduct += properties[ea]['factor_stoichiometry']* data[ea]/properties[ea]['molecular_mass']
                 # pd.to_numeric(data[ea]) / properties[ea]['molecular_mass']
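Beyond the alias change from ean to names, the reduction-capacity sum is unchanged: every detected electron acceptor adds its stoichiometric factor times its measured concentration divided by its molecular mass. A small illustrative sketch of that per-column contribution (the factor, mass and concentrations below are placeholders, not the values stored in properties):

    import pandas as pd

    # Placeholder numbers for one electron acceptor; real values come from
    # properties[ea]['factor_stoichiometry'] and properties[ea]['molecular_mass'].
    factor_stoichiometry = 8        # electrons transferred per mole (illustrative)
    molecular_mass = 96.1           # g/mol (illustrative)

    concentration = pd.Series([12.0, 3.5], index=['well_1', 'well_2'])   # mg/l
    contribution = factor_stoichiometry * concentration / molecular_mass  # ~ mmol e-/l
    print(contribution)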
@@ -61,7 +61,7 @@
             raise ValueError("Data not in standardized format. Run 'standardize()' first.")

     if isinstance(tot_reduct, pd.Series):
-        tot_reduct.rename(ean.name_total_reductors,inplace = True)
+        tot_reduct.rename(names.name_total_reductors,inplace = True)
         if verbose:
             print("Total amount of electron reductors per well in [mmol e-/l] is:\n{}".format(tot_reduct))
             print('----------------------------------------------------------------')
@@ -72,7 +72,7 @@
     #     tot_reduct = False

     if inplace:
-        data[ean.name_total_reductors] = tot_reduct
+        data[names.name_total_reductors] = tot_reduct

     return tot_reduct

@@ -128,9 +128,9 @@ def oxidators(
     NP_avail = available_NP(data)

     try:
-        eas = ean.contaminants[contaminant_group].copy()
-        if (ean.name_o_xylene in cols) and (ean.name_pm_xylene in cols): # and (ean.name_xylene in cols):
-            eas.remove(ean.name_xylene)
+        eas = names.contaminants[contaminant_group].copy()
+        if (names.name_o_xylene in cols) and (names.name_pm_xylene in cols): # and (names.name_xylene in cols):
+            eas.remove(names.name_xylene)

         for cont in eas:
             if cont in cols:
@@ -158,15 +158,15 @@
     #     print('________________________________________________________________')
     #     tot_oxi = False
     if isinstance(tot_oxi, pd.Series):
-        tot_oxi.rename(ean.name_total_oxidators,inplace = True)
+        tot_oxi.rename(names.name_total_oxidators,inplace = True)
         if verbose:
             print("Total amount of oxidators per well in [mmol e-/l] is:\n{}".format(tot_oxi))
             print('-----------------------------------------------------')
     else:
         raise ValueError("No data on oxidators or only zero concentrations given.")

     if inplace:
-        data[ean.name_total_oxidators] = tot_oxi
+        data[names.name_total_oxidators] = tot_oxi

     return tot_oxi

@@ -203,7 +203,7 @@ def available_NP(

     cols = check_data(data)

-    nutrient_list = [ean.name_nitrate, ean.name_nitrite, ean.name_phosphate]
+    nutrient_list = [names.name_nitrate, names.name_nitrite, names.name_phosphate]
     list_nut_miss = []

     for nut in nutrient_list:
@@ -212,13 +212,13 @@
     if len(list_nut_miss)>0:
         raise ValueError("Concentrations of nutrient(s) missing:", list_nut_miss)

-    CNs = (data[ean.name_nitrate] + data[ean.name_nitrite]) * (39. / 4.5)
-    CPs = data[ean.name_phosphate] * (39. / 1.)
+    CNs = (data[names.name_nitrate] + data[names.name_nitrite]) * (39. / 4.5)
+    CPs = data[names.name_phosphate] * (39. / 1.)
     NP_avail =CNs.combine(CPs, min, 0)
-    NP_avail.name = ean.name_NP_avail
+    NP_avail.name = names.name_NP_avail

     if inplace:
-        data[ean.name_NP_avail] = NP_avail
+        data[names.name_NP_avail] = NP_avail

     if verbose:
         print("Total NP available is:\n{}".format(NP_avail))
@@ -262,23 +262,23 @@ def electron_balance(

     cols = check_data(data)

-    if ean.name_total_reductors in cols:
-        tot_reduct = data[ean.name_total_reductors]
+    if names.name_total_reductors in cols:
+        tot_reduct = data[names.name_total_reductors]
     else:
         tot_reduct = reductors(data,**kwargs)
         # raise ValueError("Total amount of oxidators not given in data.")

-    if ean.name_total_oxidators in cols:
-        tot_oxi = data[ean.name_total_oxidators]
+    if names.name_total_oxidators in cols:
+        tot_oxi = data[names.name_total_oxidators]
     else:
         tot_oxi = oxidators(data,**kwargs)
         # raise ValueError("Total amount of reductors not given in data.")

     e_bal = tot_reduct.div(tot_oxi, axis=0)
-    e_bal.name = ean.name_e_balance
+    e_bal.name = names.name_e_balance

     if inplace:
-        data[ean.name_e_balance] = e_bal
+        data[names.name_e_balance] = e_bal

     if verbose:
         print("Electron balance e_red/e_cont is:\n{}".format(e_bal))
@@ -319,8 +319,8 @@ def NA_traffic(

     cols = check_data(data)

-    if ean.name_e_balance in cols:
-        e_balance = data[ean.name_e_balance]
+    if names.name_e_balance in cols:
+        e_balance = data[names.name_e_balance]
     else:
         e_balance = electron_balance(data,**kwargs)
         # raise ValueError("Electron balance not given in data.")
@@ -329,10 +329,10 @@
     traffic = np.where(e_bal<1,"red","green")
     traffic[np.isnan(e_bal)] = 'y'

-    NA_traffic = pd.Series(name =ean.name_na_traffic_light,data = traffic,index = e_balance.index)
+    NA_traffic = pd.Series(name =names.name_na_traffic_light,data = traffic,index = e_balance.index)

     if inplace:
-        data[ean.name_na_traffic_light] = NA_traffic
+        data[names.name_na_traffic_light] = NA_traffic

     if verbose:
         print("Evaluation if natural attenuation (NA) is ongoing:")#" for {}".format(contaminant_group))
@@ -383,9 +383,9 @@ def total_contaminant_concentration(
     tot_conc = 0.
     cols = check_data(data)
     try:
-        eas = ean.contaminants[contaminant_group].copy()
-        if (ean.name_o_xylene in cols) and (ean.name_pm_xylene in cols): # and (ean.name_xylene in cols):
-            eas.remove(ean.name_xylene)
+        eas = names.contaminants[contaminant_group].copy()
+        if (names.name_o_xylene in cols) and (names.name_pm_xylene in cols): # and (names.name_xylene in cols):
+            eas.remove(names.name_xylene)
         for cont in eas:
             if cont in cols:
                 tot_conc += data[cont] # mass concentration in ug/l
@@ -402,15 +402,15 @@
     #     print('________________________________________________________________')
     #     tot_conc = False
     if isinstance(tot_conc, pd.Series):
-        tot_conc.rename(ean.name_total_contaminants,inplace = True)
+        tot_conc.rename(names.name_total_contaminants,inplace = True)
         if verbose:
             print("Total concentration of {} in [ug/l] is:\n{}".format(contaminant_group,tot_conc))
             print('--------------------------------------------------')
     else:
         raise ValueError("No data on contaminants or only zero concentrations given.")

     if inplace:
-        data[ean.name_total_contaminants] = tot_conc
+        data[names.name_total_contaminants] = tot_conc

     return tot_conc

@@ -458,14 +458,14 @@ def thresholds_for_intervention(
     if inplace:
         na_intervention = data
     else:
-        na_intervention= pd.DataFrame(data, columns=[ean.name_sample,ean.name_observation_well])
+        na_intervention= pd.DataFrame(data, columns=[names.name_sample,names.name_observation_well])
     traffic = np.zeros(data.shape[0],dtype=int)
     intervention = [[] for i in range(data.shape[0])]

     try:
-        eas = ean.contaminants[contaminant_group].copy()
-        if (ean.name_o_xylene in cols) and (ean.name_pm_xylene in cols): # and (ean.name_xylene in cols):
-            eas.remove(ean.name_xylene)
+        eas = names.contaminants[contaminant_group].copy()
+        if (names.name_o_xylene in cols) and (names.name_pm_xylene in cols): # and (names.name_xylene in cols):
+            eas.remove(names.name_xylene)
         for cont in eas:
             if cont in cols:
                 th_value = properties[cont]['thresholds_for_intervention_NL']
@@ -479,9 +479,9 @@

     traffic_light = np.where(traffic>0,"red","green")
     traffic_light[np.isnan(traffic)] = 'y'
-    na_intervention[ean.name_intervention_traffic] = traffic_light
-    na_intervention[ean.name_intervention_number] = traffic
-    na_intervention[ean.name_intervention_contaminants] = intervention
+    na_intervention[names.name_intervention_traffic] = traffic_light
+    na_intervention[names.name_intervention_number] = traffic
+    na_intervention[names.name_intervention_contaminants] = intervention

     if verbose:
         print("Evaluation of contaminant concentrations exceeding intervention values for {}:".format(