From 2e2926dc91c3b40f5acb4b98803f24728e580aea Mon Sep 17 00:00:00 2001 From: Denis-Titov Date: Wed, 3 Jul 2024 11:25:56 -0700 Subject: [PATCH] add function and tests to filter practically unidentifiable parameters --- src/data_driven_rate_equation_selection.jl | 67 ++++- test/.JuliaFormatter.toml | 1 + test/tests_for_optimal_rate_eq_selection.jl | 314 +++++++++++++++----- 3 files changed, 303 insertions(+), 79 deletions(-) create mode 100644 test/.JuliaFormatter.toml diff --git a/src/data_driven_rate_equation_selection.jl b/src/data_driven_rate_equation_selection.jl index de2dbc1..57dde16 100644 --- a/src/data_driven_rate_equation_selection.jl +++ b/src/data_driven_rate_equation_selection.jl @@ -26,7 +26,7 @@ This function is used to perform data-driven rate equation selection using a gen - `save_train_results::Bool`: A boolean indicating whether to save the results of the training for each number of parameters as a csv file. - `enzyme_name::String`: A string for enzyme name that is used to name the csv files that are saved. -# Returns nothing, but saves a csv file for each `num_params` with the results of the training for each combination of parameters tested and a csv file with test results for top 10% of the best results with each number of parameters tested. +# Returns train_results, test_results and list of practically_unidentifiable_params and optionally saves a csv file for each `num_params` with the results of the training for each combination of parameters tested and a csv file with test results for top 10% of the best results with each number of parameters tested. """ function data_driven_rate_equation_selection( @@ -61,8 +61,13 @@ function data_driven_rate_equation_selection( num_param_range = (range_number_params[1]):1:range_number_params[2] end - #calculate starting_param_removal_codes num_param_range[1] parameters - all_param_removal_codes = calculate_all_parameter_removal_codes(param_names) + #calculate starting_param_removal_codes parameters + practically_unidentifiable_params = + find_practically_unidentifiable_params(data, param_names) + all_param_removal_codes = calculate_all_parameter_removal_codes( + param_names, + practically_unidentifiable_params, + ) starting_param_removal_codes = calculate_all_parameter_removal_codes_w_num_params( num_param_range[1], all_param_removal_codes, @@ -156,7 +161,11 @@ function data_driven_rate_equation_selection( fill(best_nt_param_removal_code, nrow(df_results)) df_test_results = vcat(df_test_results, df_results) end - return (train_results = df_train_results, test_results = df_test_results) + return ( + train_results = df_train_results, + test_results = df_test_results, + practically_unidentifiable_params = practically_unidentifiable_params, + ) end "function to calculate train loss without a figure and test loss on removed figure" @@ -236,7 +245,10 @@ function test_rate_equation( end """Generate all possibles codes for ways that params can be removed from the rate equation""" -function calculate_all_parameter_removal_codes(param_names::Tuple{Symbol,Vararg{Symbol}}) +function calculate_all_parameter_removal_codes( + param_names::Tuple{Symbol,Vararg{Symbol}}, + practically_unidentifiable_params::Tuple{Vararg{Symbol}}, +) feasible_param_subset_codes = () for param_name in param_names if param_name == :L @@ -254,15 +266,50 @@ function calculate_all_parameter_removal_codes(param_names::Tuple{Symbol,Vararg{ !startswith(string(param_name), "K_i") && !startswith(string(param_name), "K_a") && length(split(string(param_name), "_")) > 2 - feasible_param_subset_codes = (feasible_param_subset_codes..., [0, 1, 2]) + if param_name in practically_unidentifiable_params + feasible_param_subset_codes = (feasible_param_subset_codes..., [1]) + else + feasible_param_subset_codes = (feasible_param_subset_codes..., [0, 1, 2]) + end elseif startswith(string(param_name), "alpha") - feasible_param_subset_codes = (feasible_param_subset_codes..., [0, 1]) + if param_name in practically_unidentifiable_params + feasible_param_subset_codes = (feasible_param_subset_codes..., [0]) + else + feasible_param_subset_codes = (feasible_param_subset_codes..., [0, 1]) + end end end # return collect(Iterators.product(feasible_param_subset_codes...)) return Iterators.product(feasible_param_subset_codes...) end +"""Find parameters that cannot be identified based on data and they are in front of products of metabolites concentrations that are always zero as these combinations of metabolites are absent in the data.""" +function find_practically_unidentifiable_params( + data::DataFrame, + param_names::Tuple{Symbol,Vararg{Symbol}}, +) + practically_unidentifiable_params = [] + for param_name in param_names + if startswith(string(param_name), "K_") && + !startswith(string(param_name), "K_i") && + !startswith(string(param_name), "K_a") && + length(split(string(param_name), "_")) > 3 + if all([ + prod(row) == 0 for + row in eachrow(data[:, Symbol.(split(string(param_name), "_")[2:end])]) + ]) + push!(practically_unidentifiable_params, param_name) + end + elseif startswith(string(param_name), "alpha_") + metabs_in_param_name = Symbol.(split(string(param_name), "_")[2:3]) + if all([prod(row) == 0 for row in eachrow(data[:, metabs_in_param_name])]) + push!(practically_unidentifiable_params, param_name) + end + end + end + return Tuple(practically_unidentifiable_params) +end + """Generate NamedTuple of codes for ways that params can be removed from the rate equation but still leave `num_params`""" function calculate_all_parameter_removal_codes_w_num_params( num_params::Int, @@ -300,7 +347,7 @@ Function to convert parameter vector to vector where some params are equal to 0, function param_subset_select( params, param_names::Tuple{Symbol,Vararg{Symbol}}, - nt_param_removal_code::T where T<:NamedTuple, + nt_param_removal_code::T where {T<:NamedTuple}, ) @assert length(params) == length(param_names) params_dict = @@ -358,7 +405,7 @@ end Calculate `nt_param_removal_codes` with `num_params` including non-zero term combinations for codes (excluding alpha terms) in each `nt_previous_param_removal_codes` that has `num_params-1` """ function forward_selection_next_param_removal_codes( - nt_previous_param_removal_codes::Vector{T} where T<:NamedTuple, + nt_previous_param_removal_codes::Vector{T} where {T<:NamedTuple}, num_alpha_params::Int, ) param_removal_code_names = keys(nt_previous_param_removal_codes[1]) @@ -399,7 +446,7 @@ end Use `nt_previous_param_removal_codes` to calculate `nt_next_param_removal_codes` that have one additional zero elements except for for elements <= `num_alpha_params` from the end """ function reverse_selection_next_param_removal_codes( - nt_previous_param_removal_codes::Vector{T} where T<:NamedTuple, + nt_previous_param_removal_codes::Vector{T} where {T<:NamedTuple}, num_alpha_params::Int, ) param_removal_code_names = keys(nt_previous_param_removal_codes[1]) diff --git a/test/.JuliaFormatter.toml b/test/.JuliaFormatter.toml new file mode 100644 index 0000000..7ee327d --- /dev/null +++ b/test/.JuliaFormatter.toml @@ -0,0 +1 @@ +#style = "sciml" diff --git a/test/tests_for_optimal_rate_eq_selection.jl b/test/tests_for_optimal_rate_eq_selection.jl index 599eed0..812528c 100644 --- a/test/tests_for_optimal_rate_eq_selection.jl +++ b/test/tests_for_optimal_rate_eq_selection.jl @@ -17,41 +17,41 @@ param_names = ( :L, :Vmax_a, :Vmax_i, - [Symbol(:K_a, "_Metabolite$(i)") for i in 1:num_metabolites]..., - [Symbol(:K_i, "_Metabolite$(i)") for i in 1:num_metabolites]..., - [Symbol(:alpha, "_$(i)") for i in 1:n_alphas]... + [Symbol(:K_a, "_Metabolite$(i)") for i = 1:num_metabolites]..., + [Symbol(:K_i, "_Metabolite$(i)") for i = 1:num_metabolites]..., + [Symbol(:alpha, "_$(i)") for i = 1:n_alphas]..., ) param_removal_code_names = ( [ - Symbol(replace(string(param_name), "_a_" => "_allo_", "Vmax_a" => "Vmax_allo")) for - param_name in param_names if + Symbol(replace(string(param_name), "_a_" => "_allo_", "Vmax_a" => "Vmax_allo")) + for param_name in param_names if !contains(string(param_name), "_i") && param_name != :Vmax ]..., ) -all_param_removal_codes = DataDrivenEnzymeRateEqs.calculate_all_parameter_removal_codes(param_names) -param_subset_codes_with_num_params = [x - for x in all_param_removal_codes - if - length(param_names) - - sum(values(x[1:(end-n_alphas)]) .> 0) - n_alphas == - num_previous_step_params] -previous_param_removal_codes = [rand(param_subset_codes_with_num_params) - for i in 1:rand(1:20)] -nt_previous_param_removal_codes = [NamedTuple{param_removal_code_names}(x) - for x in previous_param_removal_codes] +all_param_removal_codes = + DataDrivenEnzymeRateEqs.calculate_all_parameter_removal_codes(param_names, ()) +param_subset_codes_with_num_params = [ + x for x in all_param_removal_codes if + length(param_names) - sum(values(x[1:(end-n_alphas)]) .> 0) - n_alphas == + num_previous_step_params +] +previous_param_removal_codes = + [rand(param_subset_codes_with_num_params) for i = 1:rand(1:20)] +nt_previous_param_removal_codes = + [NamedTuple{param_removal_code_names}(x) for x in previous_param_removal_codes] param_removal_code_names -nt_funct_output_param_subset_codes = DataDrivenEnzymeRateEqs.forward_selection_next_param_removal_codes( - nt_previous_param_removal_codes, - n_alphas -) +nt_funct_output_param_subset_codes = + DataDrivenEnzymeRateEqs.forward_selection_next_param_removal_codes( + nt_previous_param_removal_codes, + n_alphas, + ) funct_output_param_subset_codes = [values(nt) for nt in nt_funct_output_param_subset_codes] #ensure that funct_output_param_subset_codes have one less parameter than previous_param_removal_codes @test all( length(param_names) - n_alphas - sum(funct_output_param_subset_code[1:(end-n_alphas)] .> 0) == - (num_previous_step_params - 1) - for + (num_previous_step_params - 1) for funct_output_param_subset_code in funct_output_param_subset_codes ) #ensure that non-zero elements from previous_param_removal_codes are present in > 1 of the funct_output_param_subset_code but less than the max_matches @@ -72,12 +72,17 @@ for funct_output_param_subset_code in funct_output_param_subset_codes count = 0 max_matches_vect = Int[] for previous_param_removal_code in previous_param_removal_codes - count += funct_output_param_subset_code[1:(end-n_alphas)] .* - previous_param_removal_code[1:(end-n_alphas)] == - previous_param_removal_code[1:(end-n_alphas)] .^ 2 - push!(max_matches_vect, - sum((previous_param_removal_code[1:(end-n_alphas)] .== 0) .* - non_zero_code_combos_per_param[1:(end-n_alphas)])) + count += + funct_output_param_subset_code[1:(end-n_alphas)] .* + previous_param_removal_code[1:(end-n_alphas)] == + previous_param_removal_code[1:(end-n_alphas)] .^ 2 + push!( + max_matches_vect, + sum( + (previous_param_removal_code[1:(end-n_alphas)] .== 0) .* + non_zero_code_combos_per_param[1:(end-n_alphas)], + ), + ) end max_matches = maximum(max_matches_vect) push!(count_matches, max_matches >= count > 0) @@ -104,7 +109,8 @@ param_removal_code_names = ( !contains(string(param_name), "_i") && param_name != :Vmax ]..., ) -all_param_removal_codes = DataDrivenEnzymeRateEqs.calculate_all_parameter_removal_codes(param_names) +all_param_removal_codes = + DataDrivenEnzymeRateEqs.calculate_all_parameter_removal_codes(param_names, ()) param_subset_codes_with_num_params = [ x for x in all_param_removal_codes if length(param_names) - sum(values(x[1:(end-n_alphas)]) .> 0) - n_alphas == @@ -112,11 +118,13 @@ param_subset_codes_with_num_params = [ ] previous_param_removal_codes = [rand(param_subset_codes_with_num_params) for i = 1:rand(1:20)] -nt_previous_param_removal_codes = [NamedTuple{param_removal_code_names}(x) for x in previous_param_removal_codes] -nt_funct_output_param_subset_codes = DataDrivenEnzymeRateEqs.reverse_selection_next_param_removal_codes( - nt_previous_param_removal_codes, - n_alphas -) +nt_previous_param_removal_codes = + [NamedTuple{param_removal_code_names}(x) for x in previous_param_removal_codes] +nt_funct_output_param_subset_codes = + DataDrivenEnzymeRateEqs.reverse_selection_next_param_removal_codes( + nt_previous_param_removal_codes, + n_alphas, + ) funct_output_param_subset_codes = [values(nt) for nt in nt_funct_output_param_subset_codes] #ensure that funct_output_param_subset_codes have one more parameter than nt_previous_param_removal_codes @test all( @@ -143,24 +151,121 @@ for funct_output_param_subset_code in funct_output_param_subset_codes count = 0 for previous_param_removal_code in values.(nt_previous_param_removal_codes) count += - funct_output_param_subset_code[1:end-n_alphas] == previous_param_removal_code[1:end-n_alphas] .* (funct_output_param_subset_code[1:end-n_alphas] .!= 0) + funct_output_param_subset_code[1:end-n_alphas] == + previous_param_removal_code[1:end-n_alphas] .* + (funct_output_param_subset_code[1:end-n_alphas] .!= 0) end - max_matches = sum((funct_output_param_subset_code[1:end-n_alphas] .== 0) .* non_zero_code_combos_per_param[1:end-n_alphas]) + max_matches = sum( + (funct_output_param_subset_code[1:end-n_alphas] .== 0) .* + non_zero_code_combos_per_param[1:end-n_alphas], + ) push!(count_matches, max_matches >= count > 0) end @test all(count_matches) +#test calculate_all_parameter_removal_codes_w_num_params +num_metabolites = rand(4:8) +n_alphas = rand(1:4) +param_names = ( + :L, + :Vmax_a, + :Vmax_i, + [Symbol(:K_a, "_Metabolite$(i)") for i = 1:num_metabolites]..., + [Symbol(:K_i, "_Metabolite$(i)") for i = 1:num_metabolites]..., + [Symbol(:alpha, "_$(i)") for i = 1:n_alphas]..., +) +param_removal_code_names = ( + [ + Symbol(replace(string(param_name), "_a_" => "_allo_")) for + param_name in param_names if + !contains(string(param_name), "_i") && param_name != :Vmax + ]..., +) +all_param_removal_codes = + DataDrivenEnzymeRateEqs.calculate_all_parameter_removal_codes(param_names, ()) + +num_params = rand( + (length(param_names)-length(param_removal_code_names)):length(param_names)-n_alphas, +) +nt_param_subset_codes_w_num_params = + DataDrivenEnzymeRateEqs.calculate_all_parameter_removal_codes_w_num_params( + num_params, + all_param_removal_codes, + param_names, + param_removal_code_names, + n_alphas, + ) +#ensure that funct_output_param_subset_codes have the correct number of parameters +@test all( + length(param_names) - n_alphas - sum(values(subset_code)[1:(end-n_alphas)] .> 0) == + num_params for subset_code in nt_param_subset_codes_w_num_params +) + +# test find_practically_unidentifiable_params on PKM2 and LDH data +#Load and process data +PKM2_data_for_fit = CSV.read(joinpath(@__DIR__, "Data_for_tests/PKM2_data.csv"), DataFrame) +#Add source column that uniquely identifies a figure from publication +PKM2_data_for_fit.source = PKM2_data_for_fit.Article .* "_" .* PKM2_data_for_fit.Fig +data = PKM2_data_for_fit +PKM2_enzyme = (; + substrates = [:PEP, :ADP], + products = [:Pyruvate, :ATP], + regulators = [:F16BP, :Phenylalanine], + Keq = 20_000.0, + oligomeric_state = 4, + rate_equation_name = :pkm2_rate_equation, +) +metab_names, param_names = @derive_general_mwc_rate_eq(PKM2_enzyme) +unidentifiable_params = + DataDrivenEnzymeRateEqs.find_practically_unidentifiable_params(data, param_names) +@test all( + unidentifiable_param ∈ ( + :alpha_PEP_Pyruvate, + :alpha_ADP_Pyruvate, + :alpha_Pyruvate_F16BP, + :alpha_Pyruvate_Phenylalanine, + :alpha_ATP_Phenylalanine, + ) for unidentifiable_param in unidentifiable_params +) +#Load and process data +LDH_data_for_fit = CSV.read(joinpath(@__DIR__, "Data_for_tests/LDH_data.csv"), DataFrame) +#Add source column that uniquely identifies a figure from publication +LDH_data_for_fit.source = LDH_data_for_fit.Article .* "_" .* LDH_data_for_fit.Fig +data = LDH_data_for_fit +LDH_enzyme = (; + substrates=[:Pyruvate, :NADH], + products=[:Lactate, :NAD], + regulators=[], + Keq=20_000.0, + rate_equation_name=:ldh_rate_equation, +) +metab_names, param_names = @derive_general_qssa_rate_eq(LDH_enzyme) +unidentifiable_params = + DataDrivenEnzymeRateEqs.find_practically_unidentifiable_params(data, param_names) +@test all( + unidentifiable_param ∈ ( + :K_Pyruvate_NADH_Lactate_NAD, + ) for unidentifiable_param in unidentifiable_params +) + ## #test the ability of `data_driven_rate_equation_selection` to recover the MWC rate_equation and params used to generated data for an arbitrary enzyme - data_gen_rate_equation_Keq = 1.0 -mwc_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = (1 / params.K_a_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / (1 + metabs.S / params.K_a_S + metabs.P / params.K_a_P) -mwc_alternative_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = (1 / params.K_a_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / (1 + metabs.S / params.K_a_S + metabs.P / params.K_a_P + metabs.S * metabs.P / (params.K_a_S * params.K_a_P)) +mwc_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = + (1 / params.K_a_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / + (1 + metabs.S / params.K_a_S + metabs.P / params.K_a_P) +mwc_alternative_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = + (1 / params.K_a_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / ( + 1 + + metabs.S / params.K_a_S + + metabs.P / params.K_a_P + + metabs.S * metabs.P / (params.K_a_S * params.K_a_P) + ) data_gen_param_names = (:Vmax_a, :K_a_S, :K_a_P) metab_names = (:S, :P) -params = (Vmax=10.0, K_a_S=1e-3, K_a_P=5e-3) +params = (Vmax = 10.0, K_a_S = 1e-3, K_a_P = 5e-3) #create DataFrame of simulated data num_datapoints = 10 num_figures = 4 @@ -168,49 +273,82 @@ S_concs = Float64[] P_concs = Float64[] sources = String[] -for i in 1:num_figures +for i = 1:num_figures if i < num_figures ÷ 2 - for S in range(0, rand(1:10) * params.K_a_S, rand(num_datapoints÷2:num_datapoints*2)) + for S in + range(0, rand(1:10) * params.K_a_S, rand(num_datapoints÷2:num_datapoints*2)) push!(S_concs, S) push!(P_concs, 0.0) push!(sources, "Figure$i") end else - for P in range(0, rand(1:10) * params.K_a_P, rand(num_datapoints÷2:num_datapoints*2)) + for P in + range(0, rand(1:10) * params.K_a_P, rand(num_datapoints÷2:num_datapoints*2)) push!(S_concs, 0.0) push!(P_concs, P) push!(sources, "Figure$i") end end end -data = DataFrame(S=S_concs, P=P_concs, source=sources) +data = DataFrame(S = S_concs, P = P_concs, source = sources) noise_sd = 0.2 -data.Rate = [mwc_data_gen_rate_equation(row, params, data_gen_rate_equation_Keq) * (1 + noise_sd * randn()) for row in eachrow(data)] +data.Rate = [ + mwc_data_gen_rate_equation(row, params, data_gen_rate_equation_Keq) * + (1 + noise_sd * randn()) for row in eachrow(data) +] data -enzyme_parameters = (; substrates=[:S,], products=[:P], regulators=[], Keq=1.0, oligomeric_state=1, rate_equation_name=:mwc_derived_rate_equation) +enzyme_parameters = (; + substrates = [:S], + products = [:P], + regulators = [], + Keq = 1.0, + oligomeric_state = 1, + rate_equation_name = :mwc_derived_rate_equation, +) metab_names, derived_param_names = @derive_general_mwc_rate_eq(enzyme_parameters) -mwc_derived_rate_equation_no_Keq(nt_metabs, nt_params) = mwc_derived_rate_equation(nt_metabs, nt_params, enzyme_parameters.Keq) -selection_result = @time data_driven_rate_equation_selection(mwc_derived_rate_equation_no_Keq, data, metab_names, derived_param_names, (3, 7), true) +mwc_derived_rate_equation_no_Keq(nt_metabs, nt_params) = + mwc_derived_rate_equation(nt_metabs, nt_params, enzyme_parameters.Keq) +selection_result = @time data_driven_rate_equation_selection( + mwc_derived_rate_equation_no_Keq, + data, + metab_names, + derived_param_names, + (3, 7), + true, +) #Display best equation with 3 parameters. Compare with data_gen_rate_equation with Vmax=1 #TODO: remove the filtering for 3 parameters after we add the automatic determination of the best number of parameters -nt_param_removal_code = filter(x -> x.num_params .== 3, selection_result.test_results).nt_param_removal_codes[1] +nt_param_removal_code = + filter(x -> x.num_params .== 3, selection_result.test_results).nt_param_removal_codes[1] using Symbolics -selected_sym_rate_equation = display_rate_equation(mwc_derived_rate_equation, metab_names, derived_param_names; nt_param_removal_code=nt_param_removal_code) -original_sym_rate_equation = display_rate_equation(mwc_data_gen_rate_equation, metab_names, data_gen_param_names) -alrenative_original_sym_rate_equation = display_rate_equation(mwc_alternative_data_gen_rate_equation, metab_names, data_gen_param_names) +selected_sym_rate_equation = display_rate_equation( + mwc_derived_rate_equation, + metab_names, + derived_param_names; + nt_param_removal_code = nt_param_removal_code, +) +original_sym_rate_equation = + display_rate_equation(mwc_data_gen_rate_equation, metab_names, data_gen_param_names) +alrenative_original_sym_rate_equation = display_rate_equation( + mwc_alternative_data_gen_rate_equation, + metab_names, + data_gen_param_names, +) println("Selected MWC rate equation:") println(simplify(selected_sym_rate_equation)) println("Original MWC rate equation:") println(simplify(original_sym_rate_equation)) #equation with S*P term and without it is equally likely to be selected as there's no data with S and P present. Hence the OR condition below -selected_is_original = simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0 +selected_is_original = + simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0 selected_is_original = selected_is_original isa Bool ? selected_is_original : false -selected_is_alternative = simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0 +selected_is_alternative = + simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0 selected_is_alternative = selected_is_alternative isa Bool ? selected_is_alternative : false @test selected_is_original || selected_is_alternative @@ -218,12 +356,20 @@ selected_is_alternative = selected_is_alternative isa Bool ? selected_is_alterna #test the ability of `data_driven_rate_equation_selection` to recover the QSSA rate_equation and params used to generated data for an arbitrary enzyme data_gen_rate_equation_Keq = 1.0 -qssa_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = (1 / params.K_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / (1 + metabs.S / params.K_S + metabs.P / params.K_P) -qssa_alternative_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = (1 / params.K_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / (1 + metabs.S / params.K_S + metabs.P / params.K_P + metabs.S * metabs.P / (params.K_S * params.K_P)) +qssa_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = + (1 / params.K_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / + (1 + metabs.S / params.K_S + metabs.P / params.K_P) +qssa_alternative_data_gen_rate_equation(metabs, params, data_gen_rate_equation_Keq) = + (1 / params.K_S) * (metabs.S - metabs.P / data_gen_rate_equation_Keq) / ( + 1 + + metabs.S / params.K_S + + metabs.P / params.K_P + + metabs.S * metabs.P / (params.K_S * params.K_P) + ) data_gen_param_names = (:Vmax, :K_S, :K_P) metab_names = (:S, :P) -params = (Vmax=10.0, K_S=1e-3, K_P=5e-3) +params = (Vmax = 10.0, K_S = 1e-3, K_P = 5e-3) #create DataFrame of simulated data num_datapoints = 10 num_figures = 4 @@ -231,7 +377,7 @@ S_concs = Float64[] P_concs = Float64[] sources = String[] -for i in 1:num_figures +for i = 1:num_figures if i < num_figures ÷ 2 for S in range(0, rand(1:10) * params.K_S, rand(num_datapoints÷2:num_datapoints*2)) push!(S_concs, S) @@ -246,33 +392,63 @@ for i in 1:num_figures end end end -data = DataFrame(S=S_concs, P=P_concs, source=sources) +data = DataFrame(S = S_concs, P = P_concs, source = sources) noise_sd = 0.2 -data.Rate = [qssa_data_gen_rate_equation(row, params, data_gen_rate_equation_Keq) * (1 + noise_sd * randn()) for row in eachrow(data)] +data.Rate = [ + qssa_data_gen_rate_equation(row, params, data_gen_rate_equation_Keq) * + (1 + noise_sd * randn()) for row in eachrow(data) +] data -enzyme_parameters = (; substrates=[:S,], products=[:P], regulators=[], Keq=1.0, rate_equation_name=:qssa_derived_rate_equation) +enzyme_parameters = (; + substrates = [:S], + products = [:P], + regulators = [], + Keq = 1.0, + rate_equation_name = :qssa_derived_rate_equation, +) metab_names, derived_param_names = @derive_general_qssa_rate_eq(enzyme_parameters) -qssa_derived_rate_equation_no_Keq(nt_metabs, nt_params) = qssa_derived_rate_equation(nt_metabs, nt_params, enzyme_parameters.Keq) -selection_result = @time data_driven_rate_equation_selection(qssa_derived_rate_equation_no_Keq, data, metab_names, derived_param_names, (1, 4), true) +qssa_derived_rate_equation_no_Keq(nt_metabs, nt_params) = + qssa_derived_rate_equation(nt_metabs, nt_params, enzyme_parameters.Keq) +selection_result = @time data_driven_rate_equation_selection( + qssa_derived_rate_equation_no_Keq, + data, + metab_names, + derived_param_names, + (1, 4), + true, +) #Display best equation with 3 parameters. Compare with data_gen_rate_equation with Vmax=1 #TODO: remove the filtering for 3 parameters after we add the automatic determination of the best number of parameters -nt_param_removal_code = filter(x -> x.num_params .== 3, selection_result.test_results).nt_param_removal_codes[1] +nt_param_removal_code = + filter(x -> x.num_params .== 3, selection_result.test_results).nt_param_removal_codes[1] using Symbolics -selected_sym_rate_equation = display_rate_equation(qssa_derived_rate_equation, metab_names, derived_param_names; nt_param_removal_code=nt_param_removal_code) -original_sym_rate_equation = display_rate_equation(qssa_data_gen_rate_equation, metab_names, data_gen_param_names) -alrenative_original_sym_rate_equation = display_rate_equation(qssa_alternative_data_gen_rate_equation, metab_names, data_gen_param_names) +selected_sym_rate_equation = display_rate_equation( + qssa_derived_rate_equation, + metab_names, + derived_param_names; + nt_param_removal_code = nt_param_removal_code, +) +original_sym_rate_equation = + display_rate_equation(qssa_data_gen_rate_equation, metab_names, data_gen_param_names) +alrenative_original_sym_rate_equation = display_rate_equation( + qssa_alternative_data_gen_rate_equation, + metab_names, + data_gen_param_names, +) println("Selected QSSA rate equation:") println(simplify(selected_sym_rate_equation)) println("Original QSSA rate equation:") println(simplify(original_sym_rate_equation)) #equation with S*P term and without it is equally likely to be selected as there's no data with S and P present. Hence the OR condition below -selected_is_original = simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0 +selected_is_original = + simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0 selected_is_original = selected_is_original isa Bool ? selected_is_original : false -selected_is_alternative = simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0 +selected_is_alternative = + simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0 selected_is_alternative = selected_is_alternative isa Bool ? selected_is_alternative : false @test selected_is_original || selected_is_alternative