Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear model creation and interpretation #5

Merged
merged 12 commits into from
Jun 16, 2023
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
,camo,oak,16.06.2023 16:06,file:///home/camo/.config/libreoffice/4;
1,162 changes: 1,162 additions & 0 deletions 1.train_models/linear_reg_plate_1_2_norm_data/data/model_properties.tsv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env python
# coding: utf-8

# # Fit a linear model on cell morphology features
#
# Our aim is to determine which features significantly contribute to NF1 genotype adjusted by covariates. This notebook is adapted from Dr. Way at:
# https://github.com/WayScience/Benchmarking_NF1_data/blob/main/5_analyze_data/notebooks/linear_model/fit_linear_model.ipynb

# In[ ]:


# Standard library
import sys
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from statsmodels.stats.multitest import multipletests

# ## Finding the git root directory to reference paths on any system

# In[ ]:


# Walk from the working directory up through its parents and take the
# first directory that contains a .git folder.
cwd = Path.cwd()
root_dir = next(
    (path for path in [cwd, *cwd.parents] if (path / ".git").is_dir()), None
)

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")


# In[ ]:


# Make the repo-local utils package importable regardless of where the
# script is launched from.
sys.path.append(f"{root_dir}/utils")
import preprocess_utils as ppu

# ## Specify paths

# In[ ]:


# Output directory for the fitted-model results
data_path = Path("data")

data_path.mkdir(
    parents=True, exist_ok=True
)  # Create the parent directories if they don't exist

# Location of the normalized single-cell profiles produced by the
# upstream nf1_painting_repo feature-processing step
plate_path = Path(
    f"{root_dir}/nf1_painting_repo/3.processing_features/data/normalized_data"
)

# Tab-separated output file holding one row of model statistics per feature
model_data_path = f"{data_path}/model_properties.tsv"


# ## Combine data

# In[ ]:


filename1 = "Plate_1_sc_norm.parquet"
filename2 = "Plate_2_sc_norm.parquet"

path1 = plate_path / filename1
path2 = plate_path / filename2

po1 = ppu.Preprocess_data(path=path1)
po2 = ppu.Preprocess_data(path=path2)

plate1df = po1.df # Returns the dataframe generated by the csv
plate2df = po2.df # Returns the dataframe generated by the csv

# Set plate column:
plate1df["Metadata_plate"] = "1"
plate2df["Metadata_plate"] = "2"

common_columns = list(plate1df.columns.intersection(plate2df.columns))
plate1df = plate1df.loc[:, common_columns]
plate2df = plate2df.loc[:, common_columns]

# Combine the plate dataframes:
cp_df = pd.concat([plate1df, plate2df], axis="rows")


# ## Preprocessing

# In[ ]:


# Removes columns that contain all zeros
cp_df = cp_df.loc[:, (cp_df != 0).any(axis=0)]
cp_df.reset_index(inplace=True, drop=True)

# Removes metadata
pcp_features = po1.remove_meta(cp_df)
MattsonCam marked this conversation as resolved.
Show resolved Hide resolved
cp_features = pcp_features.columns

# Setup linear modeling framework
variables = ["Metadata_number_of_singlecells", "Metadata_plate"]
X = cp_df.loc[:, variables]

# Add dummy matrix of categorical genotypes
genotype_x = pd.get_dummies(data=cp_df.Metadata_genotype).astype(int)

X = pd.concat([X, genotype_x], axis=1)
X_arr = X.values

dfreg = X.shape[1]
dfres = X.shape[0] - dfreg - 1


# ## Fit linear model

# In[ ]:


# Fit linear model for each feature
MattsonCam marked this conversation as resolved.
Show resolved Hide resolved
lm_results = []
i = 0
for cp_feature in cp_features:
# Subset CP data to each individual feature (univariate test)
y = cp_df.loc[:, cp_feature]

# Fit linear model
lm = LinearRegression(fit_intercept=True)
lm_result = lm.fit(X=X_arr, y=y)

# Extract Beta coefficients
# (contribution of feature to X covariates)
coef = lm_result.coef_

# Estimate fit (R^2)
r2_score = lm.score(X=X_arr, y=y)

y_pred = lm.predict(X_arr)

# Calculate the residuals
residuals = y.values - y_pred

# Calculate the Sum of Squares Regression (SSR)
ssr = np.sum((y_pred - np.mean(y.values)) ** 2)

# Calculate the Mean Squares Regression (MSR)
msr = ssr / dfreg

# Calculate the Sum of Squares Residual (SSE) or residual sum of squares
sse = np.sum(residuals**2)

# Calculate the Mean Squares Residual (MSE)
mse = sse / dfres

# Calculate the F-value
f_value = msr / mse

# Calculate the p-value for the F-statistic
p_value = 1 - stats.f.cdf(f_value, dfreg, dfres)

# Add results to a growing list
lm_results.append([cp_feature, r2_score, p_value] + list(coef))


# In[ ]:


# Convert results to a pandas DataFrame
lm_results = pd.DataFrame(
lm_results,
columns=[
"feature",
"r2_score",
"p_value",
"cell_count_coef",
"plate_coef",
"Null_coef",
"WT_coef",
],
)


# ## Controlling for FDR

# In[1]:


# Benjamini-Hochberg procedure has a greater power than Bonferroni, and still controls for FDR
# Here, alpha is the FDR level
rejected, corrected_pvalues, _, _ = multipletests(
lm_results["p_value"].tolist(), method="fdr_bh", alpha=0.05
)

# Store the correcte p values
lm_results["corrected_p_value"] = corrected_pvalues

# Store the critical threshold value in one column
lm_results["critical_threshold"] = [max(corrected_pvalues[rejected])] * len(lm_results)

# Taking the negative log of the corrected p values
lm_results["neg_log_p"] = -np.log10(lm_results["corrected_p_value"])


# ## Save the results

# In[ ]:


lm_results.to_csv(model_data_path, sep="\t", index=False)
Loading