Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Linear model creation and interpretation #5

Merged
merged 12 commits into from
Jun 16, 2023
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
,camo,oak,16.06.2023 16:06,file:///home/camo/.config/libreoffice/4;
1,162 changes: 1,162 additions & 0 deletions 1.train_models/linear_reg_plate_1_2_norm_data/data/model_properties.tsv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,220 @@
#!/usr/bin/env python
# coding: utf-8

# # Fit a linear model on cell morphology features
#
# Our aim is to determine which features significantly contribute to NF1 genotype adjusted by covariates. This notebook is adapted from Dr. Way at:
# https://github.com/WayScience/Benchmarking_NF1_data/blob/main/5_analyze_data/notebooks/linear_model/fit_linear_model.ipynb

# In[ ]:


# Standard library
import sys
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.linear_model import LinearRegression
from statsmodels.stats.multitest import multipletests

# ## Finding the git root directory to reference paths on any system

# In[ ]:


# Walk from the working directory up through its parents and take the
# first directory that contains a .git folder.
cwd = Path.cwd()
root_dir = next(
    (path for path in [cwd, *cwd.parents] if (path / ".git").is_dir()), None
)

# Check if a Git root directory was found
if root_dir is None:
    raise FileNotFoundError("No Git root directory found.")


# In[ ]:


# Make the repo-local utils package importable regardless of where the
# script is launched from.
sys.path.append(f"{root_dir}/utils")
import preprocess_utils as ppu

# ## Specify paths

# In[ ]:


# Output directory for the fitted-model results
data_path = Path("data")

data_path.mkdir(
    parents=True, exist_ok=True
)  # Create the parent directories if they don't exist

# Location of the normalized single-cell profiles produced by the
# upstream nf1_painting_repo feature-processing step
plate_path = Path(
    f"{root_dir}/nf1_painting_repo/3.processing_features/data/normalized_data"
)

# Tab-separated output file holding one row of model statistics per feature
model_data_path = f"{data_path}/model_properties.tsv"


# ## Combine data

# In[ ]:


filename1 = "Plate_1_sc_norm.parquet"
filename2 = "Plate_2_sc_norm.parquet"

path1 = plate_path / filename1
path2 = plate_path / filename2

po1 = ppu.Preprocess_data(path=path1)
po2 = ppu.Preprocess_data(path=path2)

plate1df = po1.df # Returns the dataframe generated by the csv
plate2df = po2.df # Returns the dataframe generated by the csv

# Set plate column:
plate1df["Metadata_plate"] = "1"
plate2df["Metadata_plate"] = "2"

common_columns = list(plate1df.columns.intersection(plate2df.columns))
plate1df = plate1df.loc[:, common_columns]
plate2df = plate2df.loc[:, common_columns]

# Combine the plate dataframes:
cp_df = pd.concat([plate1df, plate2df], axis="rows")


# ## Preprocessing

# In[ ]:


# Removes columns that contain all zeros
cp_df = cp_df.loc[:, (cp_df != 0).any(axis=0)]
cp_df.reset_index(inplace=True, drop=True)

# Removes metadata
pcp_features = po1.remove_meta(cp_df)
MattsonCam marked this conversation as resolved.
Show resolved Hide resolved
cp_features = pcp_features.columns

# Setup linear modeling framework
variables = ["Metadata_number_of_singlecells", "Metadata_plate"]
X = cp_df.loc[:, variables]

# Add dummy matrix of categorical genotypes
genotype_x = pd.get_dummies(data=cp_df.Metadata_genotype).astype(int)

X = pd.concat([X, genotype_x], axis=1)
X_arr = X.values

dfreg = X.shape[1]
dfres = X.shape[0] - dfreg - 1


# ## Fit linear model

# In[ ]:


# Fit linear model for each feature
MattsonCam marked this conversation as resolved.
Show resolved Hide resolved
lm_results = []
i = 0
for cp_feature in cp_features:
# Subset CP data to each individual feature (univariate test)
y = cp_df.loc[:, cp_feature]

# Fit linear model
lm = LinearRegression(fit_intercept=True)
lm_result = lm.fit(X=X_arr, y=y)

# Extract Beta coefficients
# (contribution of feature to X covariates)
coef = lm_result.coef_

# Estimate fit (R^2)
r2_score = lm.score(X=X_arr, y=y)

y_pred = lm.predict(X_arr)

# Calculate the residuals
residuals = y.values - y_pred

# Calculate the Sum of Squares Regression (SSR)
ssr = np.sum((y_pred - np.mean(y.values)) ** 2)

# Calculate the Mean Squares Regression (MSR)
msr = ssr / dfreg

# Calculate the Sum of Squares Residual (SSE) or residual sum of squares
sse = np.sum(residuals**2)

# Calculate the Mean Squares Residual (MSE)
mse = sse / dfres

# Calculate the F-value
f_value = msr / mse

# Calculate the p-value for the F-statistic
p_value = 1 - stats.f.cdf(f_value, dfreg, dfres)

# Add results to a growing list
lm_results.append([cp_feature, r2_score, p_value] + list(coef))


# In[ ]:


# Convert results to a pandas DataFrame
lm_results = pd.DataFrame(
lm_results,
columns=[
"feature",
"r2_score",
"p_value",
"cell_count_coef",
"plate_coef",
"Null_coef",
"WT_coef",
],
)


# ## Controlling for FDR

# In[1]:


# Benjamini-Hochberg procedure has a greater power than Bonferroni, and still controls for FDR
# Here, alpha is the FDR level
rejected, corrected_pvalues, _, _ = multipletests(
lm_results["p_value"].tolist(), method="fdr_bh", alpha=0.05
)

# Store the correcte p values
lm_results["corrected_p_value"] = corrected_pvalues

# Store the critical threshold value in one column
lm_results["critical_threshold"] = [max(corrected_pvalues[rejected])] * len(lm_results)

# Taking the negative log of the corrected p values
lm_results["neg_log_p"] = -np.log10(lm_results["corrected_p_value"])


# ## Save the results

# In[ ]:


lm_results.to_csv(model_data_path, sep="\t", index=False)
Loading