diff --git a/pages/6_Compound_space_visualizer.py b/pages/6_Compound_space_visualizer.py index 8e1d665..316ee2c 100644 --- a/pages/6_Compound_space_visualizer.py +++ b/pages/6_Compound_space_visualizer.py @@ -41,37 +41,6 @@ ) # .block-conatiner controls the padding of the page, .stTabs controls the font size of the text in the tabs -# Functions -def is_numeric(col): - return pd.api.types.is_numeric_dtype(col) - - -def pareto_efficient(costs): - is_efficient = np.ones(costs.shape[0], dtype=bool) - for i, c in enumerate(costs): - if is_efficient[i]: - is_efficient[is_efficient] = np.any( - costs[is_efficient] < c, axis=1 - ) # Keep any point with a lower cost - is_efficient[i] = True # And keep self - return is_efficient - - -def create_excel(): - """Create Excel file to download.""" - wb = Workbook() - ws = wb.active - ws.title = "Pareto Analysis Results" - - for r in dataframe_to_rows(df, index=False, header=True): - ws.append(r) - - excel_file = io.BytesIO() - wb.save(excel_file) - excel_file.seek(0) - return excel_file - - def plot_molecule(smiles): try: mol = Chem.MolFromSmiles(str(smiles)) @@ -115,28 +84,6 @@ def get_svg_download_link(fig): ) -def plot_molecule(smiles): - try: - mol = Chem.MolFromSmiles(str(smiles)) - if mol is not None: - img = Draw.MolToImage(mol, size=(200, 200)) - return img - else: - return None - except: - return None - - -def get_molecule_image_src(smiles): - img = plot_molecule(smiles) - if img: - buffered = io.BytesIO() - img.save(buffered, format="PNG") - img_str = base64.b64encode(buffered.getvalue()).decode() - return f"data:image/png;base64,{img_str}" - return "" - - def safe_calc(mol): if mol is None: return [None] * len(desc_functions) @@ -146,6 +93,33 @@ def safe_calc(mol): return [None] * len(desc_functions) +@st.fragment # Prevents the app from running the code below all the time +def load_dataset(file: str = None): + # File type and parameters + file_type = file.name.split(".")[-1] + + if file_type == "csv": + df = pd.read_csv(file, sep=",") + elif file_type == "xlsx": + sheet_name = st.text_input("Enter sheet name (leave blank for first sheet)", "") + if not sheet_name: + sheet_name = 0 + + df = pd.read_excel(file, sheet_name=sheet_name, engine="openpyxl") + else: + st.error("Please upload a CSV or Excel file.") + return + + df.columns = [i.lower() for i in df.columns] + + # Convert SMILES column to string if it exists + if "smiles" in df.columns: + df["smiles"] = df["smiles"].astype(str) + df["molecule_img"] = df["smiles"].apply(get_molecule_image_src) + + return df + + tab_1, tab_2 = st.tabs( [ "t-SNE Plot", @@ -164,55 +138,28 @@ def safe_calc(mol): st.header("Data loader", anchor="data-loader", divider="gray") # File uploader - uploaded_file = st.file_uploader("Choose a CSV file", type="csv", key="chem-space") - - @st.fragment # Prevents the app from running the code below all the time - def load_dataset(): - # File type and parameters - file_type = uploaded_file.name.split(".")[-1] - - if file_type == "csv": - sep = st.text_input("Enter CSV separator", ";") - df = pd.read_csv(uploaded_file, sep=sep) - else: # Excel - sheet_name = st.text_input( - "Enter sheet name (leave blank for first sheet)", "" - ) - if not sheet_name: - sheet_name = 0 - - df = pd.read_excel(uploaded_file, sheet_name=sheet_name, engine="openpyxl") - - # Convert SMILES column to string if it exists - if "SMILES" in df.columns: - df["SMILES"] = df["SMILES"].astype(str) - df["molecule_img"] = df["SMILES"].apply(get_molecule_image_src) - - return df + uploaded_file = st.file_uploader( + "Choose a CSV file (comma separated) or excel sheet", + type="csv", + key="chem-space", + ) if uploaded_file is None: st.info( "Please upload a CSV or Excel file to proceed! Loading an example file", icon="ℹ️", ) - # df = pd.read_csv("data/E4C_smiles.csv", sep=";") else: - df = load_dataset() - - df.columns = [i.lower() for i in df.columns] + df = load_dataset(uploaded_file) - # Check if required columns exist - if "smiles" not in df.columns or "type" not in df.columns: - st.error("The CSV file must contain 'smiles' and 'type' columns.") - else: - st.info("Reading the CSV file...") - # Convert SMILES to RDKit molecules - mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]] + st.info("Reading the CSV file...") + # Convert SMILES to RDKit molecules + mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]] - # Convert SMILES column to string if it exists - if "smiles" in df.columns: - df["smiles"] = df["smiles"].astype(str) - df["molecule_img"] = df["smiles"].apply(get_molecule_image_src) + # Convert SMILES column to string if it exists + if "smiles" in df.columns: + df["smiles"] = df["smiles"].astype(str) + df["molecule_img"] = df["smiles"].apply(get_molecule_image_src) st.header("Data plotter", anchor="data-plot", divider="gray") st.markdown("### Select plot type and calculate 2D RDKit descriptors") @@ -258,19 +205,19 @@ def load_dataset(): n_iter=500, learning_rate="auto", ) - tsne_results = tsne.fit_transform(descriptors_df) + results = tsne.fit_transform(descriptors_df) elif plot_type == "UMAP": # Perform UMAP umap_model = UMAP( n_components=2, random_state=42, n_neighbors=15, metric="cosine" ) - umap_results = umap_model.fit_transform(descriptors_df) + results = umap_model.fit_transform(descriptors_df) # Create a Bokeh ColumnDataSource source = ColumnDataSource( data=dict( - x=tsne_results[:, 0], - y=tsne_results[:, 1], + x=results[:, 0], + y=results[:, 1], smiles=df["smiles"], type=df["type"], molecule_img=df["molecule_img"], @@ -353,157 +300,151 @@ def load_dataset(): "Upload your compounds (CSV with SMILES column)", type="csv" ) - if uploaded_file is not None: + if uploaded_file is not None and user_file is not None: # Read the CSV file df = pd.read_csv(uploaded_file) + df.columns = [i.lower() for i in df.columns] - # Check if required columns exist - if "SMILES" not in df.columns or "Type" not in df.columns: - st.error("The CSV file must contain 'SMILES' and 'Type' columns.") - else: - # Process user file if uploaded - if user_file is not None: - # Read the user's CSV file - user_df = pd.read_csv(user_file) - - # Check if SMILES column exists - if "SMILES" not in user_df.columns: - st.error("The user's CSV file must contain a 'SMILES' column.") - else: - # Add 'Type' column with 'user' label - user_df["Type"] = "User" - - # Concatenate user_df with the main df - df = pd.concat([df, user_df], ignore_index=True) - df = df.dropna(subset="SMILES") + # Read the user's CSV file + user_df = pd.read_csv(user_file) + user_df.columns = [i.lower() for i in user_df.columns] - st.success("User compounds added successfully.") + # Check if SMILES column exists + if "smiles" not in user_df.columns: + st.error("The user's CSV file must contain a 'smiles' column.") + else: + # Add 'Type' column with 'user' label + user_df["type"] = "User" - # Continue with the analysis - st.success("File uploaded successfully. Generating UMAP plot...") + # Concatenate user_df with the main df + df = pd.concat([df, user_df], ignore_index=True) + df = df.dropna(subset="smiles") - # Filter out invalid molecules - # Convert SMILES to RDKit molecules - mols = [Chem.MolFromSmiles(smiles) for smiles in df["SMILES"]] - mols = [mol for mol in mols if mol is not None] + st.success("User compounds added successfully.") - # Convert SMILES column to string if it exists - if "SMILES" in df.columns: - df["SMILES"] = df["SMILES"].astype(str) - df["molecule_img"] = df["SMILES"].apply(get_molecule_image_src) + # Continue with the analysis + st.success("File uploaded successfully. Generating UMAP plot...") - # Calculate 2D RDKit descriptors - desc_names = [ - "MolWt", - "FractionCSP3", - "HeavyAtomCount", - "NumAliphaticCarbocycles", - "NumAliphaticHeterocycles", - "NumAliphaticRings", - "NumAromaticCarbocycles", - "NumAromaticHeterocycles", - "NumAromaticRings", - "NumHAcceptors", - "NumHDonors", - "NumHeteroatoms", - "NumRotatableBonds", - "NumSaturatedCarbocycles", - "NumSaturatedHeterocycles", - "NumSaturatedRings", - "RingCount", - "MolLogP", - "MolMR", - ] + # Filter out invalid molecules + # Convert SMILES to RDKit molecules + mols = [Chem.MolFromSmiles(smiles) for smiles in df["smiles"]] + mols = [mol for mol in mols if mol is not None] - desc_functions = [(name, getattr(Descriptors, name)) for name in desc_names] - calc = lambda m: [func(m) for _, func in desc_functions] - descriptors = [safe_calc(mol) for mol in mols] - descriptors_df = pd.DataFrame(descriptors, columns=desc_names) - descriptors_df = descriptors_df.dropna() # Remove rows with None values - - # Perform UMAP - umap_model = UMAP( - n_components=2, - random_state=42, - n_neighbors=30, - metric="cosine", - n_epochs=300, - min_dist=0.3, + # Convert SMILES column to string if it exists + if "smiles" in df.columns: + df["smiles"] = df["smiles"].astype(str) + df["molecule_img"] = df["smiles"].apply(get_molecule_image_src) + + # Calculate 2D RDKit descriptors + desc_names = [ + "MolWt", + "FractionCSP3", + "HeavyAtomCount", + "NumAliphaticCarbocycles", + "NumAliphaticHeterocycles", + "NumAliphaticRings", + "NumAromaticCarbocycles", + "NumAromaticHeterocycles", + "NumAromaticRings", + "NumHAcceptors", + "NumHDonors", + "NumHeteroatoms", + "NumRotatableBonds", + "NumSaturatedCarbocycles", + "NumSaturatedHeterocycles", + "NumSaturatedRings", + "RingCount", + "MolLogP", + "MolMR", + ] + + desc_functions = [(name, getattr(Descriptors, name)) for name in desc_names] + calc = lambda m: [func(m) for _, func in desc_functions] + descriptors = [safe_calc(mol) for mol in mols] + descriptors_df = pd.DataFrame(descriptors, columns=desc_names) + descriptors_df = descriptors_df.dropna() # Remove rows with None values + + # Perform UMAP + umap_model = UMAP( + n_components=2, + random_state=42, + n_neighbors=30, + metric="cosine", + n_epochs=300, + min_dist=0.3, + ) + umap_results = umap_model.fit_transform(descriptors_df) + # Ensure df and umap_results are aligned + df = df.loc[descriptors_df.index].reset_index(drop=True) + + source = ColumnDataSource( + data=dict( + x=umap_results[:, 0], + y=umap_results[:, 1], + smiles=df["smiles"], + type=df["type"], + molecule_img=df["molecule_img"], ) - umap_results = umap_model.fit_transform(descriptors_df) - # Ensure df and umap_results are aligned - df = df.loc[descriptors_df.index].reset_index(drop=True) + ) - source = ColumnDataSource( + # Create Bokeh figure + p = figure(width=800, height=600, title="UMAP Plot of 2D RDKit Descriptors") + + # Create a color mapper + colors = Category10[10][: len(df["type"].unique())] + color_mapper = factor_cmap("type", palette=colors, factors=df["type"].unique()) + + # Create the scatter plot + scatter = p.scatter( + "x", + "y", + source=source, + color=color_mapper, + legend_field="type", + size=10, + alpha=0.8, + ) + + # Highlight user compounds + if "User" in df["Type"].unique(): + user_source = ColumnDataSource( data=dict( - x=umap_results[:, 0], - y=umap_results[:, 1], - smiles=df["SMILES"], - type=df["Type"], - molecule_img=df["molecule_img"], + x=umap_results[df["type"] == "User", 0], + y=umap_results[df["type"] == "User", 1], + smiles=df[df["type"] == "User"]["smiles"], + type=df[df["type"] == "User"]["type"], + molecule_img=df[df["type"] == "User"]["molecule_img"], ) ) - - # Create Bokeh figure - p = figure(width=800, height=600, title="UMAP Plot of 2D RDKit Descriptors") - - # Create a color mapper - colors = Category10[10][: len(df["Type"].unique())] - color_mapper = factor_cmap( - "type", palette=colors, factors=df["Type"].unique() - ) - - # Create the scatter plot - scatter = p.scatter( + p.scatter( "x", "y", - source=source, - color=color_mapper, - legend_field="type", - size=10, + source=user_source, + color="red", + size=15, alpha=0.8, + legend_label="User Compounds", ) - - # Highlight user compounds - if "User" in df["Type"].unique(): - user_source = ColumnDataSource( - data=dict( - x=umap_results[df["Type"] == "User", 0], - y=umap_results[df["Type"] == "User", 1], - smiles=df[df["Type"] == "User"]["SMILES"], - type=df[df["Type"] == "User"]["Type"], - molecule_img=df[df["Type"] == "User"]["molecule_img"], - ) - ) - p.scatter( - "x", - "y", - source=user_source, - color="red", - size=15, - alpha=0.8, - legend_label="User Compounds", - ) - # Create tooltip HTML - tooltip_html = """ + # Create tooltip HTML + tooltip_html = """ +