-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpotential_calc.py
90 lines (66 loc) · 3.54 KB
/
potential_calc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# potential_calc
# This file contains functions to calculate potential from contacts of the training set and given an imput TCR.
# 1) calculate_potential(contacts_df, peptide=False)
# 2) get_potential(row, df_residues, col_id)
#Import libraries
import pandas as pd
import numpy as np
def calculate_potential(df, peptide=False):
"""
Calculate potential for all residue pairs based on observed and expected frequencies.
Args:
df (pd.DataFrame): DataFrame containing residue contact information.
Returns:
pd.DataFrame: DataFrame with all possible residue pairs and their potential.
"""
residues = list('ACDEFGHIKLMNPQRSTVWY') # 20 standard amino acids
if peptide==True:
residues_p = list('ADEFGHIKLMNPQRSTVWY') #not C
all_pairs = pd.DataFrame([(a, b) for a in residues_p for b in residues], columns=['residue_from', 'residue_to'])
else:
all_pairs = pd.DataFrame([(a, b) for a in residues for b in residues], columns=['residue_from', 'residue_to'])
# Count the number of observed contacts
contact_counts = df.groupby(['residue_from', 'residue_to']).size().reset_index(name='count')
# Add pseudocount to each pair
contact_counts['count'] += 1 # Add pseudocount
# Merge with all_pairs to ensure every possible residue pair is included
contact_counts = all_pairs.merge(contact_counts, on=['residue_from', 'residue_to'], how='left')
# Replace NaN with pseudocount for missing pairs
contact_counts['count'] = contact_counts['count'].fillna(1)
# Calculate observed frequencies pobs(a, b)
total_contacts = contact_counts['count'].sum()
contact_counts['pobs'] = contact_counts['count'] / total_contacts
# Calculate pobs(a) and pobs(b)
pobs_a = contact_counts.groupby('residue_from')['pobs'].sum().reset_index(name='pobs_a')
pobs_b = contact_counts.groupby('residue_to')['pobs'].sum().reset_index(name='pobs_b')
# Merge to get pobs(a) and pobs(b) for each pair
contact_counts = contact_counts.merge(pobs_a, left_on='residue_from', right_on='residue_from')
contact_counts = contact_counts.merge(pobs_b, left_on='residue_to', right_on='residue_to')
# Calculate expected frequencies pexp(a, b)
contact_counts['pexp'] = contact_counts['pobs_a'] * contact_counts['pobs_b']
# Calculate potential
contact_counts['potential'] = -np.log(contact_counts['pobs'] / contact_counts['pexp'])
return contact_counts[['residue_from', 'residue_to', 'potential']]
def get_potential(row, df_residues, col1_id, col2_id):
"""
Retrieve the potential value for a given residue pair from a DataFrame.
Args:
row (pd.Series): A row from the DataFrame containing contact information.
df_residues (pd.DataFrame): DataFrame containing potential values.
col_id (str): The column name that identifies the residue in the row.
Returns:
float or None: The potential value if a match is found; otherwise, None.
"""
try:
# Get the residue values from the row
residue_from = row[col1_id]
residue_to = row[col2_id]
# Find the matching rows in the residues DataFrame
match = df_residues[(df_residues['residue_from'] == residue_from) &
(df_residues['residue_to'] == residue_to)]
if not match.empty:
return match['potential'].values[0]
return None
except KeyError as e:
print(f"Error: Key {e} not found in row or DataFrame.")
return None