Skip to content

Commit

Permalink
cb fix and org molecule
Browse files Browse the repository at this point in the history
  • Loading branch information
sunal1996 committed Sep 12, 2023
1 parent 85da1b9 commit f24d28c
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 51 deletions.
6 changes: 3 additions & 3 deletions src/aposteriori/data_prep/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
)



# {{{ CLI
@click.command()
@click.argument(
Expand Down Expand Up @@ -88,7 +87,8 @@
default="",
type=click.Path(),
help=(
"Path to a .csv or a .txt file that contains organic molecules (cofactors etc.) to be considered during frame creation"
"Path to a Pieces format file used to filter the dataset to specific chains in"
"specific files. All other PDB files included in the input will be ignored."
),
)
@click.option(
Expand Down Expand Up @@ -140,7 +140,7 @@
default="CNO",
required=True,
help=(
"Encodes atoms in different channels, depending on atom types. Default is CNO, other options are ´CNOCB´,`CNOCACB`, `BackSideOrg` `BackCBSideOrg`. If the user wishes to keep only the backbone atoms of polypeptides and encode CB and CA as carbon atoms, they can proceed with CNO encoding. If the user wishes to put Cb or Cb and Ca in different channels, then they can use CNOCB and CNOCACB respectively. If the user wishes to include side-chain atoms, or other organic molecule atoms, they can use BackSideOrg (if they wish to consider CB as a side-chain atom) or BackCBSideOrg(if they wish to consider CB in a separate channel)."
"Encodes atoms in different channels, depending on atom types. Default is CNO, other options are ´CNOCB´ and `CNOCACB` to encode the Cb or Cb and Ca in different channels respectively."
),
)
@click.option(
Expand Down
111 changes: 63 additions & 48 deletions src/aposteriori/data_prep/create_frame_data_set.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
PDB_REQUEST_URL,
UNCOMMON_RESIDUE_DICT,
)
from scipy import rand

# {{{ Types
@dataclass
Expand Down Expand Up @@ -183,7 +184,7 @@ def encode_gaussian_atom(
Returns
-------
atom_encoding: np.ndarray
atom_encoding: np.ndarrayk
Boolean array with atom encoding of shape (3, 3, 3, encoder_length,)
"""
Expand All @@ -203,8 +204,8 @@ def encode_gaussian_atom(
if atom_label.upper() == "CA":
atom_label = "C"
atom_idx = self.label_to_encoding[atom_label]
elif self.encoder_length == 8:
# In this scenario, Ca is a carbon atom but Cb has a separate channel
elif self.encoder_length > 5:
# Encode side chain atoms or organic molecule atoms:
if atom_label.startswith("C"):
atom_label = "CS"
atom_idx = self.label_to_encoding[atom_label]
Expand All @@ -223,7 +224,7 @@ def encode_gaussian_atom(
raise ValueError(
f"{atom_label} not found in {self.atomic_labels} encoding."
)
# If label to encode is C, N, O:
# If label to encode is C, N, O or S:
if atom_label in ATOM_VANDERWAAL_RADII.keys():
# Get encoding:
atomic_radius = ATOM_VANDERWAAL_RADII[atom_label]
Expand Down Expand Up @@ -328,7 +329,7 @@ def encode_cb_to_ampal_residue(residue: ampal.Residue):

def encode_cb_prevox(residue: ampal.Residue):
"""
Encodes a Cb atom to an AMPAL residue. The Cb is added to an average position
Encodes a Cb atom to all of the AMPAL residues before the voxelisation begins. The Cb is added to an average position
calculated by averaging the Cb coordinates of the aligned frames for the 1QYS protein.
Parameters
Expand Down Expand Up @@ -687,12 +688,15 @@ def create_residue_frame(
assembly = residue.parent.parent
chain = residue.parent
align_to_residue_plane(residue)

print(residue["CB"])
frame = np.zeros(
(voxels_per_side, voxels_per_side, voxels_per_side, codec.encoder_length),
)

)
#Encode the residue information separately, will be further updated later.
# resinfo = np.zeros(
# (voxels_per_side, voxels_per_side, voxels_per_side, 2),
#)

# Change frame type to float if gaussian else use bool:
frame = frame.astype(np.float16) if voxels_as_gaussian else frame.astype(np.bool_)
Expand All @@ -709,6 +713,8 @@ def create_residue_frame(
ass = atom.parent.parent.parent
cha = atom.parent.parent
res = atom.parent
print(indices)
print(atom)
assert (atom.element != "") or (atom.element != " "), (
f"Atom element should not be blank:\n"
f"{atom.chain}:{atom.res_num}:{atom.res_label}"
Expand All @@ -726,6 +732,7 @@ def create_residue_frame(
# If the voxel is a gaussian, there may be remnants of a nearby atom
# hence this test would fail
if not voxels_as_gaussian:
#Since CB vector is added pre-voxelisation, one must remove the CB from the consideration for all channels being empty.
if not atom.res_label == "CB":
np.testing.assert_array_equal(
frame[indices], np.array([False] * len(frame[indices]), dtype=bool)
Expand All @@ -748,10 +755,14 @@ def create_residue_frame(
atom_coord=indices,
atom_idx=atom_idx,
)
if atom_filter_fn(atom) == keep_sidechains(atom) and res.mol_code == residue.mol_code:
for index in range(5, getattr(codec,"encoder_length")):
if frame[indices][index] == True:
frame[indices][index] = False
else:
# Encode atom as voxel:
frame[indices] = Codec.encode_atom(codec, atom.res_label)
#Remove side chains from the residue that is being voxelised.
#Remove side chains from the residue that is being voxelised
if atom_filter_fn(atom) == keep_sidechains(atom) and res.mol_code == residue.mol_code:
for index in range(5, getattr(codec,"encoder_length")):
if frame[indices][index] == True:
Expand Down Expand Up @@ -799,50 +810,55 @@ def voxelise_assembly(
codec,
voxels_as_gaussian,
):

total_atoms = len(list(assembly))

#If keep_side_chains atom filter chosen, keep side chain atoms. Also, if organic_cofactors are chosen, keep them for now. Water molecules will be removed.

for atom in assembly.get_atoms():
if not atom_filter_fn(atom) and not organic_cofactors(atom, cfile):
del atom.parent.atoms[atom.res_label]
del atom

remaining_atoms = len(list(assembly.get_atoms()))
print(f"{name}: Filtered {total_atoms - remaining_atoms} of {total_atoms} atoms.")

#Get the residue ids for keeping random side-chains
# Filters atoms not related to assembly:

res_index = (residue_number_indexing(assembly))
rand_side_chain = rnd.sample(range(len(res_index)), int(len(res_index)*keep_side_chain_portion))
total_atoms = len(list(assembly.get_atoms()))

for atom in assembly.get_atoms():
if (atom.parent.id) not in rand_side_chain:
#Remove side chains if the side chains are not in random selection.
if not default_atom_filter(atom) and not organic_cofactors(atom, cfile):
del atom.parent.atoms[atom.res_label]
del atom
elif isinstance (atom.parent, ampal.Ligand):
#If some organic molecules ended up in the random choice and if they are not of interest, remove them.
if keep_side_chain_portion!=0: # This is the situation when some of the side chain information is being kept.
#First round of filtering is done to remove water molecules. It is useful particularly if keep_side_chain_portion is used since it will be important to accurately determine number of residues left.
for atom in assembly.get_atoms():
if not atom_filter_fn(atom) and not organic_cofactors(atom, cfile):
del atom.parent.atoms[atom.res_label]
del atom.parent.atoms[atom.res_label]
del atom

#Encode the cb vector for all the residues before the voxelisation begins.
if encode_cb:
for chain in assembly:
if not isinstance(chain, ampal.Polypeptide):
continue
for residue in chain:
encode_cb_prevox(residue)

remaining_atoms = len(list(assembly.get_atoms()))

#A secondary checkpoint on how many atoms are left after side-chain keeping, CB addition etc.
print(remaining_atoms)

print(f"{name}: Filtered {total_atoms - remaining_atoms} of {total_atoms} atoms.")
remaining_atoms = len(list(assembly.get_atoms()))
print(f"{name}: Filtered {total_atoms - remaining_atoms} of {total_atoms} atoms.")
#Put the residues in numeric order for random picking.
res_index = (residue_number_indexing(assembly))
global rand_side_chain # defined as global since it may need to be used in create_dataset_frame function later.
rand_side_chain = rnd.sample(range(len(res_index)), int(len(res_index)*keep_side_chain_portion))
for atom in assembly.get_atoms():
if (atom.parent.id) not in rand_side_chain:
if not default_atom_filter(atom) and not organic_cofactors(atom, cfile):
del atom.parent.atoms[atom.res_label]
del atom
elif isinstance (atom.parent, ampal.Ligand):
if not atom_filter_fn(atom) and not organic_cofactors(atom, cfile):
del atom.parent.atoms[atom.res_label]
del atom
if encode_cb:
for chain in assembly:
if not isinstance(chain, ampal.Polypeptide):
continue
for residue in chain:
encode_cb_prevox(residue)
remaining_atoms = len(list(assembly.get_atoms()))
print(f"{name}: Total atoms to proceed with after CB addition (or none) is {remaining_atoms}")
else:
for atom in assembly.get_atoms():
if not default_atom_filter(atom) and not organic_cofactors(atom, cfile):
del atom.parent.atoms[atom.res_label]
del atom
remaining_atoms = len(list(assembly.get_atoms()))
print(f"{name}: Filtered {total_atoms - remaining_atoms} of {total_atoms} atoms.")
if encode_cb:
for chain in assembly:
if not isinstance(chain, ampal.Polypeptide):
continue
for residue in chain:
encode_cb_prevox(residue)
remaining_atoms = len(list(assembly.get_atoms()))
print(f"{name}: Total atoms to proceed with after CB addition (or none) is {remaining_atoms}")

for chain in assembly:
if chain_filter_list:
Expand Down Expand Up @@ -1053,7 +1069,6 @@ def keep_sidechains(atom: ampal.Atom) -> bool:
return False

def residue_number_indexing(assembly:ampal.Assembly):
"""Re-numbers the residues on the protein, so that side-chain portions can be kept later on"""
count=0
residue_number=[]
ligands=assembly.get_ligands()
Expand Down

0 comments on commit f24d28c

Please sign in to comment.