arer90
code 241111
f2d6c5c
raw
history blame
34.8 kB
import os
import numpy as np
import pandas as pd
import gc
from concurrent.futures import ProcessPoolExecutor, as_completed
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Draw
from rdkit import RDConfig
from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski, rdDistGeom, rdPartialCharges
from rdkit.Chem.AllChem import GetMorganGenerator
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Avalon.pyAvalonTools import GetAvalonFP
from rdkit.Chem.Descriptors import ExactMolWt
def mol3d(mol):
mol = Chem.AddHs(mol)
optimization_methods = [
(AllChem.EmbedMolecule, (mol, AllChem.ETKDGv3()), {}),
(AllChem.UFFOptimizeMolecule, (mol,), {'maxIters': 200}),
(AllChem.MMFFOptimizeMolecule, (mol,), {'maxIters': 200})
]
for method, args, kwargs in optimization_methods:
try:
method(*args, **kwargs)
if mol.GetNumConformers() > 0:
return mol
except ValueError as e:
print(f"Error: {e} - Trying next optimization method [{method}]")
print(f"Invalid mol for 3d {Chem.MolToSmiles(mol)} - No conformer generated")
return None
def generating_newfps(fps, descriptor, descriptor_name, save_res="np"):
try:
if descriptor is None:
return fps
if save_res == "pd":
new_fps = pd.DataFrame(fps) if not isinstance(fps, pd.DataFrame) else fps
if isinstance(descriptor, np.ndarray) and descriptor.ndim >= 2:
try:
descriptors_df = pd.DataFrame(
{f"{descriptor_name}_{i+1}": descriptor[:, i] for i in range(descriptor.shape[1])}
)
new_fps = pd.concat([new_fps, descriptors_df], axis=1)
del descriptor
except Exception as e:
print(f"[-1-] Error occured: {e}")
elif isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
try:
arrays_1d = [arr[:, None] for arr in descriptor if arr.ndim == 1]
arrays_2d = [arr for arr in descriptor if arr.ndim == 2]
combined_1d = np.concatenate(arrays_1d, axis=1) if arrays_1d else None
combined_2d = np.concatenate(arrays_2d, axis=1) if arrays_2d else None
if combined_1d is not None:
df_1d = pd.DataFrame(
combined_1d,
columns=[f'{descriptor_name}_{i+1}' for i in range(combined_1d.shape[1])]
)
new_fps = pd.concat([new_fps, df_1d], axis=1)
if combined_2d is not None:
df_2d = pd.DataFrame(
combined_2d,
columns=[f'{descriptor_name}_{i+1}' for i in range(combined_2d.shape[1])]
)
new_fps = pd.concat([new_fps, df_2d], axis=1)
del descriptor, arrays_1d, arrays_2d
if combined_1d is not None: del combined_1d
if combined_2d is not None: del combined_2d
except Exception as e:
print(f"[-2-] Error occured: {e}")
elif isinstance(descriptor, list) and isinstance(descriptor[0], list):
try:
descriptor = np.asarray(descriptor).astype('float')
descriptors_df = pd.DataFrame(
{f"{descriptor_name}_{i+1}": descriptor[:, i] for i in range(descriptor.shape[1])}
)
new_fps = pd.concat([new_fps, descriptors_df], axis=1)
del descriptor
except Exception as e:
print(f"[-3-] Error occured: {e}")
else:
descriptor = np.asarray(descriptor).astype('float')
new_fps[descriptor_name] = descriptor.flatten()
del descriptor
new_fps = new_fps.replace([np.inf, -np.inf], np.nan).fillna(0)
return new_fps
else:
new_fps = fps
if descriptor is None:
pass
elif isinstance(descriptor, np.ndarray) and descriptor.ndim >= 2:
try:
new_fps = np.concatenate([new_fps, descriptor], axis=1)
del descriptor
except Exception as e:
print(f"[-1-] Error occured: {e}")
elif isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
try:
arrays_1d = [arr[:, None] for arr in descriptor if arr.ndim == 1]
arrays_2d = [arr for arr in descriptor if arr.ndim == 2]
combined_1d = np.concatenate(arrays_1d, axis=1) if arrays_1d else None
combined_2d = np.concatenate(arrays_2d, axis=1) if arrays_2d else None
to_concat = [new_fps] + [arr for arr in [combined_1d, combined_2d] if arr is not None]
new_fps = np.concatenate(to_concat, axis=1)
del descriptor, arrays_1d, arrays_2d
if combined_1d is not None: del combined_1d
if combined_2d is not None: del combined_2d
except Exception as e:
print(f"[-2-] Error occured: {e}")
elif isinstance(descriptor, list) and isinstance(descriptor[0], list):
try:
descriptor = np.asarray(descriptor).astype('float')
new_fps = np.concatenate([new_fps, descriptor], axis=1)
del descriptor
except Exception as e:
print(f"[-3-] Error occured: {e}")
else:
descriptor = np.asarray(descriptor).astype('float')
new_fps = np.concatenate([new_fps, descriptor[:,None]], axis=1)
del descriptor
new_fps = np.nan_to_num(new_fps, nan=0.0, posinf=0.0, neginf=0.0).astype('float')
return new_fps
except Exception as e:
print(f"Error occurred in {descriptor_name}: {e}")
return fps
def Normalization(descriptor):
descriptor = np.asarray(descriptor)
epsilon = 1e-10
max_value = 1e15
descriptor = np.clip(descriptor, -max_value, max_value)
descriptor_custom = np.where(np.abs(descriptor) < epsilon, epsilon, descriptor)
descriptor_log = np.sign(descriptor_custom) * np.log1p(np.abs(descriptor_custom))
descriptor_log = np.nan_to_num(descriptor_log, nan=0.0, posinf=0.0, neginf=0.0)
del epsilon
gc.collect()
return descriptor_log
def values_chi(mol, chi_type):
i = 0
chi_func = Chem.GraphDescriptors.ChiNn_ if chi_type == 'n' else Chem.GraphDescriptors.ChiNv_
while chi_func(mol, i) != 0.0:
i += 1
return np.array([chi_func(mol, j) for j in range(i)])
def generate_chi(mols, chi_type):
n_jobs = os.cpu_count()
with ProcessPoolExecutor(max_workers=n_jobs) as executor:
futures = [executor.submit(values_chi, mol, chi_type) for mol in mols]
descriptor = [future.result() for future in futures]
max_length = max(len(x) for x in descriptor)
padded_descriptor = np.array([np.pad(x, (0, max_length - len(x)), 'constant') for x in descriptor])
return padded_descriptor
def sanitize_and_compute_descriptor(mol):
try:
mol = Chem.RemoveHs(mol)
Chem.SanitizeMol(mol)
try:
Chem.rdPartialCharges.ComputeGasteigerCharges(mol)
except Exception as e:
print(f"Gasteiger charge calculation failed: {e}")
return [0] * 8
try:
return Chem.rdMolDescriptors.BCUT2D(mol)
except Exception as e:
print(f"BCUT2D calculation failed: {e}")
return [Descriptors.MolWt(mol)] * 8
except Exception as e:
return [0] * 8
def compute_descriptors_parallel(mols, n_jobs=None):
with ProcessPoolExecutor(max_workers=n_jobs) as executor:
futures = [executor.submit(sanitize_and_compute_descriptor, mol) for mol in mols if mol is not None]
descriptors = [future.result() for future in futures]
return np.array(descriptors)
def process_molecules_parallel(mols, max_workers=4, chunk_size=100):
results = []
for i in range(0, len(mols), chunk_size):
chunk = mols[i:i + chunk_size]
with ProcessPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(mol3d, mol) for mol in chunk]
for future in as_completed(futures):
result = future.result()
if result is not None:
results.append(result)
gc.collect()
return results
def search_data_descriptor_compress(trial, fps, mols, name, target_path="result", save_res="np"):
####################################
phase0 = 1 #trial.suggest_int("MolWt", 0, 1) # 'MolWt'
phase1 = 1 #trial.suggest_int("MolLogP", 0, 1) # 'MolLogP'
phase2 = 1 #trial.suggest_int("MolMR", 0, 1) # 'MolMR'
phase3 = 1 #trial.suggest_int("TPSA", 0, 1) # 'TPSA'
phase4 = trial.suggest_int("NumRotatableBonds", 0, 1) # 'NumRotatableBonds'
phase5 = trial.suggest_int("HeavyAtomCount", 0, 1) # 'HeavyAtomCount'
phase6 = trial.suggest_int("NumHAcceptors", 0, 1) # 'NumHAcceptors'
phase7 = trial.suggest_int("NumHDonors", 0, 1) # 'NumHDonors'
phase8 = trial.suggest_int("NumHeteroatoms", 0, 1) # 'NumHeteroatoms'
phase9 = trial.suggest_int("NumValenceElectrons", 0, 1) # 'NumValenceElectrons'
phase10 = trial.suggest_int("NHOHCount", 0, 1) # 'NHOHCount'
phase11 = trial.suggest_int("NOCount", 0, 1) # 'NOCount'
phase12 = trial.suggest_int("RingCount", 0, 1) # 'RingCount'
phase13 = trial.suggest_int("NumAromaticRings", 0, 1) # 'NumAromaticRings'
phase14 = trial.suggest_int("NumSaturatedRings", 0, 1) # 'NumSaturatedRings'
phase15 = trial.suggest_int("NumAliphaticRings", 0, 1) # 'NumAliphaticRings'
phase16 = trial.suggest_int("LabuteASA", 0, 1) # 'LabuteASA'
phase17 = trial.suggest_int("BalabanJ", 0, 1) # 'BalabanJ'
phase18 = trial.suggest_int("BertzCT", 0, 1) # 'BertzCT'
phase19 = trial.suggest_int("Ipc", 0, 1) # 'Ipc'
phase20 = trial.suggest_int("kappa_Series[1-3]_ind", 0, 1) # 'kappa_Series[1-3]_ind'
phase21 = trial.suggest_int("Chi_Series[13]_ind", 0, 1) # 'Chi_Series[13]_ind'
phase22 = trial.suggest_int("Phi", 0, 1) # 'Phi'
phase23 = trial.suggest_int("HallKierAlpha", 0, 1) # 'HallKierAlpha'
phase24 = trial.suggest_int("NumAmideBonds", 0, 1) # 'NumAmideBonds'
phase25 = trial.suggest_int("FractionCSP3", 0, 1) # 'FractionCSP3'
phase26 = trial.suggest_int("NumSpiroAtoms", 0, 1) # 'NumSpiroAtoms'
phase27 = trial.suggest_int("NumBridgeheadAtoms", 0, 1) # 'NumBridgeheadAtoms'
phase28 = trial.suggest_int("PEOE_VSA_Series[1-14]_ind", 0, 1) # 'PEOE_VSA_Series[1-14]_ind'
phase29 = trial.suggest_int("SMR_VSA_Series[1-10]_ind", 0, 1) # 'SMR_VSA_Series[1-10]_ind'
phase30 = trial.suggest_int("SlogP_VSA_Series[1-12]_ind", 0, 1)# 'SlogP_VSA_Series[1-12]_ind'
phase31 = trial.suggest_int("EState_VSA_Series[1-11]_ind", 0, 1)# 'EState_VSA_Series[1-11]_ind'
phase32 = trial.suggest_int("VSA_EState_Series[1-10]", 0, 1) # 'VSA_EState_Series[1-10]'
phase33 = trial.suggest_int("MQNs", 0, 1) # 'MQNs'
phase34 = trial.suggest_int("AUTOCORR2D", 0, 1) # 'AUTOCORR2D'
phase35 = trial.suggest_int("BCUT2D", 0, 1) # 'BCUT2D'
phase36 = trial.suggest_int("Asphericity", 0, 1) # 'Asphericity'
phase37 = trial.suggest_int("PBF", 0, 1) # 'PBF'
phase38 = trial.suggest_int("RadiusOfGyration", 0, 1) # 'RadiusOfGyration'
phase39 = trial.suggest_int("InertialShapeFactor", 0, 1) # 'InertialShapeFactor'
phase40 = trial.suggest_int("Eccentricity", 0, 1) # 'Eccentricity'
phase41 = trial.suggest_int("SpherocityIndex", 0, 1) # 'SpherocityIndex'
phase42 = trial.suggest_int("PMI_series[1-3]_ind", 0, 1) # 'PMI_series[1-3]_ind'
phase43 = trial.suggest_int("NPR_series[1-2]_ind", 0, 1) # 'NPR_series[1-2]_ind'
phase44 = trial.suggest_int("AUTOCORR3D", 0, 1) # 'AUTOCORR3D'
phase45 = trial.suggest_int("RDF", 0, 1) # 'RDF'
phase46 = trial.suggest_int("MORSE", 0, 1) # 'MORSE'
phase47 = trial.suggest_int("WHIM", 0, 1) # 'WHIM'
phase48 = trial.suggest_int("GETAWAY", 0, 1) # 'GETAWAY'
####################################
def clear_descriptor_memory(descriptor):
del descriptor
gc.collect()
####################################
####################################
if phase0 == 1:
descriptor = [Descriptors.ExactMolWt(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'MolWt', save_res)
clear_descriptor_memory(descriptor)
if phase1 == 1:
descriptor = [Chem.Crippen.MolLogP(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'MolLogP', save_res)
clear_descriptor_memory(descriptor)
if phase2 == 1:
descriptor = [Chem.Crippen.MolMR(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'MolMR', save_res)
clear_descriptor_memory(descriptor)
if phase3 == 1:
descriptor = [Descriptors.TPSA(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'TPSA', save_res)
clear_descriptor_memory(descriptor)
if phase4 == 1:
descriptor = [Chem.Lipinski.NumRotatableBonds(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumRotatableBonds', save_res)
clear_descriptor_memory(descriptor)
if phase5 == 1:
descriptor = [Chem.Lipinski.HeavyAtomCount(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'HeavyAtomCount', save_res)
clear_descriptor_memory(descriptor)
if phase6 == 1:
descriptor = [Chem.Lipinski.NumHAcceptors(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumHAcceptors', save_res)
clear_descriptor_memory(descriptor)
if phase7 == 1:
descriptor = [Chem.Lipinski.NumHDonors(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumHDonors', save_res)
clear_descriptor_memory(descriptor)
if phase8 == 1:
descriptor = [Chem.Lipinski.NumHeteroatoms(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumHeteroatoms', save_res)
clear_descriptor_memory(descriptor)
if phase9 == 1:
descriptor = [Chem.Descriptors.NumValenceElectrons(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumValenceElectrons', save_res)
clear_descriptor_memory(descriptor)
if phase10 == 1:
descriptor = [Chem.Lipinski.NHOHCount(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NHOHCount', save_res)
clear_descriptor_memory(descriptor)
if phase11 == 1:
descriptor = [Chem.Lipinski.NOCount(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NOCount', save_res)
clear_descriptor_memory(descriptor)
if phase12 == 1:
descriptor = [Chem.Lipinski.RingCount(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'RingCount', save_res)
clear_descriptor_memory(descriptor)
if phase13 == 1:
descriptor = [Chem.Lipinski.NumAromaticRings(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumAromaticRings', save_res)
clear_descriptor_memory(descriptor)
if phase14 == 1:
descriptor = [Chem.Lipinski.NumSaturatedRings(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumSaturatedRings', save_res)
clear_descriptor_memory(descriptor)
if phase15 == 1:
descriptor = [Chem.Lipinski.NumAliphaticRings(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumAliphaticRings', save_res)
clear_descriptor_memory(descriptor)
if phase16 == 1:
descriptor = [Chem.rdMolDescriptors.CalcLabuteASA(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'LabuteASA', save_res)
clear_descriptor_memory(descriptor)
if phase17 == 1:
descriptor = [Chem.GraphDescriptors.BalabanJ(alpha) for alpha in mols]
# descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'BalabanJ', save_res)
clear_descriptor_memory(descriptor)
if phase18 == 1:
descriptor = [Chem.GraphDescriptors.BertzCT(alpha) for alpha in mols]
# descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'BertzCT', save_res)
clear_descriptor_memory(descriptor)
if phase19 == 1:
descriptor = [Chem.GraphDescriptors.Ipc(alpha) for alpha in mols]
descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'Ipc', save_res)
clear_descriptor_memory(descriptor)
if phase20 == 1:
d1 = [Chem.GraphDescriptors.Kappa1(alpha) for alpha in mols]
d2 = [Chem.GraphDescriptors.Kappa2(alpha) for alpha in mols]
d3 = [Chem.GraphDescriptors.Kappa3(alpha) for alpha in mols]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
fps = generating_newfps(fps, [d1,d2,d3], 'kappa_Series[1-3]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
if phase21 == 1:
d1 = [Chem.GraphDescriptors.Chi0(alpha) for alpha in mols]
d2 = [Chem.GraphDescriptors.Chi0n(alpha) for alpha in mols]
d3 = [Chem.GraphDescriptors.Chi0v(alpha) for alpha in mols]
d4 = [Chem.GraphDescriptors.Chi1(alpha) for alpha in mols]
d5 = [Chem.GraphDescriptors.Chi1n(alpha) for alpha in mols]
d6 = [Chem.GraphDescriptors.Chi1v(alpha) for alpha in mols]
d7 = [Chem.GraphDescriptors.Chi2n(alpha) for alpha in mols]
d8 = [Chem.GraphDescriptors.Chi2v(alpha) for alpha in mols]
d9 = [Chem.GraphDescriptors.Chi3n(alpha) for alpha in mols]
d10 = [Chem.GraphDescriptors.Chi3v(alpha) for alpha in mols]
d11 = [Chem.GraphDescriptors.Chi4n(alpha) for alpha in mols]
d12 = [Chem.GraphDescriptors.Chi4v(alpha) for alpha in mols]
d13 = generate_chi(mols, 'n')
d14 = generate_chi(mols, 'v')
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
d4 = np.asarray(d4)
d5 = np.asarray(d5)
d6 = np.asarray(d6)
d7 = np.asarray(d7)
d8 = np.asarray(d8)
d9 = np.asarray(d9)
d10 = np.asarray(d10)
d11 = np.asarray(d11)
d12 = np.asarray(d12)
d13 = np.asarray(d13)
d14 = np.asarray(d14)
fps = generating_newfps(fps, [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14], 'Chi_Series[13]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
clear_descriptor_memory(d4)
clear_descriptor_memory(d5)
clear_descriptor_memory(d6)
clear_descriptor_memory(d7)
clear_descriptor_memory(d8)
clear_descriptor_memory(d9)
clear_descriptor_memory(d10)
clear_descriptor_memory(d11)
clear_descriptor_memory(d12)
clear_descriptor_memory(d13)
clear_descriptor_memory(d14)
if phase22 == 1:
descriptor = [Chem.rdMolDescriptors.CalcPhi(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'Phi', save_res)
clear_descriptor_memory(descriptor)
if phase23 == 1:
descriptor = [Chem.GraphDescriptors.HallKierAlpha(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'HallKierAlpha', save_res)
clear_descriptor_memory(descriptor)
if phase24 == 1:
descriptor = [Chem.rdMolDescriptors.CalcNumAmideBonds(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumAmideBonds', save_res)
clear_descriptor_memory(descriptor)
if phase25 == 1:
descriptor = [Chem.Lipinski.FractionCSP3(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'FractionCSP3', save_res)
clear_descriptor_memory(descriptor)
if phase26 == 1:
descriptor = [Chem.rdMolDescriptors.CalcNumSpiroAtoms(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumSpiroAtoms', save_res)
clear_descriptor_memory(descriptor)
if phase27 == 1:
descriptor = [Chem.rdMolDescriptors.CalcNumBridgeheadAtoms(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'NumBridgeheadAtoms', save_res)
clear_descriptor_memory(descriptor)
if phase28 == 1:
d1 = [Chem.MolSurf.PEOE_VSA1(alpha) for alpha in mols]
d2 = [Chem.MolSurf.PEOE_VSA2(alpha) for alpha in mols]
d3 = [Chem.MolSurf.PEOE_VSA3(alpha) for alpha in mols]
d4 = [Chem.MolSurf.PEOE_VSA4(alpha) for alpha in mols]
d5 = [Chem.MolSurf.PEOE_VSA5(alpha) for alpha in mols]
d6 = [Chem.MolSurf.PEOE_VSA6(alpha) for alpha in mols]
d7 = [Chem.MolSurf.PEOE_VSA7(alpha) for alpha in mols]
d8 = [Chem.MolSurf.PEOE_VSA8(alpha) for alpha in mols]
d9 = [Chem.MolSurf.PEOE_VSA9(alpha) for alpha in mols]
d10 = [Chem.MolSurf.PEOE_VSA10(alpha) for alpha in mols]
d11 = [Chem.MolSurf.PEOE_VSA11(alpha) for alpha in mols]
d12 = [Chem.MolSurf.PEOE_VSA12(alpha) for alpha in mols]
d13 = [Chem.MolSurf.PEOE_VSA13(alpha) for alpha in mols]
d14 = [Chem.MolSurf.PEOE_VSA14(alpha) for alpha in mols]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
d4 = np.asarray(d4)
d5 = np.asarray(d5)
d6 = np.asarray(d6)
d7 = np.asarray(d7)
d8 = np.asarray(d8)
d9 = np.asarray(d9)
d10 = np.asarray(d10)
d11 = np.asarray(d11)
d12 = np.asarray(d12)
d13 = np.asarray(d13)
d14 = np.asarray(d14)
fps = generating_newfps(fps, [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14],'PEOE_VSA_Series[1-14]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
clear_descriptor_memory(d4)
clear_descriptor_memory(d5)
clear_descriptor_memory(d6)
clear_descriptor_memory(d7)
clear_descriptor_memory(d8)
clear_descriptor_memory(d9)
clear_descriptor_memory(d10)
clear_descriptor_memory(d11)
clear_descriptor_memory(d12)
clear_descriptor_memory(d13)
clear_descriptor_memory(d14)
if phase29 == 1:
d1 = [Chem.MolSurf.SMR_VSA1(alpha) for alpha in mols]
d2 = [Chem.MolSurf.SMR_VSA2(alpha) for alpha in mols]
d3 = [Chem.MolSurf.SMR_VSA3(alpha) for alpha in mols]
d4 = [Chem.MolSurf.SMR_VSA4(alpha) for alpha in mols]
d5 = [Chem.MolSurf.SMR_VSA5(alpha) for alpha in mols]
d6 = [Chem.MolSurf.SMR_VSA6(alpha) for alpha in mols]
d7 = [Chem.MolSurf.SMR_VSA7(alpha) for alpha in mols]
d8 = [Chem.MolSurf.SMR_VSA8(alpha) for alpha in mols]
d9 = [Chem.MolSurf.SMR_VSA9(alpha) for alpha in mols]
d10 = [Chem.MolSurf.SMR_VSA10(alpha) for alpha in mols]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
d4 = np.asarray(d4)
d5 = np.asarray(d5)
d6 = np.asarray(d6)
d7 = np.asarray(d7)
d8 = np.asarray(d8)
d9 = np.asarray(d9)
d10 = np.asarray(d10)
fps = generating_newfps(fps, [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10],'SMR_VSA_Series[1-10]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
clear_descriptor_memory(d4)
clear_descriptor_memory(d5)
clear_descriptor_memory(d6)
clear_descriptor_memory(d7)
clear_descriptor_memory(d8)
clear_descriptor_memory(d9)
clear_descriptor_memory(d10)
if phase30 == 1:
d1 = [Chem.MolSurf.SlogP_VSA1(alpha) for alpha in mols]
d2 = [Chem.MolSurf.SlogP_VSA2(alpha) for alpha in mols]
d3 = [Chem.MolSurf.SlogP_VSA3(alpha) for alpha in mols]
d4 = [Chem.MolSurf.SlogP_VSA4(alpha) for alpha in mols]
d5 = [Chem.MolSurf.SlogP_VSA5(alpha) for alpha in mols]
d6 = [Chem.MolSurf.SlogP_VSA6(alpha) for alpha in mols]
d7 = [Chem.MolSurf.SlogP_VSA7(alpha) for alpha in mols]
d8 = [Chem.MolSurf.SlogP_VSA8(alpha) for alpha in mols]
d9 = [Chem.MolSurf.SlogP_VSA9(alpha) for alpha in mols]
d10= [Chem.MolSurf.SlogP_VSA10(alpha) for alpha in mols]
d11= [Chem.MolSurf.SlogP_VSA11(alpha) for alpha in mols]
d12= [Chem.MolSurf.SlogP_VSA12(alpha) for alpha in mols]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
d4 = np.asarray(d4)
d5 = np.asarray(d5)
d6 = np.asarray(d6)
d7 = np.asarray(d7)
d8 = np.asarray(d8)
d9 = np.asarray(d9)
d10 = np.asarray(d10)
d11 = np.asarray(d11)
d12 = np.asarray(d12)
fps = generating_newfps(fps, [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12],'SlogP_VSA_Series[1-12]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
clear_descriptor_memory(d4)
clear_descriptor_memory(d5)
clear_descriptor_memory(d6)
clear_descriptor_memory(d7)
clear_descriptor_memory(d8)
clear_descriptor_memory(d9)
clear_descriptor_memory(d10)
clear_descriptor_memory(d11)
clear_descriptor_memory(d12)
if phase31 == 1:
d1 = [Chem.EState.EState_VSA.EState_VSA1(alpha) for alpha in mols]
d2 = [Chem.EState.EState_VSA.EState_VSA2(alpha) for alpha in mols]
d3 = [Chem.EState.EState_VSA.EState_VSA3(alpha) for alpha in mols]
d4 = [Chem.EState.EState_VSA.EState_VSA4(alpha) for alpha in mols]
d5 = [Chem.EState.EState_VSA.EState_VSA5(alpha) for alpha in mols]
d6 = [Chem.EState.EState_VSA.EState_VSA6(alpha) for alpha in mols]
d7 = [Chem.EState.EState_VSA.EState_VSA7(alpha) for alpha in mols]
d8 = [Chem.EState.EState_VSA.EState_VSA8(alpha) for alpha in mols]
d9 = [Chem.EState.EState_VSA.EState_VSA9(alpha) for alpha in mols]
d10 = [Chem.EState.EState_VSA.EState_VSA10(alpha) for alpha in mols]
d11 = [Chem.EState.EState_VSA.EState_VSA11(alpha) for alpha in mols]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
d4 = np.asarray(d4)
d5 = np.asarray(d5)
d6 = np.asarray(d6)
d7 = np.asarray(d7)
d8 = np.asarray(d8)
d9 = np.asarray(d9)
d10 = np.asarray(d10)
d11 = np.asarray(d11)
fps = generating_newfps(fps, [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11],'EState_VSA_Series[1-11]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
clear_descriptor_memory(d4)
clear_descriptor_memory(d5)
clear_descriptor_memory(d6)
clear_descriptor_memory(d7)
clear_descriptor_memory(d8)
clear_descriptor_memory(d9)
clear_descriptor_memory(d10)
clear_descriptor_memory(d11)
if phase32 == 1:
d1 = [Chem.EState.EState_VSA.VSA_EState1(alpha) for alpha in mols]
d2 = [Chem.EState.EState_VSA.VSA_EState2(alpha) for alpha in mols]
d3 = [Chem.EState.EState_VSA.VSA_EState3(alpha) for alpha in mols]
d4 = [Chem.EState.EState_VSA.VSA_EState4(alpha) for alpha in mols]
d5 = [Chem.EState.EState_VSA.VSA_EState5(alpha) for alpha in mols]
d6 = [Chem.EState.EState_VSA.VSA_EState6(alpha) for alpha in mols]
d7 = [Chem.EState.EState_VSA.VSA_EState7(alpha) for alpha in mols]
d8 = [Chem.EState.EState_VSA.VSA_EState8(alpha) for alpha in mols]
d9 = [Chem.EState.EState_VSA.VSA_EState9(alpha) for alpha in mols]
d10 = [Chem.EState.EState_VSA.VSA_EState10(alpha) for alpha in mols]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
d4 = np.asarray(d4)
d5 = np.asarray(d5)
d6 = np.asarray(d6)
d7 = np.asarray(d7)
d8 = np.asarray(d8)
d9 = np.asarray(d9)
d10 = np.asarray(d10)
fps = generating_newfps(fps, [d1,d2,d3,d4,d5,d6,d7,d8,d9,d10],'VSA_EState_Series[1-10]', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
clear_descriptor_memory(d4)
clear_descriptor_memory(d5)
clear_descriptor_memory(d6)
clear_descriptor_memory(d7)
clear_descriptor_memory(d8)
clear_descriptor_memory(d9)
clear_descriptor_memory(d10)
if phase33 == 1:
descriptor = [Chem.rdMolDescriptors.MQNs_(alpha) for alpha in mols]
# descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'MQNs', save_res)
clear_descriptor_memory(descriptor)
if phase34 == 1:
descriptor = [Chem.rdMolDescriptors.CalcAUTOCORR2D(alpha) for alpha in mols]
fps = generating_newfps(fps, descriptor, 'AUTOCORR2D', save_res)
clear_descriptor_memory(descriptor)
if phase35 == 1:
descriptor = compute_descriptors_parallel(mols)
fps = generating_newfps(fps, descriptor, 'BCUT2D', save_res)
clear_descriptor_memory(descriptor)
####################################################
mols2 = process_molecules_parallel(mols, max_workers=8)
del mols
gc.collect()
####################################################
if phase36 == 1:
descriptor = [Chem.rdMolDescriptors.CalcAsphericity(alpha) for alpha in mols2]
fps = generating_newfps(fps, descriptor, 'Asphericity', save_res)
clear_descriptor_memory(descriptor)
if phase37 == 1:
descriptor = [Chem.rdMolDescriptors.CalcPBF(alpha) for alpha in mols2]
fps = generating_newfps(fps, descriptor, 'PBF', save_res)
clear_descriptor_memory(descriptor)
if phase38 == 1:
descriptor = [Chem.rdMolDescriptors.CalcRadiusOfGyration(alpha) for alpha in mols2]
fps = generating_newfps(fps, descriptor, 'RadiusOfGyration', save_res)
clear_descriptor_memory(descriptor)
if phase39 == 1:
descriptor = [Chem.rdMolDescriptors.CalcInertialShapeFactor(alpha) for alpha in mols2]
fps = generating_newfps(fps, descriptor, 'InertialShapeFactor', save_res)
clear_descriptor_memory(descriptor)
if phase40 == 1:
descriptor = [Chem.rdMolDescriptors.CalcEccentricity(alpha) for alpha in mols2]
fps = generating_newfps(fps, descriptor, 'Eccentricity', save_res)
clear_descriptor_memory(descriptor)
if phase41 == 1:
descriptor = [Chem.rdMolDescriptors.CalcSpherocityIndex(alpha) for alpha in mols2]
fps = generating_newfps(fps, descriptor, 'SpherocityIndex', save_res)
clear_descriptor_memory(descriptor)
if phase42 == 1:
d1 = [Chem.rdMolDescriptors.CalcPMI1(alpha) for alpha in mols2]
d2 = [Chem.rdMolDescriptors.CalcPMI2(alpha) for alpha in mols2]
d3 = [Chem.rdMolDescriptors.CalcPMI3(alpha) for alpha in mols2]
d1 = Normalization(d1)
d2 = Normalization(d2)
d3 = Normalization(d3)
d1 = np.asarray(d1)
d2 = np.asarray(d2)
d3 = np.asarray(d3)
fps = generating_newfps(fps, [d1,d2,d3], 'PMI_series[1-3]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
clear_descriptor_memory(d3)
if phase43 == 1:
d1 = [Chem.rdMolDescriptors.CalcNPR1(alpha) for alpha in mols2]
d2 = [Chem.rdMolDescriptors.CalcNPR2(alpha) for alpha in mols2]
d1 = np.asarray(d1)
d2 = np.asarray(d2)
fps = generating_newfps(fps, [d1,d2], 'NPR_series[1-2]_ind', save_res)
clear_descriptor_memory(d1)
clear_descriptor_memory(d2)
if phase44 == 1:
descriptor = [Chem.rdMolDescriptors.CalcAUTOCORR3D(mols) for mols in mols2]
fps = generating_newfps(fps, descriptor, 'AUTOCORR3D', save_res)
clear_descriptor_memory(descriptor)
if phase45 == 1:
descriptor = [Chem.rdMolDescriptors.CalcRDF(mols) for mols in mols2]
descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'RDF', save_res)
clear_descriptor_memory(descriptor)
if phase46 == 1:
descriptor = [Chem.rdMolDescriptors.CalcMORSE(mols) for mols in mols2]
descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'MORSE', save_res)
clear_descriptor_memory(descriptor)
if phase47 == 1:
descriptor = [Chem.rdMolDescriptors.CalcWHIM(mols) for mols in mols2]
descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'WHIM', save_res)
clear_descriptor_memory(descriptor)
if phase48 == 1:
descriptor = [Chem.rdMolDescriptors.CalcGETAWAY(mols) for mols in mols2]
descriptor = Normalization(descriptor)
fps = generating_newfps(fps, descriptor, 'GETAWAY', save_res)
clear_descriptor_memory(descriptor)
#########################################
if save_res == "pd":
fps.to_csv(f'{target_path}/{name}_feature_selection.csv')
fps = fps.astype('float')
return fps