|
import os |
|
import numpy as np |
|
import pandas as pd |
|
import gc |
|
from concurrent.futures import ProcessPoolExecutor, as_completed |
|
|
|
from rdkit import Chem |
|
from rdkit.Chem import AllChem, DataStructs, Draw |
|
from rdkit import RDConfig |
|
from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski, rdDistGeom, rdPartialCharges |
|
from rdkit.Chem.AllChem import GetMorganGenerator |
|
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray |
|
from rdkit.Avalon.pyAvalonTools import GetAvalonFP |
|
from rdkit.Chem.Descriptors import ExactMolWt |
|
|
|
def mol3d(mol):
    """Return a hydrogen-added copy of ``mol`` carrying one embedded 3D conformer.

    Embedding is attempted with ETKDGv3 parameters first and, if that leaves
    the molecule without a conformer, once more with random starting
    coordinates enabled.

    The previous implementation listed UFF/MMFF optimisation as "fallbacks",
    but those calls need an existing conformer and simply raised
    ``ValueError`` when embedding had failed, so they could never rescue a
    molecule. Molecules that embed on the first attempt are returned exactly
    as before (no force-field optimisation is applied).

    Args:
        mol: RDKit molecule to embed.

    Returns:
        The molecule with explicit hydrogens and a conformer, or ``None`` when
        no conformer could be generated.
    """
    mol = Chem.AddHs(mol)

    default_params = AllChem.ETKDGv3()
    random_start_params = AllChem.ETKDGv3()
    # Second attempt: start from random coordinates instead of the default
    # starting geometry.
    random_start_params.useRandomCoords = True

    attempts = (
        ("ETKDGv3", default_params),
        ("ETKDGv3 with random coordinates", random_start_params),
    )
    for label, params in attempts:
        try:
            AllChem.EmbedMolecule(mol, params)
        except ValueError as e:
            print(f"Error: {e} - Trying next embedding method after [{label}]")
            continue
        if mol.GetNumConformers() > 0:
            return mol

    print(f"Invalid mol for 3d {Chem.MolToSmiles(mol)} - No conformer generated")
    return None
|
|
|
def _descriptor_layout(descriptor):
    """Classify ``descriptor`` into the layouts ``generating_newfps`` handles.

    Returns:
        1 for an ndarray with two or more dimensions, 2 for a list whose first
        element is an ndarray, 3 for a list whose first element is a list, and
        0 for anything else (treated as one value per row).
    """
    if isinstance(descriptor, np.ndarray) and descriptor.ndim >= 2:
        return 1
    if isinstance(descriptor, list) and isinstance(descriptor[0], np.ndarray):
        return 2
    if isinstance(descriptor, list) and isinstance(descriptor[0], list):
        return 3
    return 0


def _descriptor_block(descriptor, layout):
    """Convert a multi-column descriptor into a single 2-D block.

    For layout 2 the 1-D arrays become single columns and are placed before
    the 2-D arrays; arrays of any other rank are ignored. Returns ``None``
    when a layout-2 list contains no usable array.
    """
    if layout == 1:
        return descriptor
    if layout == 3:
        return np.asarray(descriptor).astype('float')
    single_columns = [arr[:, None] for arr in descriptor if arr.ndim == 1]
    wide_blocks = [arr for arr in descriptor if arr.ndim == 2]
    parts = single_columns + wide_blocks
    return np.concatenate(parts, axis=1) if parts else None


def generating_newfps(fps, descriptor, descriptor_name, save_res="np"):
    """Append descriptor columns to the feature matrix ``fps``.

    Args:
        fps: Current feature matrix (array-like, or DataFrame in "pd" mode).
        descriptor: ``None`` (nothing to add), a 2-D ndarray, a list of 1-D
            and/or 2-D ndarrays, a list of lists, or a flat sequence holding
            one value per row.
        descriptor_name: Column name in "pd" mode. Multi-column descriptors
            are named ``{descriptor_name}_1``, ``{descriptor_name}_2``, ...
        save_res: ``"pd"`` to work on a pandas DataFrame; any other value
            uses NumPy concatenation.

    Returns:
        The extended matrix with NaN and +/-inf replaced by 0. ``fps`` is
        returned unchanged when ``descriptor`` is ``None`` or when an
        unexpected error occurs (the error is printed).

    Notes:
        * In "pd" mode the new columns are attached positionally (they reuse
          the index of ``fps``) instead of being aligned on index labels, so
          a non-default index can no longer create extra NaN rows.
        * A list mixing 1-D and 2-D arrays gets one continuous column
          numbering; previously both groups started at ``_1`` and produced
          duplicate column names.
        * The caller's DataFrame is never modified in place.
    """
    try:
        if descriptor is None:
            return fps

        as_frame = save_res == "pd"
        if as_frame:
            new_fps = fps if isinstance(fps, pd.DataFrame) else pd.DataFrame(fps)
        else:
            new_fps = fps

        layout = _descriptor_layout(descriptor)
        if layout:
            # Failures in a multi-column layout are reported and skipped; the
            # (cleaned) matrix is still returned without the new columns.
            try:
                block = _descriptor_block(descriptor, layout)
                if block is not None:
                    if as_frame:
                        block_df = pd.DataFrame(
                            block,
                            index=new_fps.index,
                            columns=[
                                f"{descriptor_name}_{i+1}"
                                for i in range(block.shape[1])
                            ],
                        )
                        new_fps = pd.concat([new_fps, block_df], axis=1)
                    else:
                        new_fps = np.concatenate([new_fps, block], axis=1)
            except Exception as e:
                print(f"[-{layout}-] Error occured: {e}")
        else:
            values = np.asarray(descriptor).astype('float')
            if as_frame:
                # Copy first so the caller's DataFrame is left untouched.
                new_fps = new_fps.copy()
                new_fps[descriptor_name] = values.flatten()
            else:
                new_fps = np.concatenate([new_fps, values[:, None]], axis=1)

        if as_frame:
            return new_fps.replace([np.inf, -np.inf], np.nan).fillna(0)
        return np.nan_to_num(new_fps, nan=0.0, posinf=0.0, neginf=0.0).astype('float')

    except Exception as e:
        print(f"Error occurred in {descriptor_name}: {e}")
        return fps
|
|
|
def Normalization(descriptor):
    """Apply a sign-preserving ``log1p`` compression to ``descriptor``.

    Values are clipped to +/-1e15, magnitudes below 1e-10 are replaced by
    +1e-10, and the result is ``sign(x) * log1p(|x|)``. Any NaN or infinite
    value left in the result is set to 0.

    Args:
        descriptor: Array-like of numeric values (any shape).

    Returns:
        ndarray with the same shape as the input.
    """
    tiny, ceiling = 1e-10, 1e15

    bounded = np.clip(np.asarray(descriptor), -ceiling, ceiling)
    # Near-zero entries are floored to +tiny before taking the logarithm.
    floored = np.where(np.abs(bounded) < tiny, tiny, bounded)
    signed_log = np.sign(floored) * np.log1p(np.abs(floored))
    cleaned = np.nan_to_num(signed_log, nan=0.0, posinf=0.0, neginf=0.0)

    gc.collect()
    return cleaned
|
|
|
def values_chi(mol, chi_type):
    """Collect the chi connectivity values of ``mol`` for increasing order.

    Orders 0, 1, 2, ... are evaluated until the first order whose value is
    exactly 0.0; that terminating zero is not included.

    Each order is now evaluated once. The previous version evaluated every
    order twice (once in the loop condition, once when building the result).

    Args:
        mol: RDKit molecule.
        chi_type: ``'n'`` selects ``ChiNn_``; any other value selects
            ``ChiNv_``.

    Returns:
        1-D ndarray with the non-zero leading chi values (empty if order 0 is
        already 0.0).
    """
    chi_func = Chem.GraphDescriptors.ChiNn_ if chi_type == 'n' else Chem.GraphDescriptors.ChiNv_

    values = []
    order = 0
    while True:
        value = chi_func(mol, order)
        if value == 0.0:
            break
        values.append(value)
        order += 1
    return np.array(values)
|
|
|
def generate_chi(mols, chi_type):
    """Compute zero-padded chi series for every molecule in parallel.

    ``values_chi`` is run for each molecule in a process pool sized by
    ``os.cpu_count()``. Because the series can differ in length, shorter ones
    are right-padded with zeros to the longest length found in this batch.

    Args:
        mols: Iterable of RDKit molecules.
        chi_type: Forwarded to ``values_chi`` (``'n'`` or anything else).

    Returns:
        2-D ndarray of shape ``(len(mols), longest_series)``. Empty input
        yields an array of shape ``(0, 0)`` instead of raising ``ValueError``
        from ``max()`` on an empty sequence.
    """
    mols = list(mols)
    if not mols:
        # Nothing to compute; also avoids creating a process pool.
        return np.zeros((0, 0))

    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = [executor.submit(values_chi, mol, chi_type) for mol in mols]
        # Results are read in submission order so rows stay aligned with mols.
        per_mol = [future.result() for future in futures]

    width = max(len(values) for values in per_mol)
    padded = np.zeros((len(per_mol), width))
    for row, values in zip(padded, per_mol):
        row[:len(values)] = values
    return padded
|
|
|
def sanitize_and_compute_descriptor(mol):
    """Compute the BCUT2D descriptor of ``mol`` with defensive fallbacks.

    Hydrogens are removed and the molecule is sanitized, Gasteiger charges
    are computed, then BCUT2D is evaluated.

    Args:
        mol: RDKit molecule.

    Returns:
        The BCUT2D result on success. A list of eight zeros when hydrogen
        removal, sanitization or the Gasteiger charge step fails, and a list
        holding the molecular weight eight times when only the BCUT2D call
        fails.

    Every failure is printed; the outermost handler used to swallow its
    exception silently.
    """
    try:
        mol = Chem.RemoveHs(mol)
        Chem.SanitizeMol(mol)

        try:
            Chem.rdPartialCharges.ComputeGasteigerCharges(mol)
        except Exception as e:
            print(f"Gasteiger charge calculation failed: {e}")
            return [0] * 8

        try:
            return Chem.rdMolDescriptors.BCUT2D(mol)
        except Exception as e:
            print(f"BCUT2D calculation failed: {e}")
            return [Descriptors.MolWt(mol)] * 8
    except Exception as e:
        # Also reached when the molecular-weight fallback itself raises.
        print(f"Molecule preparation for BCUT2D failed: {e}")
        return [0] * 8
|
|
|
def compute_descriptors_parallel(mols, n_jobs=None):
    """Run ``sanitize_and_compute_descriptor`` over ``mols`` in a process pool.

    Args:
        mols: Iterable of RDKit molecules; entries may be ``None``.
        n_jobs: ``max_workers`` for the ``ProcessPoolExecutor`` (``None`` lets
            the executor choose).

    Returns:
        ndarray with one row per input entry, in input order. ``None``
        entries produce a row of eight zeros, the same placeholder
        ``sanitize_and_compute_descriptor`` returns on failure. They used to
        be dropped, which shifted every following row relative to the feature
        matrix. Empty input returns an array of shape ``(0, 8)``.
    """
    width = 8  # length of the zero vector used by sanitize_and_compute_descriptor
    mols = list(mols)
    if not mols:
        return np.empty((0, width))

    rows = [[0] * width for _ in mols]
    pending = [idx for idx, mol in enumerate(mols) if mol is not None]
    if pending:
        with ProcessPoolExecutor(max_workers=n_jobs) as executor:
            futures = [
                (idx, executor.submit(sanitize_and_compute_descriptor, mols[idx]))
                for idx in pending
            ]
            for idx, future in futures:
                rows[idx] = future.result()
    return np.array(rows)
|
|
|
def process_molecules_parallel(mols, max_workers=4, chunk_size=100, keep_failed=False):
    """Generate 3D conformers for ``mols`` with ``mol3d`` in worker processes.

    The molecules are processed in slices of ``chunk_size``; a fresh process
    pool is created for every slice and ``gc.collect()`` runs after each one.

    Results are returned in input order. The previous implementation
    collected them with ``as_completed``, i.e. in completion order, so the
    output order within a chunk was arbitrary and per-molecule descriptors
    derived from it could not be matched to their rows.

    Args:
        mols: Sequence of RDKit molecules (must support ``len`` and slicing).
        max_workers: Worker processes per pool.
        chunk_size: Number of molecules submitted per pool.
        keep_failed: When ``True``, molecules for which ``mol3d`` returned
            ``None`` are kept as ``None`` so the result stays index-aligned
            with ``mols``. Defaults to ``False`` (failures are dropped, as
            before).

    Returns:
        List of molecules with conformers (plus ``None`` placeholders when
        ``keep_failed`` is set).
    """
    results = []
    for start in range(0, len(mols), chunk_size):
        chunk = mols[start:start + chunk_size]
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            # Executor.map yields results in the order of its inputs.
            embedded = list(executor.map(mol3d, chunk))
        if keep_failed:
            results.extend(embedded)
        else:
            results.extend(mol for mol in embedded if mol is not None)
        gc.collect()
    return results
|
|
|
def _chem_attr(path):
    """Resolve a dotted attribute path such as ``"Lipinski.RingCount"`` starting at ``Chem``."""
    target = Chem
    for part in path.split("."):
        target = getattr(target, part)
    return target


def _numbered_paths(prefix, count):
    """Return the dotted paths ``prefix1`` .. ``prefix<count>`` as a tuple."""
    return tuple(f"{prefix}{i}" for i in range(1, count + 1))


def _per_molecule(path, normalize=False):
    """Build a computer that calls the RDKit function at ``path`` once per molecule.

    The RDKit attribute is resolved only when the computer runs, so a
    descriptor that is switched off never touches it.
    """
    def compute(mols):
        func = _chem_attr(path)
        values = [func(mol) for mol in mols]
        return Normalization(values) if normalize else values
    return compute


def _series(paths, normalize=False):
    """Build a computer returning a list with one 1-D array per path in ``paths``."""
    columns = [_per_molecule(path, normalize) for path in paths]

    def compute(mols):
        return [np.asarray(column(mols)) for column in columns]
    return compute


def _chi_series(mols):
    """Twelve fixed-order chi indices followed by the padded ChiNn / ChiNv series."""
    fixed = _series(tuple(
        f"GraphDescriptors.{name}"
        for name in ("Chi0", "Chi0n", "Chi0v", "Chi1", "Chi1n", "Chi1v",
                     "Chi2n", "Chi2v", "Chi3n", "Chi3v", "Chi4n", "Chi4v")
    ))(mols)
    return fixed + [np.asarray(generate_chi(mols, kind)) for kind in ("n", "v")]


def _bcut2d(mols):
    """BCUT2D through the sanitising, parallel helper."""
    return compute_descriptors_parallel(mols)


# Each entry is (name, computer). ``name`` is both the trial parameter name
# and the column/descriptor name handed to ``generating_newfps``.

# Always computed, never proposed to the trial.
_FIXED_2D_DESCRIPTORS = (
    ("MolWt", _per_molecule("Descriptors.ExactMolWt")),
    ("MolLogP", _per_molecule("Crippen.MolLogP")),
    ("MolMR", _per_molecule("Crippen.MolMR")),
    ("TPSA", _per_molecule("Descriptors.TPSA")),
)

# Switchable descriptors computed on the input molecules.
_OPTIONAL_2D_DESCRIPTORS = (
    ("NumRotatableBonds", _per_molecule("Lipinski.NumRotatableBonds")),
    ("HeavyAtomCount", _per_molecule("Lipinski.HeavyAtomCount")),
    ("NumHAcceptors", _per_molecule("Lipinski.NumHAcceptors")),
    ("NumHDonors", _per_molecule("Lipinski.NumHDonors")),
    ("NumHeteroatoms", _per_molecule("Lipinski.NumHeteroatoms")),
    ("NumValenceElectrons", _per_molecule("Descriptors.NumValenceElectrons")),
    ("NHOHCount", _per_molecule("Lipinski.NHOHCount")),
    ("NOCount", _per_molecule("Lipinski.NOCount")),
    ("RingCount", _per_molecule("Lipinski.RingCount")),
    ("NumAromaticRings", _per_molecule("Lipinski.NumAromaticRings")),
    ("NumSaturatedRings", _per_molecule("Lipinski.NumSaturatedRings")),
    ("NumAliphaticRings", _per_molecule("Lipinski.NumAliphaticRings")),
    ("LabuteASA", _per_molecule("rdMolDescriptors.CalcLabuteASA")),
    ("BalabanJ", _per_molecule("GraphDescriptors.BalabanJ")),
    ("BertzCT", _per_molecule("GraphDescriptors.BertzCT")),
    ("Ipc", _per_molecule("GraphDescriptors.Ipc", normalize=True)),
    ("kappa_Series[1-3]_ind", _series(_numbered_paths("GraphDescriptors.Kappa", 3))),
    ("Chi_Series[13]_ind", _chi_series),
    ("Phi", _per_molecule("rdMolDescriptors.CalcPhi")),
    ("HallKierAlpha", _per_molecule("GraphDescriptors.HallKierAlpha")),
    ("NumAmideBonds", _per_molecule("rdMolDescriptors.CalcNumAmideBonds")),
    ("FractionCSP3", _per_molecule("Lipinski.FractionCSP3")),
    ("NumSpiroAtoms", _per_molecule("rdMolDescriptors.CalcNumSpiroAtoms")),
    ("NumBridgeheadAtoms", _per_molecule("rdMolDescriptors.CalcNumBridgeheadAtoms")),
    ("PEOE_VSA_Series[1-14]_ind", _series(_numbered_paths("MolSurf.PEOE_VSA", 14))),
    ("SMR_VSA_Series[1-10]_ind", _series(_numbered_paths("MolSurf.SMR_VSA", 10))),
    ("SlogP_VSA_Series[1-12]_ind", _series(_numbered_paths("MolSurf.SlogP_VSA", 12))),
    ("EState_VSA_Series[1-11]_ind", _series(_numbered_paths("EState.EState_VSA.EState_VSA", 11))),
    ("VSA_EState_Series[1-10]", _series(_numbered_paths("EState.EState_VSA.VSA_EState", 10))),
    ("MQNs", _per_molecule("rdMolDescriptors.MQNs_")),
    ("AUTOCORR2D", _per_molecule("rdMolDescriptors.CalcAUTOCORR2D")),
    ("BCUT2D", _bcut2d),
)

# Switchable descriptors computed on molecules carrying a 3D conformer.
_OPTIONAL_3D_DESCRIPTORS = (
    ("Asphericity", _per_molecule("rdMolDescriptors.CalcAsphericity")),
    ("PBF", _per_molecule("rdMolDescriptors.CalcPBF")),
    ("RadiusOfGyration", _per_molecule("rdMolDescriptors.CalcRadiusOfGyration")),
    ("InertialShapeFactor", _per_molecule("rdMolDescriptors.CalcInertialShapeFactor")),
    ("Eccentricity", _per_molecule("rdMolDescriptors.CalcEccentricity")),
    ("SpherocityIndex", _per_molecule("rdMolDescriptors.CalcSpherocityIndex")),
    ("PMI_series[1-3]_ind", _series(_numbered_paths("rdMolDescriptors.CalcPMI", 3), normalize=True)),
    ("NPR_series[1-2]_ind", _series(_numbered_paths("rdMolDescriptors.CalcNPR", 2))),
    ("AUTOCORR3D", _per_molecule("rdMolDescriptors.CalcAUTOCORR3D")),
    ("RDF", _per_molecule("rdMolDescriptors.CalcRDF", normalize=True)),
    ("MORSE", _per_molecule("rdMolDescriptors.CalcMORSE", normalize=True)),
    ("WHIM", _per_molecule("rdMolDescriptors.CalcWHIM", normalize=True)),
    ("GETAWAY", _per_molecule("rdMolDescriptors.CalcGETAWAY", normalize=True)),
)


def search_data_descriptor_compress(trial, fps, mols, name, target_path="result", save_res="np"):
    """Extend ``fps`` with the molecular descriptors selected by ``trial``.

    MolWt, MolLogP, MolMR and TPSA are always appended. Every other
    descriptor is switched on or off by ``trial.suggest_int(<name>, 0, 1)``;
    all switches are requested up front, in the same fixed order as the
    descriptor tables above, before anything is computed.

    Args:
        trial: Object exposing ``suggest_int(name, low, high)`` (presumably
            an Optuna trial -- confirm against the caller).
        fps: Feature matrix to extend; see ``generating_newfps``.
        mols: Sequence of RDKit molecules, one per row of ``fps``.
        name: Prefix of the CSV file written in "pd" mode.
        target_path: Directory for that CSV file; created if missing.
        save_res: ``"pd"`` to build a DataFrame and write it to
            ``{target_path}/{name}_feature_selection.csv``; any other value
            keeps the NumPy path.

    Returns:
        The extended feature matrix cast to float.

    Notes:
        * 3D conformers are generated only when at least one 3D descriptor
          is switched on.
        * 3D descriptor rows are appended positionally, so they only line up
          with ``fps`` if ``process_molecules_parallel`` returns one
          conformer per input molecule in input order. When the conformer
          count differs from ``len(mols)`` the 3D descriptors are skipped
          with a message instead of attaching misaligned rows.
    """
    optional = _OPTIONAL_2D_DESCRIPTORS + _OPTIONAL_3D_DESCRIPTORS
    # Query every switch first, in table order, so the sequence of
    # suggest_int calls does not depend on earlier answers.
    enabled = {label for label, _ in optional if trial.suggest_int(label, 0, 1) == 1}

    for label, compute in _FIXED_2D_DESCRIPTORS:
        fps = generating_newfps(fps, compute(mols), label, save_res)

    for label, compute in _OPTIONAL_2D_DESCRIPTORS:
        if label in enabled:
            fps = generating_newfps(fps, compute(mols), label, save_res)
    gc.collect()

    selected_3d = [spec for spec in _OPTIONAL_3D_DESCRIPTORS if spec[0] in enabled]
    if selected_3d:
        n_mols = len(mols)
        mols2 = process_molecules_parallel(mols, max_workers=8)
        if len(mols2) != n_mols:
            print(
                f"3D descriptors skipped: only {len(mols2)} of {n_mols} "
                "molecules received a conformer"
            )
        else:
            for label, compute in selected_3d:
                fps = generating_newfps(fps, compute(mols2), label, save_res)
        del mols2
        gc.collect()

    if save_res == "pd":
        if target_path:
            os.makedirs(target_path, exist_ok=True)
        fps.to_csv(f'{target_path}/{name}_feature_selection.csv')

    fps = fps.astype('float')
    return fps