#%%
# Build a reduced BindingDB table: load selected columns of the TSV at
# BINDINGDB_FILE, keep only the rows whose target name is listed in
# PROTS_TO_KEEP, and write the result to OUT_PATH.
import pandas as pd

BINDINGDB_FILE = "s3://datasets-public-research/bindingdb/BindingDB_All.tsv"

# Target names to retain; rows are matched by exact string equality.
PROTS_TO_KEEP = [
    "HIV-1 Protease",
    "HIV-1 Protease M1",
    "Galactokinase (GALK)",
    "Caspase-3",
    "Neuraminidase A",
    "Cytochrome P450 3A",
]

# Column holding the target name that the filter below is applied to.
TARGET_NAME_COLUMN = "Target Name Assigned by Curator or DataSource"

# BindingDB column header -> short name. Only the keys are used in this
# script (to restrict which columns are loaded).
# NOTE(review): the short names are never applied as a rename here; confirm
# whether a rename before writing was intended.
BINDING_DB_COLUMNS = {
    "Ligand SMILES": "smiles",
    "Ligand InChI": "inchi",
    TARGET_NAME_COLUMN: "target_name",
    "Ki (nM)": "ki",
    "IC50 (nM)": "ic50",
    "Kd (nM)": "kd",
    "EC50 (nM)": "ec50",
    "kon (M-1-s-1)": "kon",
    "koff (s-1)": "koff",
    "BindingDB Target Chain Sequence": "target_seq",
}

OUT_PATH = "s3://datasets-public-research/bindingdb/BindingDB_micro.tsv"

#%%
# Load only the columns of interest. With on_bad_lines="warn", malformed
# lines are reported as warnings and skipped instead of aborting the load.
df = pd.read_csv(
    BINDINGDB_FILE,
    sep="\t",
    on_bad_lines="warn",
    usecols=list(BINDING_DB_COLUMNS),
)

#%%
# Vectorized exact-match membership test (replaces a per-row Python loop).
# Rows with a missing target name do not match and are therefore dropped.
target_names = df[TARGET_NAME_COLUMN]
keep_target = target_names.isin(PROTS_TO_KEEP)
df_small = df[keep_target]

#%%
# NOTE(review): index=False is not passed, so the DataFrame index is written
# as the first (unnamed) column; confirm downstream readers expect that.
df_small.to_csv(OUT_PATH, sep="\t")

# %%