#%%

# Input table, read below as tab-separated text.
BINDINGDB_FILE = "s3://datasets-public-research/bindingdb/BindingDB_All.tsv"
# Target names to retain. A row is kept only when its curator-assigned target
# name equals one of these strings exactly (plain membership test, no fuzzy or
# case-insensitive matching).
PROTS_TO_KEEP = [
    "HIV-1 Protease",
    "HIV-1 Protease M1",
    "Galactokinase (GALK)",
    "Caspase-3",
    "Neuraminidase A",
    "Cytochrome P450 3A",
]
# Columns to load from the input table: exact TSV header text -> short alias.
# In the visible part of this script only the keys are used (passed to
# `usecols`); the aliases are not applied, so the output keeps the original
# header names.
BINDING_DB_COLUMNS = {
    "Ligand SMILES": "smiles",
    "Ligand InChI": "inchi",
    "Target Name Assigned by Curator or DataSource": "target_name",
    "Ki (nM)": "ki",
    "IC50 (nM)": "ic50",
    "Kd (nM)": "kd",
    "EC50 (nM)": "ec50",
    "kon (M-1-s-1)": "kon",
    "koff (s-1)": "koff",
    # NOTE(review): the double space in this header is passed verbatim to
    # `usecols`, so it has to match the file's header character for character.
    # Confirm against the TSV before "fixing" it.
    "BindingDB Target Chain  Sequence": "target_seq",
}

# Destination of the filtered subset, written as tab-separated text.
OUT_PATH = "s3://datasets-public-research/bindingdb/BindingDB_micro.tsv"

import pandas as pd

#%%
# Load only the columns named in BINDING_DB_COLUMNS. Malformed lines are
# reported as warnings and skipped rather than aborting the read.
df = pd.read_csv(
    BINDINGDB_FILE,
    sep="\t",
    on_bad_lines="warn",
    usecols=list(BINDING_DB_COLUMNS),
)

#%%
# Keep only the rows whose curator-assigned target name is in PROTS_TO_KEEP.
target_names = df["Target Name Assigned by Curator or DataSource"]
# Series.isin performs the exact-match membership test in vectorised form
# instead of a Python-level loop over every row. Missing names (NaN) never
# match, the same as with the plain `in` test.
keep_target = target_names.isin(PROTS_TO_KEEP)
df_small = df[keep_target]

#%%
# Write the filtered subset as TSV. index=True is the pandas default, spelled
# out here: the row labels carried over from the source frame are emitted as
# an unnamed first column.
df_small.to_csv(
    OUT_PATH,
    sep="\t",
    index=True,
)

# %%