2024
Soyeon Lee; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: Artificial Intelligence, Attention mechanism, Bioinformatics, Deep learning, Drug-induced liver injury, Feature importance, Hepatotoxicity, in silico
@comment{lee2024interdili: journal article. The coined name InterDILI is brace-protected in the title so sentence-casing styles keep its capitals. NOTE(review): tppubtype is not a standard BibTeX/biblatex field and looks like export metadata from a publication manager; it is kept unchanged and is ignored by standard styles.}
@article{lee2024interdili,
  author     = {Lee, Soyeon and Yoo, Sunyong},
  title      = {{InterDILI}: interpretable prediction of drug-induced liver injury through permutation feature importance and attention mechanism},
  journal    = {Journal of Cheminformatics},
  volume     = {16},
  number     = {1},
  pages      = {1},
  publisher  = {Springer},
  year       = {2024},
  date       = {2024-01-03},
  doi        = {10.1186/s13321-023-00796-8},
  url        = {https://link.springer.com/article/10.1186/s13321-023-00796-8},
  urldate    = {2024-01-03},
  abstract   = {Safety is one of the important factors constraining the distribution of clinical drugs on the market. Drug-induced liver injury (DILI) is the leading cause of safety problems produced by drug side effects. Therefore, the DILI risk of approved drugs and potential drug candidates should be assessed. Currently, in vivo and in vitro methods are used to test DILI risk, but both methods are labor-intensive, time-consuming, and expensive. To overcome these problems, many in silico methods for DILI prediction have been suggested. Previous studies have shown that DILI prediction models can be utilized as prescreening tools, and they achieved a good performance. However, there are still limitations in interpreting the prediction results. Therefore, this study focused on interpreting the model prediction to analyze which features could potentially cause DILI. For this, five publicly available datasets were collected to train and test the model. Then, various machine learning methods were applied using substructure and physicochemical descriptors as inputs and the DILI label as the output. The interpretation of feature importance was analyzed by recognizing the following general-to-specific patterns: (i) identifying general important features of the overall DILI predictions, and (ii) highlighting specific molecular substructures which were highly related to the DILI prediction for each compound. The results indicated that the model not only captured the previously known properties to be related to DILI but also proposed a new DILI potential substructural of physicochemical properties. The models for the DILI prediction achieved an area under the receiver operating characteristic (AUROC) of 0.88--0.97 and an area under the Precision-Recall curve (AUPRC) of 0.81--0.95. From this, we hope the proposed models can help identify the potential DILI risk of drug candidates at an early stage and offer valuable insights for drug development.},
  note       = {Correspondence to Sunyong Yoo},
  keywords   = {Artificial Intelligence, Attention mechanism, Bioinformatics, Deep learning, Drug-induced liver injury, Feature importance, Hepatotoxicity, in silico},
  pubstate   = {published},
  tppubtype  = {article},
}
2023
Jinmyung Jung; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: Bioinformatics, Breast cancer, Feature importance, Gene expression, Machine learning, Metastasis marker
@comment{jung2023identification: journal article. The percent sign in the abstract is escaped because an unescaped one starts a LaTeX comment when a style prints the field. NOTE(review): tppubtype is not a standard BibTeX/biblatex field and looks like export metadata from a publication manager; it is kept unchanged and is ignored by standard styles.}
@article{jung2023identification,
  author     = {Jung, Jinmyung and Yoo, Sunyong},
  title      = {Identification of Breast Cancer Metastasis Markers from Gene Expression Profiles Using Machine Learning Approaches},
  journal    = {Genes},
  volume     = {14},
  number     = {9},
  pages      = {1820},
  publisher  = {MDPI},
  year       = {2023},
  date       = {2023-09-20},
  doi        = {10.3390/genes14091820},
  url        = {https://www.mdpi.com/2073-4425/14/9/1820},
  urldate    = {2023-09-20},
  abstract   = {Cancer metastasis accounts for approximately 90\% of cancer deaths, and elucidating markers in metastasis is the first step in its prevention. To characterize metastasis marker genes (MGs) of breast cancer, XGBoost models that classify metastasis status were trained with gene expression profiles from TCGA. Then, a metastasis score (MS) was assigned to each gene by calculating the inner product between the feature importance and the AUC performance of the models. As a result, 54, 202, and 357 genes with the highest MS were characterized as MGs by empirical p-value cutoffs of 0.001, 0.005, and 0.01, respectively. The three sets of MGs were compared with those from existing metastasis marker databases, which provided significant results in most comparisons (p-value < 0.05). They were also significantly enriched in biological processes associated with breast cancer metastasis. The three MGs, SPPL2C, KRT23, and RGS7, showed highly significant results (p-value < 0.01) in the survival analysis. The MGs that could not be identified by statistical analysis (e.g., GOLM1, ELAVL1, UBP1, and AZGP1), as well as the MGs with the highest MS (e.g., ZNF676, FAM163B, LDOC2, IRF1, and STK40), were verified via the literature. Additionally, we checked how close the MGs were to each other in the protein--protein interaction networks. We expect that the characterized markers will help understand and prevent breast cancer metastasis.},
  note       = {Correspondence to Sunyong Yoo},
  keywords   = {Bioinformatics, Breast cancer, Feature importance, Gene expression, Machine learning, Metastasis marker},
  pubstate   = {published},
  tppubtype  = {article},
}