2025
Sunwoo Jung; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: ADR, Artificial Intelligence, Attention mechanism, Bioinformatics, DDI, Deep learning, Text mining
@comment{Key says 2024 (as does the DOI suffix) while the issue is dated 2025; the key is kept unchanged so existing citations still resolve.}
@article{Jung2024,
title = {Interpretable prediction of drug-drug interactions via text embedding in biomedical literature},
author = {Sunwoo Jung and Sunyong Yoo},
url = {https://www.sciencedirect.com/science/article/pii/S0010482524015816},
doi = {10.1016/j.compbiomed.2024.109496},
issn = {0010-4825},
year  = {2025},
date = {2025-02-01},
urldate = {2025-02-01},
journal = {Computers in Biology and Medicine},
volume = {185},
pages = {109496},
abstract = {Polypharmacy is a promising approach for treating diseases, especially those with complex symptoms. However, it can lead to unexpected drug-drug interactions (DDIs), potentially reducing efficacy and triggering adverse drug reactions (ADRs). Predicting the risk of DDIs is crucial for ensuring safe drug use, particularly by identifying the types of DDIs and the mechanisms involved. Therefore, this study used biomedical literature to propose hierarchical attention-based deep learning models to predict DDIs and their types. The proposed model consists of two components: drug embedding and DDI prediction. The drug embedding module extracts representation vectors that effectively capture drug properties using sentence and sequence embedding methods. For sentence embedding, a pre-trained biomedical language model is used to map drug-related sentences into vector space. For sequence embedding, sentence embedding vectors are sequentially fed into bidirectional long short-term memory with a hierarchical attention network, enabling the analysis of sentences relevant to DDI prediction while accounting for the order of the sentences. Finally, DDI prediction is performed using a deep neural network based on the sequence embedding vectors of a drug pair. Our model achieved high performances in the accuracy (0.85–0.90), AUROC (0.98–0.99), and AUPR (0.63–0.95) performance across 164 DDI types. Additionally, the proposed model showed improvements in up to 11 % in AUROC, and 8 % in AUPR. Furthermore, model interprets predictions by leveraging attention mechanisms and drug similarity. The results indicated that the model considered various factors beyond similarity to predict DDIs. These findings may help prevent unforeseen medical accidents and reduce healthcare costs by predicting detailed drug interaction types.},
note = {Correspondence to Sunyong Yoo},
keywords = {ADR, Artificial Intelligence, Attention mechanism, Bioinformatics, DDI, Deep learning, Text mining},
pubstate = {published},
tppubtype = {article}
}
Dohyeon Lee; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: Artificial Intelligence, Attention mechanism, Bioinformatics, Cardiotoxicity, Deep learning, Graph attention network
@comment{hERGAT and hERG are brace-protected in the title so sentence-casing styles keep their capitalisation; the url is the plain article address without campaign tracking parameters.}
@article{Lee2025,
title = {{hERGAT}: predicting {hERG} blockers using graph attention mechanism through atom- and molecule-level interaction analyses},
author = {Dohyeon Lee and Sunyong Yoo},
url = {https://link.springer.com/article/10.1186/s13321-025-00957-x},
doi = {10.1186/s13321-025-00957-x},
issn = {1758-2946},
year  = {2025},
date = {2025-01-28},
urldate = {2025-01-28},
journal = {Journal of Cheminformatics},
volume = {17},
number = {11},
abstract = {The human ether-a-go-go-related gene (hERG) channel plays a critical role in the electrical activity of the heart, and its blockers can cause serious cardiotoxic effects. Thus, screening for hERG channel blockers is a crucial step in the drug development process. Many in silico models have been developed to predict hERG blockers, which can efficiently save time and resources. However, previous methods have found it hard to achieve high performance and to interpret the predictive results. To overcome these challenges, we have proposed hERGAT, a graph neural network model with an attention mechanism, to consider compound interactions on atomic and molecular levels. In the atom-level interaction analysis, we applied a graph attention mechanism (GAT) that integrates information from neighboring nodes and their extended connections. The hERGAT employs a gated recurrent unit (GRU) with the GAT to learn information between more distant atoms. To confirm this, we performed clustering analysis and visualized a correlation heatmap, verifying the interactions between distant atoms were considered during the training process. In the molecule-level interaction analysis, the attention mechanism enables the target node to focus on the most relevant information, highlighting the molecular substructures that play crucial roles in predicting hERG blockers. Through a literature review, we confirmed that highlighted substructures have a significant role in determining the chemical and biological characteristics related to hERG activity. Furthermore, we integrated physicochemical properties into our hERGAT model to improve the performance. Our model achieved an area under the receiver operating characteristic of 0.907 and an area under the precision-recall of 0.904, demonstrating its effectiveness in modeling hERG activity and offering a reliable framework for optimizing drug safety in early development stages.},
note = {Correspondence to Sunyong Yoo},
keywords = {Artificial Intelligence, Attention mechanism, Bioinformatics, Cardiotoxicity, Deep learning, Graph attention network},
pubstate = {published},
tppubtype = {article}
}
2024
Myeonghyeon Jeong; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: Attention mechanism, Bioinformatics, Deep learning, Fetotoxicity, in silico, Interpretability
@comment{FetoML is brace-protected in the title so sentence-casing styles keep its mixed-case spelling.}
@article{jeong2024fetoml,
title = {{FetoML}: Interpretable predictions of the fetotoxicity of drugs based on machine learning approaches},
author = {Myeonghyeon Jeong and Sunyong Yoo},
url = {https://onlinelibrary.wiley.com/doi/full/10.1002/minf.202300312},
doi = {10.1002/minf.202300312},
issn = {1868-1743},
year  = {2024},
date = {2024-03-03},
urldate = {2024-03-03},
journal = {Molecular Informatics},
volume = {43},
number = {6},
pages = {e202300312},
publisher = {Wiley Online Library},
abstract = {Pregnant females may use medications to manage health problems that develop during pregnancy or that they had prior to pregnancy. However, using medications during pregnancy has a potential risk to the fetus. Assessing the fetotoxicity of drugs is essential to ensure safe treatments, but the current process is challenged by ethical issues, time, and cost. Therefore, the need for in silico models to efficiently assess the fetotoxicity of drugs has recently emerged. Previous studies have proposed successful machine learning models for fetotoxicity prediction and even suggest molecular substructures that are possibly associated with fetotoxicity risks or protective effects. However, the interpretation of the decisions of the models on fetotoxicity prediction for each drug is still insufficient. This study constructed machine learning-based models that can predict the fetotoxicity of drugs while providing explanations for the decisions. For this, permutation feature importance was used to identify the general features that the model made significant in predicting the fetotoxicity of drugs. In addition, features associated with fetotoxicity for each drug were analyzed using the attention mechanism. The predictive performance of all the constructed models was significantly high (AUROC: 0.854-0.974, AUPR: 0.890-0.975). Furthermore, we conducted literature reviews on the predicted important features and found that they were highly associated with fetotoxicity. We expect that our model will benefit fetotoxicity research by providing an evaluation of fetotoxicity risks for drugs or drug candidates, along with an interpretation of that prediction.},
note = {Correspondence to Sunyong Yoo},
keywords = {Attention mechanism, Bioinformatics, Deep learning, Fetotoxicity, in silico, Interpretability},
pubstate = {published},
tppubtype = {article}
}
Sunyong Yoo; Myeonghyeon Jeong; Subhin Seomun; Kiseong Kim; Youngmahn Han
Abstract | Links | BibTeX | Dimensions | Tags: Amino acids, Attention mechanism, Bioinformatics, Coronaviruses, Deep learning, Immune system, Lymphocytes, Predictive models, Proteins, Transformer
@comment{The issue is stored in the number field, as in the other entries of this file; SARS-CoV-2 and TCR are brace-protected so sentence-casing styles keep their capitalisation.}
@article{yoo2024interpretable,
title = {Interpretable Prediction of {SARS-CoV-2} Epitope-specific {TCR} Recognition Using a Pre-Trained Protein Language Model},
author = {Sunyong Yoo and Myeonghyeon Jeong and Subhin Seomun and Kiseong Kim and Youngmahn Han},
url = {https://ieeexplore.ieee.org/abstract/document/10443062},
doi = {10.1109/TCBB.2024.3368046},
year  = {2024},
date = {2024-02-21},
urldate = {2024-02-21},
journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
volume = {21},
number = {3},
pages = {428--438},
publisher = {IEEE},
abstract = {The emergence of the novel coronavirus, designated as severe acute respiratory syndrome coronavirus-2 (SARS-CoV-2), has posed a significant threat to public health worldwide. There has been progress in reducing hospitalizations and deaths due to SARS-CoV-2. However, challenges stem from the emergence of SARS-CoV-2 variants, which exhibit high transmission rates, increased disease severity, and the ability to evade humoral immunity. Epitope-specific T-cell receptor (TCR) recognition is key in determining the T-cell immunogenicity for SARS-CoV-2 epitopes. Although several data-driven methods for predicting epitope-specific TCR recognition have been proposed, they remain challenging due to the enormous diversity of TCRs and the lack of available training data. Self-supervised transfer learning has recently been proven useful for extracting information from unlabeled protein sequences, increasing the predictive performance of fine-tuned models, and using a relatively small amount of training data. This study presents a deep-learning model generated by fine-tuning pre-trained protein embeddings from a large corpus of protein sequences. The fine-tuned model showed markedly high predictive performance and outperformed the recent Gaussian process-based prediction model. The output attentions captured by the deep-learning model suggested critical amino acid positions in the SARS-CoV-2 epitope-specific TCRβ sequences that are highly associated with the viral escape of T-cell immune response.},
note = {Correspondence to Sunyong Yoo},
keywords = {Amino acids, Attention mechanism, Bioinformatics, Coronaviruses, Deep learning, Immune system, Lymphocytes, Predictive models, Proteins, Transformer},
pubstate = {published},
tppubtype = {article}
}
Soyeon Lee; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: Artificial Intelligence, Attention mechanism, Bioinformatics, Deep learning, Drug-induced liver injury, Feature importance, Hepatotoxicity, in silico
@comment{InterDILI is brace-protected in the title so sentence-casing styles keep its mixed-case spelling; the issn is the one used for the other Journal of Cheminformatics entry in this file.}
@article{lee2024interdili,
title = {{InterDILI}: interpretable prediction of drug-induced liver injury through permutation feature importance and attention mechanism},
author = {Soyeon Lee and Sunyong Yoo},
url = {https://link.springer.com/article/10.1186/s13321-023-00796-8},
doi = {10.1186/s13321-023-00796-8},
issn = {1758-2946},
year  = {2024},
date = {2024-01-03},
urldate = {2024-01-03},
journal = {Journal of Cheminformatics},
volume = {16},
number = {1},
pages = {1},
publisher = {Springer},
abstract = {Safety is one of the important factors constraining the distribution of clinical drugs on the market. Drug-induced liver injury (DILI) is the leading cause of safety problems produced by drug side effects. Therefore, the DILI risk of approved drugs and potential drug candidates should be assessed. Currently, in vivo and in vitro methods are used to test DILI risk, but both methods are labor-intensive, time-consuming, and expensive. To overcome these problems, many in silico methods for DILI prediction have been suggested. Previous studies have shown that DILI prediction models can be utilized as prescreening tools, and they achieved a good performance. However, there are still limitations in interpreting the prediction results. Therefore, this study focused on interpreting the model prediction to analyze which features could potentially cause DILI. For this, five publicly available datasets were collected to train and test the model. Then, various machine learning methods were applied using substructure and physicochemical descriptors as inputs and the DILI label as the output. The interpretation of feature importance was analyzed by recognizing the following general-to-specific patterns: (i) identifying general important features of the overall DILI predictions, and (ii) highlighting specific molecular substructures which were highly related to the DILI prediction for each compound. The results indicated that the model not only captured the previously known properties to be related to DILI but also proposed a new DILI potential substructural of physicochemical properties. The models for the DILI prediction achieved an area under the receiver operating characteristic (AUROC) of 0.88–0.97 and an area under the Precision-Recall curve (AUPRC) of 0.81–0.95. From this, we hope the proposed models can help identify the potential DILI risk of drug candidates at an early stage and offer valuable insights for drug development.},
note = {Correspondence to Sunyong Yoo},
keywords = {Artificial Intelligence, Attention mechanism, Bioinformatics, Deep learning, Drug-induced liver injury, Feature importance, Hepatotoxicity, in silico},
pubstate = {published},
tppubtype = {article}
}
정선우; 유선용
Abstract | Links | BibTeX | Dimensions | Tags: ADR, DDI, Deep learning, Text mining
@comment{The non-ASCII citation key is kept unchanged so existing citations still resolve; it needs a UTF-8 aware toolchain such as Biber. The issn passes the ISSN check-digit test.}
@article{정선우2024drug,
title = {Drug-Drug Interaction Prediction Model Based on Deep Learning Using Drug Information Document Embedding},
author = {정선우 and 유선용},
url = {https://www.dbpia.co.kr/pdf/pdfView.do?nodeId=NODE11852157&googleIPSandBox=false&mark=0&minRead=10&ipRange=false&b2cLoginYN=false&icstClss=010000&isPDFSizeAllowed=true&nodeHistoryTotalCnt=2&accessgl=Y&language=ko_KR&hasTopBanner=true},
doi = {10.5626/JOK.2024.51.6.503},
issn = {2383-6296},
year  = {2024},
date = {2024-01-02},
urldate = {2024-01-02},
journal = {Journal of KIISE},
volume = {51},
number = {6},
pages = {503--512},
abstract = {다약제는 암, 고혈압, 천식 등 다양한 질병에 대하여 유망한 접근법이다. 일반적으로 병원에 방문하는 환자는 2종 이상의 약물을 처방받는다. 그러나 다약제의 사용은 개별 약물이 목표하는 작용 외에 예상치 못한 상호작용을 유발할 수 있다. 약물 간 상호작용을 사전에 예측하는 것은 안전한 약물 사용을 위한 매우 중요한 과제이다. 본 연구에서는 다약제 사용 시 발생 가능한 약물 간 상호작용 예측을 위해 개별 약물 정보를 포함한 문서를 이용하여 약물을 표현하는 문서 임베딩 기반의 딥러닝 예측 모델을 제안한다. 약물 정보 문서는 DrugBank 데이터를 이용해 약물의 설명, 적응증, 약력학 정보, 작용 기전, 독성 속성을 결합해 구축한다. 그 후 Doc2Vec, BioSentVec 언어 모델을 통해 약물 문서로부터 약물 표현 벡터를 생성한다. 두 약물 표현 벡터는 한 쌍으로 묶여 딥러닝 기반 예측 모델에 입력되고, 해당 모델은 두 약물 간 상호작용을 예측한다. 본 논문에서는 언어 임베딩 모델의 성능 비교, 데이터의 불균형도 조절 등 다양한 조건의 변화에 따른 실험 결과의 차이를 분석하여 약물 간 상호작용 예측을 위한 최적의 모델을 구축하는 것을 목표로 한다. 제안된 모델은 약물 처방 과정, 신약 개발의 임상 과정 등에서 약물간 상호작용 사전 예측을 위하여 활용될 수 있을 것으로 기대된다.},
note = {Correspondence to Sunyong Yoo},
keywords = {ADR, DDI, Deep learning, Text mining},
pubstate = {published},
tppubtype = {article}
}
2022
Seonwoo Jung; Min-Keun Song; Eunjoo Lee; Sejin Bae; Yeon-Yong Kim; Doheon Lee; Myoung Jin Lee; Sunyong Yoo
Abstract | Links | BibTeX | Dimensions | Tags: Atrial fibrillation, Attention mechanism, Deep learning, Machine learning, Medical informatics, National health insurance service, Stroke
@comment{The abstract keeps its Background / Methods / Results / Conclusions section headings, each on its own line; the url is the plain article address without referral tracking parameters.}
@article{jung2022predicting,
title = {Predicting ischemic stroke in patients with atrial fibrillation using machine learning},
author = {Seonwoo Jung and Min-Keun Song and Eunjoo Lee and Sejin Bae and Yeon-Yong Kim and Doheon Lee and Myoung Jin Lee and Sunyong Yoo},
url = {https://www.imrpress.com/journal/FBL/27/3/10.31083/j.fbl2703080/htm},
doi = {10.31083/j.fbl2703080},
year  = {2022},
date = {2022-03-04},
urldate = {2022-03-04},
journal = {Frontiers in Bioscience-Landmark},
volume = {27},
number = {3},
pages = {80},
publisher = {IMR Press},
abstract = {Background
Atrial fibrillation (AF) is a well-known risk factor for stroke. Predicting the risk is important to prevent the first and secondary attacks of cerebrovascular diseases by determining early treatment. This study aimed to predict the ischemic stroke in AF patients based on the massive and complex Korean National Health Insurance (KNHIS) data through a machine learning approach.
Methods
We extracted 65-dimensional features, including demographics, health examination, and medical history information, of 754,949 patients with AF from KNHIS. Logistic regression was used to determine whether the extracted features had a statistically significant association with ischemic stroke occurrence. Then, we constructed the ischemic stroke prediction model using an attention-based deep neural network. The extracted features were used as input, and the occurrence of ischemic stroke after the diagnosis of AF was the output used to train the model.
Results
We found 48 features significantly associated with ischemic stroke occurrence through regression analysis (p-value < 0.001). When the proposed deep learning model was applied to 150,989 AF patients, it was confirmed that the occurrence ischemic stroke was predicted to be higher AUROC (AUROC = 0.727 ± 0.003) compared to CHA2DS2-VASc score (AUROC = 0.651 ± 0.007) and other machine learning methods.
Conclusions
As part of preventive medicine, this study could help AF patients prepare for ischemic stroke prevention based on predicted stroke associated features and risk scores.},
note = {Correspondence to Sunyong Yoo},
keywords = {Atrial fibrillation, Attention mechanism, Deep learning, Machine learning, Medical informatics, National health insurance service, Stroke},
pubstate = {published},
tppubtype = {article}
}
Atrial fibrillation (AF) is a well-known risk factor for stroke. Predicting the risk is important to prevent the first and secondary attacks of cerebrovascular diseases by determining early treatment. This study aimed to predict the ischemic stroke in AF patients based on the massive and complex Korean National Health Insurance (KNHIS) data through a machine learning approach.
Methods
We extracted 65-dimensional features, including demographics, health examination, and medical history information, of 754,949 patients with AF from KNHIS. Logistic regression was used to determine whether the extracted features had a statistically significant association with ischemic stroke occurrence. Then, we constructed the ischemic stroke prediction model using an attention-based deep neural network. The extracted features were used as input, and the occurrence of ischemic stroke after the diagnosis of AF was the output used to train the model.
Results
We found 48 features significantly associated with ischemic stroke occurrence through regression analysis (p-value < 0.001). When the proposed deep learning model was applied to 150,989 AF patients, it was confirmed that the occurrence ischemic stroke was predicted to be higher AUROC (AUROC = 0.727 ± 0.003) compared to CHA2DS2-VASc score (AUROC = 0.651 ± 0.007) and other machine learning methods.
Conclusions
As part of preventive medicine, this study could help AF patients prepare for ischemic stroke prevention based on predicted stroke associated features and risk scores.
2020
Sunyong Yoo; Hyung Chae Yang; Seongyeong Lee; Jaewook Shin; Seyoung Min; Eunjoo Lee; Minkeun Song; Doheon Lee
Abstract | Links | BibTeX | Dimensions | Tags: Bioinformatics, Chemical property, Deep learning, Molecular interaction, Natural product, Network analysis, Text mining
@comment{Frontiers in Pharmacology article; the citation key is the article DOI.}
@article{10.3389/fphar.2020.584875,
  author     = {Sunyong Yoo and Hyung Chae Yang and Seongyeong Lee and Jaewook Shin and Seyoung Min and Eunjoo Lee and Minkeun Song and Doheon Lee},
  title      = {A Deep Learning-Based Approach for Identifying the Medicinal Uses of Plant-Derived Natural Compounds},
  journal    = {Frontiers in Pharmacology},
  volume     = {11},
  pages      = {584875},
  year       = {2020},
  date       = {2020-01-01},
  urldate    = {2020-01-01},
  doi        = {10.3389/fphar.2020.584875},
  issn       = {1663-9812},
  url        = {https://www.frontiersin.org/journals/pharmacology/articles/10.3389/fphar.2020.584875},
  keywords   = {Bioinformatics, Chemical property, Deep learning, Molecular interaction, Natural product, Network analysis, Text mining},
  abstract   = {Medicinal plants and their extracts have been used as important sources for drug discovery. In particular, plant-derived natural compounds, including phytochemicals, antioxidants, vitamins, and minerals, are gaining attention as they promote health and prevent disease. Although several in vitro methods have been developed to confirm the biological activities of natural compounds, there is still considerable room to reduce time and cost. To overcome these limitations, several in silico methods have been proposed for conducting large-scale analysis, but they are still limited in terms of dealing with incomplete and heterogeneous natural compound data. Here, we propose a deep learning-based approach to identify the medicinal uses of natural compounds by exploiting massive and heterogeneous drug and natural compound data. The rationale behind this approach is that deep learning can effectively utilize heterogeneous features to alleviate incomplete information. Based on latent knowledge, molecular interactions, and chemical property features, we generated 686 dimensional features for 4,507 natural compounds and 2,882 approved and investigational drugs. The deep learning model was trained using the generated features and verified drug indication information. When the features of natural compounds were applied as input to the trained model, potential efficacies were successfully predicted with high accuracy, sensitivity, and specificity.},
  pubstate   = {published},
  tppubtype  = {article}
}