% Journal article: Korean software-requirements classification with imbalanced-data handling (EDA augmentation + undersampling).
% NOTE(review): volume/number/pages are not recorded; the DOI suffix (2023.12.7.295) suggests vol. 12, no. 7, first page 295 -- confirm against the publisher record before adding.
@article{M5B984FEC,
  author   = {Choi, Jong-Woo and Lee, Young-Jun and Lim, Chae-Gyun and Choi, Ho-Jin},
  title    = {A Study on Improving Performance of Software Requirements Classification Models by Handling Imbalanced Data},
  journal  = {KIPS Transactions on Software and Data Engineering},
  year     = {2023},
  issn     = {2287-5905},
  doi      = {10.3745/KTSDE.2023.12.7.295},
  keywords = {Requirements Classification, Imbalanced Data, Data Augmentation, Undersampling, BERT},
  abstract = {Software requirements written in natural language may have different meanings from the stakeholders' viewpoint. When designing an architecture based on quality attributes, it is necessary to accurately classify quality attribute requirements because the efficient design is possible only when appropriate architectural tactics for each quality attribute are selected. As a result, although many natural language processing models have been studied for the classification of requirements, which is a high-cost task, few topics improve classification performance with the imbalanced quality attribute datasets. In this study, we first show that the classification model can automatically classify the Korean requirement dataset through experiments. Based on these results, we explain that data augmentation through EDA (Easy Data Augmentation) techniques and undersampling strategies can improve the imbalance of quality attribute datasets, and show that they are effective in classifying requirements. The results improved by 5.24\%p on F1-score, indicating that handling imbalanced data helps classify Korean requirements of classification models. Furthermore, detailed experiments of EDA illustrate operations that help improve classification performance.},
}