@article{M08194F67,
  author   = {Toleubekova, Adel and Shim, Joo Yong and Piao, XinYu and Kim, Jong-Kook},
  title    = {Diffusion-Based Audio-to-Visual Generation for High-Quality Bird Images},
  journal  = {KIPS Transactions on Software and Data Engineering},
  year     = {2025},
  issn     = {2287-5905},
  doi      = {10.3745/TKIPS.2025.14.3.135},
  keywords = {Audio-to-visual generation, Diffusion models, Image generation, Audio features, Multi-modal generation},
  abstract = {Accurately identifying bird species from their vocalizations and generating corresponding bird images is still a challenging task due to limited training data and environmental noise in audio data. To address this limitation, this paper introduces a diffusion-based audio-to-image generation approach that satisfies both the need to accurately identify bird sounds and generate bird images. The main idea is to use a conditional diffusion model to handle the complexities of bird audio data, such as pitch variations and environmental noise while establishing a robust connection between the auditory and visual domains. This enables the model to generate high-quality bird images based on the given bird audio input. Plus, the proposed approach is integrated with deep audio processing to enhance its capabilities by meticulously aligning audio features with visual information and learning to map intricate acoustic patterns to corresponding visual representations. Experimental results demonstrate the effectiveness of the proposed approach in generating better images for bird classes compared to previous methods.},
}