% Technical report entry: "Retrieval-Augmented Multi-Floor Building Image Generation" (2024).
% Cleaned up from a repository export:
%   - each author is listed once (the export repeated the three-name list twice);
%   - the abstract is stored once, in the abstract field, instead of twice in note,
%     so that styles do not print it as part of the reference;
%   - the month uses the predefined jun macro;
%   - the report number is stored in number (techreport styles do not read issue).
% NOTE(review): techreport requires an institution field; the export did not carry one,
%   so it must be added from the publisher record before this entry is cited.
% NOTE(review): author names are kept exactly as exported ("Last, First" form); confirm
%   that family/given order is not swapped in the export.
@techreport{oai:ipsj.ixsq.nii.ac.jp:00235071,
  author   = {Zhengyang, Wang and Hao, Jin and Haoran, Xie},
  title    = {Retrieval-Augmented Multi-Floor Building Image Generation},
  number   = {3},
  month    = jun,
  year     = {2024},
  abstract = {Demand for generating building images from text prompts grows, despite recent advances in diffusion models greatly enhancing image quality. The current generative models struggle with controlling the number of floors. To this end, we propose a retrieval-augmented framework for generating building images with provided floor count using a diffusion model. Initially, the text prompts with the provided floor count to retrieve the most suitable image from a building image database. Then, we adopted a multi-level structure detection algorithm to obtain a sketch from the matched image to ensure structural consistency. Finally, the building image with the desired floor count and style is generated by diffusion model, guided by the detected building sketch. Our proposed framework enables accurate control over the floor count in building image synthesis. We demonstrate the robustness and scalability of generating building images with a specific floor count from text prompts.},
}