@techreport{oai:ipsj.ixsq.nii.ac.jp:00211879,
  author      = {Zimmermann, Maximilian and Dang, Thang and Tabaru, Tsuguchika and Ike, Atsushi},
  title       = {Analysis of Optimised {Transformer} Models in Image Captioning Tasks},
  institution = {Information Processing Society of Japan},
  number      = {11},
  month       = jul,
  year        = {2021},
  abstract    = {This research work is about using Transformer models, which are first introduced in the paper ``Attention is All You Need'', for a multimodal task, specifically image captioning. By treating it as an NLP translation task, different Transformer models are evaluated and optimised. Through the analysis of the data, model and distributed communication pipeline, bottlenecks are identified and performance increases in regards to accuracy and speed are shown across multiple accelerators.},
}