@techreport{oai:ipsj.ixsq.nii.ac.jp:00231227,
  author        = {Molina, Ivo Lodovico and Minematsu, Tsubasa and Shimada, Atsushi},
  title         = {A Comparative Analysis of Large Language Models for Contextually Relevant Question Generation in Education},
  institution   = {Information Processing Society of Japan},
  number        = {25},
  month         = nov,
  year          = {2023},
  abstract      = {This paper explores the potential of large language models (LLMs) for Automatic Question Generation in educational contexts. We compare three models - GPT-3.5 Turbo, Flan T5 XXL, and Llama 2-Chat 13B - on their ability to generate relevant questions from university slide text without finetuning. Questions were generated in a two-step pipeline: first, answer phrases were extracted from slides using Llama 2-Chat 13B; then, questions were generated for each answer by the three models. To evaluate question quality, a survey was conducted where students rated 144 questions across five metrics: clarity, relevance, difficulty, slide relation, and answer correctness. Results showed GPT-3.5 and Llama 2-Chat 13B outperformed Flan T5 XXL overall, with lower scores on clarity and answer-question alignment for Flan T5 XXL. GPT-3.5 excelled at tailoring questions to match input answers. While isolated questions seemed coherent for all models, Llama 2-Chat 13B and Flan T5 XXL showed weaker alignment between generated questions and answers compared to GPT-3.5. This research analyzes the capacity of LLMs for Automatic Question Generation to enhance education, particularly GPT-3.5 and Llama 2-Chat 13B, without any finetuning. Further work is needed to optimize models and methodologies to continually improve question relevance and quality.},
  internal-note = {Review notes: de-duplicated the author list and the abstract (both were repeated verbatim in the repository export); normalized names from "Given, Family" to BibTeX "Family, Given" order; moved the abstract out of the printed `note` field; `issue` renamed to the standard `number`; `month` changed to the `nov` macro. Assumption to confirm: institution inferred from the OAI identifier (ipsj.ixsq.nii.ac.jp); verify against the original SIG technical report.},
}