@techreport{oai:ipsj.ixsq.nii.ac.jp:00241628,
  author = {Zhao, Xin and Jiang, Zehui and Yoshinaga, Naoki},
  issue  = {8},
  month  = {Dec},
  note   = {Neurons in feed-forward layers of Transformers have shown the ability to store factual knowledge. However, previous analyses mostly focused on qualitative evaluation, leaving the numerical relationship between neuron activations and model outputs less understood. Our study conducts a quantitative analysis through neuron-wise intervention experiments using the knowledge probing dataset. Our findings first reveal that neurons exhibit linearity and polarity in producing output token probabilities, quantified by ``neuron empirical gradients.'' Empirical gradients provide a direct measure of neurons' importance in representing knowledge. However, neuron-wise intervention experiments are costly, making it impractical to obtain empirical gradients in large language models. To address this, we propose NeurGrad, an efficient method for measuring neuron empirical gradients. Our experimental results show that NeurGrad outperforms several baseline methods in both efficiency and accuracy.},
  title  = {Linear Effect of Neuron Activations in Transformer-based Language Models},
  year   = {2024}
}