@comment{
  Technical report on estimating depth images from Wi-Fi CSI (2023).
  Cleaned from a repository export: the author list and the abstract were
  each duplicated, author names had family and given parts swapped, and
  the abstract was stored in the printed note field.
  NOTE(review): institution is required for techreport entries but is
  absent from the exported record; the citation key suggests the
  Information Processing Society of Japan -- confirm before adding.
  NOTE(review): number was exported as issue = 31; confirm that it is the
  report number and add the volume identifier if one exists.
}
@techreport{oai:ipsj.ixsq.nii.ac.jp:00227960,
  author   = {Cao, Guanyu and Maekawa, Takuya and Ohara, Kazuya and Kishino, Yasue},
  title    = {Preliminary Investigation of Estimating Depth Images of Moving Objects from {Wi-Fi} Channel State Information},
  number   = {31},
  year     = {2023},
  month    = sep,
  abstract = {This paper explores a practical approach to estimate depth images of moving humans and objects by Wi-Fi Channel State Information (CSI). This technique enables visual sensing with commercial off-the-shelf Wi-Fi devices that saves expense while being invariant to illumination and occlusion, and friendly to privacy. However, training a cross-modality model usually requires a large amount of paired data. The variety of human postures further exacerbates the data requirement. We leverage Variational Auto-Encoder (VAE) to learn the regularized latent space of depth images in order to estimate unseen images. Besides, we adopt metric learning to learn the physical attributes of depth images. These strategies mitigate the labor of collecting paired data, and enable the use of in-the-wild depth image datasets. Concretely, a teacher-student network is established, where the teacher network is based on Beta-VAE that learns the latent space of depth images with the help of inductive bias containing shape, size, planar coordinates and depth coordinates. The student network is an encoder that learns such latent space via knowledge distillation. The student network and the decoder of the teacher network constitute an end-to-end depth image estimation network from CSI. To the best of our knowledge, our proposed model is the first to estimate the depth images from CSI.},
}