@comment{
  Record cleaned up from an auto-exported single-line form:
  - each author is listed once (the export repeated the whole list twice);
  - the abstract is stored in the abstract field rather than note, so that
    styles do not print it in the reference list, and only one copy is kept;
  - month uses the standard jul macro instead of a braced literal.
  NOTE(review): Okazaki and Inui are taken to be family names; the export had
  them in the given-name position -- confirm against the paper itself.
  TODO: techreport entries require an institution field; the export had none.
  TODO: issue is kept exactly as exported; classic styles ignore it -- confirm
  whether it is meant to be the report number.
}
@techreport{oai:ipsj.ixsq.nii.ac.jp:00182709,
  author   = {Ran, Wensheng and Tian, Ran and Okazaki, Naoaki and Inui, Kentaro},
  title    = {Modeling Relations between Objects for Referring Expression Comprehension},
  issue    = {2},
  month    = jul,
  year     = {2017},
  abstract = {Referring Expression Comprehension (REC) is the task of pointing out the correct object in an image as corresponding to a given natural language expression. In this work, we improve a previous model of REC by explicitly aligning relations between mentions in the language expression to pairs of objects placed in specific relative positions in the image. Evaluation on the RefGoogle dataset [4] shows that our model outperforms previous work; we also find that, quite surprisingly, the image features extracted from a pre-trained convolution neural network as used by previous research are not as efficient to REC as automatically recognized category labels.},
}