@article{xu2022exploiting,
  title = {Exploiting Attention-Consistency Loss for Spatial-Temporal Stream Action Recognition},
  abstract = {Many current action recognition methods consider information from the spatial stream only. We propose a new perspective, inspired by the human visual system, that combines the spatial and temporal streams and measures their attention consistency. Specifically, we develop a branch-independent convolutional neural network (CNN) based algorithm with a novel attention-consistency loss, enabling the temporal stream to concentrate on the same discriminative regions as the spatial stream over the same period. The consistency loss is combined with the cross-entropy loss to enhance visual attention consistency. We evaluate the proposed method on two benchmark action recognition datasets: Kinetics400 and UCF101. Despite its apparent simplicity, our framework with attention consistency outperforms most two-stream networks, achieving 75.7\% top-1 accuracy on Kinetics400 and 95.7\% on UCF101, while reducing computational cost by 7.1\% relative to our baseline. In particular, the proposed method attains notable improvements on complex action classes, suggesting that our network can serve as a potential benchmark for handling complicated scenarios in Industry 4.0 applications.},
  doi = {10.1145/3538749},
  issn = {1551-6857},
  eissn = {1551-6865},
  journal = {ACM Transactions on Multimedia Computing, Communications, and Applications},
  volume = {18},
  number = {2S},
  publisher = {Association for Computing Machinery (ACM)},
  publicationstatus = {Published},
  url = {http://researchrepository.napier.ac.uk/Output/2876251},
  keywords = {Action Recognition, Attention Consistency, Multi-level Attention, Two-stream Structure},
  year = {2022},
  author = {Xu, Haotian and Jin, Xiaobo and Wang, Qiufeng and Hussain, Amir and Huang, Kaizhu}
}