Publications

Mumcu, F., Jones, M.J., Yilmaz, Y., Cherian, A., "ComplexVAD: Detecting Interaction Anomalies in Video", IEEE Winter Conference on Applications of Computer Vision (WACV) Workshop, February 2025.
BibTeX TR2025-016 PDF
- @inproceedings{Mumcu2025feb,
- author = {Mumcu, Furkan and Jones, Michael J. and Yilmaz, Yasin and Cherian, Anoop},
- title = {{ComplexVAD: Detecting Interaction Anomalies in Video}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV) Workshop},
- year = 2025,
- month = feb,
- url = {https://www.merl.com/publications/TR2025-016}
- }
He, Y., Shin, S., Cherian, A., Trigoni, N., Markham, A., "SoundLoc3D: Invisible 3D Sound Source Localization and Classification Using a Multimodal RGB-D Acoustic Camera", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2024, pp. 5408-5418.
BibTeX TR2025-003 PDF
- @inproceedings{He2024dec2,
- author = {He, Yuhang and Shin, Sangyun and Cherian, Anoop and Trigoni, Niki and Markham, Andrew},
- title = {{SoundLoc3D: Invisible 3D Sound Source Localization and Classification Using a Multimodal RGB-D Acoustic Camera}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year = 2024,
- pages = {5408--5418},
- month = dec,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2025-003}
- }
Zhang, J., Zhang, F., Rodriguez, C., Ben-Shabat, I., Cherian, A., Gould, S., "Temporally Grounding Instructional Diagrams in Unconstrained Videos", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2024, pp. 8090-8100.
BibTeX TR2025-002 PDF
- @inproceedings{Zhang2024dec,
- author = {Zhang, Jiahao and Zhang, Frederic and Rodriguez, Cristian and Ben-Shabat, Itzik and Cherian, Anoop and Gould, Stephen},
- title = {{Temporally Grounding Instructional Diagrams in Unconstrained Videos}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year = 2024,
- pages = {8090--8100},
- month = dec,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2025-002}
- }
Zhang, J., Cherian, A., Rodriguez, C., Deng, W., Gould, S., "Manual-PA: Learning 3D Part Assembly from Instruction Diagrams", arXiv, November 2024.
BibTeX arXiv
- @article{Zhang2024nov,
- author = {Zhang, Jiahao and Cherian, Anoop and Rodriguez, Cristian and Deng, Weijian and Gould, Stephen},
- title = {{Manual-PA: Learning 3D Part Assembly from Instruction Diagrams}},
- journal = {arXiv},
- year = 2024,
- month = nov,
- url = {https://arxiv.org/abs/2411.18011}
- }
Cherian, A., Corcodel, R., Jain, S., Romeres, D., "LLMPhy: Complex Physical Reasoning Using Large Language Models and World Models", arXiv, November 2024.
BibTeX arXiv
- @article{Cherian2024oct,
- author = {Cherian, Anoop and Corcodel, Radu and Jain, Siddarth and Romeres, Diego},
- title = {{LLMPhy: Complex Physical Reasoning Using Large Language Models and World Models}},
- journal = {arXiv},
- year = 2024,
- month = nov,
- url = {https://arxiv.org/abs/2411.08027}
- }
Cherian, A., Peng, K.-C., Lohit, S., Matthiesen, J., Smith, K., Tenenbaum, J.B., "Evaluating Large Vision-and-Language Models on Children’s Mathematical Olympiads", Advances in Neural Information Processing Systems (NeurIPS), November 2024, pp. 15779-15800.
BibTeX TR2024-160 PDF Video Presentation
- @inproceedings{Cherian2024nov,
- author = {Cherian, Anoop and Peng, Kuan-Chuan and Lohit, Suhas and Matthiesen, Joanna and Smith, Kevin and Tenenbaum, Joshua B.},
- title = {{Evaluating Large Vision-and-Language Models on Children’s Mathematical Olympiads}},
- booktitle = {Advances in Neural Information Processing Systems (NeurIPS)},
- year = 2024,
- pages = {15779--15800},
- month = nov,
- publisher = {NeurIPS Proceedings},
- url = {https://www.merl.com/publications/TR2024-160}
- }
Cherian, A., Jain, S., Marks, T.K., "Few-shot Transparent Instance Segmentation for Bin Picking", IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), September 2024, pp. 5009-5016.
BibTeX TR2024-127 PDF Video
- @inproceedings{Cherian2024sep,
- author = {Cherian, Anoop and Jain, Siddarth and Marks, Tim K.},
- title = {{Few-shot Transparent Instance Segmentation for Bin Picking}},
- booktitle = {2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
- year = 2024,
- pages = {5009--5016},
- month = sep,
- publisher = {IEEE},
- url = {https://www.merl.com/publications/TR2024-127}
- }
Yin, J., Luo, A., Du, Y., Cherian, A., Marks, T.K., Le Roux, J., Gan, C., "Disentangled Acoustic Fields For Multimodal Physical Scene Understanding", IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), September 2024, pp. 557-564.
BibTeX TR2024-125 PDF
- @inproceedings{Yin2024sep,
- author = {Yin, Jie and Luo, Andrew and Du, Yilun and Cherian, Anoop and Marks, Tim K. and {Le Roux}, Jonathan and Gan, Chuang},
- title = {{Disentangled Acoustic Fields For Multimodal Physical Scene Understanding}},
- booktitle = {2024 IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
- year = 2024,
- pages = {557--564},
- month = sep,
- publisher = {IEEE},
- url = {https://www.merl.com/publications/TR2024-125}
- }
Zhang, J., Zhang, F., Rodriguez, C., Ben-Shabat, I., Cherian, A., Gould, S., "Temporally Grounding Instructional Diagrams in Unconstrained Videos", arXiv, July 2024.
BibTeX arXiv
- @article{Zhang2024jul4,
- author = {Zhang, Jiahao and Zhang, Frederic and Rodriguez, Cristian and Ben-Shabat, Itzik and Cherian, Anoop and Gould, Stephen},
- title = {{Temporally Grounding Instructional Diagrams in Unconstrained Videos}},
- journal = {arXiv},
- year = 2024,
- month = jul,
- url = {https://arxiv.org/abs/2407.12066}
- }
Ni, H., Egger, B., Lohit, S., Cherian, A., Wang, Y., Koike-Akino, T., Huang, S.X., Marks, T.K., "TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion Models", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), June 2024, pp. 9015-9025.
BibTeX TR2024-059 PDF Video Software Presentation
- @inproceedings{Ni2024jun,
- author = {Ni, Haomiao and Egger, Bernhard and Lohit, Suhas and Cherian, Anoop and Wang, Ye and Koike-Akino, Toshiaki and Huang, Sharon X. and Marks, Tim K.},
- title = {{TI2V-Zero: Zero-Shot Image Conditioning for Text-to-Video Diffusion Models}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2024,
- pages = {9015--9025},
- month = jun,
- url = {https://www.merl.com/publications/TR2024-059}
- }
He, Y., Cherian, A., Wichern, G., Markham, A., "Deep Neural Room Acoustics Primitive", International Conference on Machine Learning (ICML), June 2024, pp. 17842-17857.
BibTeX TR2024-072 PDF
- @inproceedings{He2024jun,
- author = {He, Yuhang and Cherian, Anoop and Wichern, Gordon and Markham, Andrew},
- title = {{Deep Neural Room Acoustics Primitive}},
- booktitle = {International Conference on Machine Learning (ICML)},
- year = 2024,
- pages = {17842--17857},
- month = jun,
- url = {https://www.merl.com/publications/TR2024-072}
- }
Yang, Z., Liu, J., Chen, P., Cherian, A., Marks, T.K., Le Roux, J., Gan, C., "RILA: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), April 2024, pp. 16251-16261.
BibTeX TR2024-043 PDF
- @inproceedings{Yang2024apr,
- author = {Yang, Zeyuan and Liu, Jiageng and Chen, Peihao and Cherian, Anoop and Marks, Tim K. and {Le Roux}, Jonathan and Gan, Chuang},
- title = {{RILA: Reflective and Imaginative Language Agent for Zero-Shot Semantic Audio-Visual Navigation}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2024,
- pages = {16251--16261},
- month = apr,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2024-043}
- }
Zhu, X., Jha, D.K., Romeres, D., Sun, L., Tomizuka, M., Cherian, A., "Multi-level Reasoning for Robotic Assembly: From Sequence Inference to Contact Selection", IEEE International Conference on Robotics and Automation (ICRA), March 2024, pp. 816-823.
BibTeX TR2024-033 PDF Video
- @inproceedings{Zhu2024mar,
- author = {Zhu, Xinghao and Jha, Devesh K. and Romeres, Diego and Sun, Lingfeng and Tomizuka, Masayoshi and Cherian, Anoop},
- title = {{Multi-level Reasoning for Robotic Assembly: From Sequence Inference to Contact Selection}},
- booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
- year = 2024,
- pages = {816--823},
- month = mar,
- publisher = {IEEE},
- url = {https://www.merl.com/publications/TR2024-033}
- }
Hori, C., Wang, P., Rahman, M., Vaca-Rubio, C., Khurana, S., Cherian, A., Le Roux, J., "Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP48485.2024.10447600, March 2024, pp. 13296-13300.
BibTeX TR2024-012 PDF
- @inproceedings{Hori2024mar,
- author = {Hori, Chiori and Wang, Pu and Rahman, Mahbub and Vaca-Rubio, Cristian and Khurana, Sameer and Cherian, Anoop and {Le Roux}, Jonathan},
- title = {{Wi-Fi based Indoor Monitoring Enhanced by Multimodal Fusion}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2024,
- pages = {13296--13300},
- month = mar,
- publisher = {IEEE},
- doi = {10.1109/ICASSP48485.2024.10447600},
- issn = {2379-190X},
- isbn = {979-8-3503-4485-1},
- url = {https://www.merl.com/publications/TR2024-012}
- }
Carmichael, Z., Lohit, S., Cherian, A., Jones, M.J., Scheirer, W., "Pixel-Grounded Prototypical Part Networks", IEEE Winter Conference on Applications of Computer Vision (WACV), DOI: 10.1109/WACV57701.2024.00470, January 2024.
BibTeX TR2024-002 PDF Video Software Presentation
- @inproceedings{Carmichael2024jan,
- author = {Carmichael, Zachariah and Lohit, Suhas and Cherian, Anoop and Jones, Michael J. and Scheirer, Walter},
- title = {{Pixel-Grounded Prototypical Part Networks}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year = 2024,
- month = jan,
- doi = {10.1109/WACV57701.2024.00470},
- url = {https://www.merl.com/publications/TR2024-002}
- }
Liu, X., Paul, S., Chatterjee, M., Cherian, A., "CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments", AAAI Conference on Artificial Intelligence, DOI: 10.1609/aaai.v38i4.28167, December 2023, pp. 3765-3773.
BibTeX TR2023-154 PDF
- @inproceedings{Liu2023dec2,
- author = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
- title = {{CAVEN: An Embodied Conversational Agent for Efficient Audio-Visual Navigation in Noisy Environments}},
- booktitle = {Proceedings of the 38th AAAI Conference on Artificial Intelligence},
- year = 2023,
- pages = {3765--3773},
- month = dec,
- doi = {10.1609/aaai.v38i4.28167},
- url = {https://www.merl.com/publications/TR2023-154}
- }
He, Y., Shin, S., Cherian, A., Markham, A., Trigoni, N., "Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2023, pp. 5496-5507.
BibTeX TR2023-144 PDF
- @inproceedings{He2023dec,
- author = {He, Yuhang and Shin, Sangyun and Cherian, Anoop and Markham, Andrew and Trigoni, Niki},
- title = {{Sound3DVDet: 3D Sound Source Detection using Multiview Microphone Array and RGB Images}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year = 2023,
- pages = {5496--5507},
- month = dec,
- url = {https://www.merl.com/publications/TR2023-144}
- }
Nair, N.G., Cherian, A., Lohit, S., Wang, Y., Koike-Akino, T., Patel, V.M., Marks, T.K., "Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis", IEEE International Conference on Computer Vision (ICCV), October 2023, pp. 20850-20860.
BibTeX TR2023-126 PDF Software Presentation
- @inproceedings{Nair2023sep,
- author = {Nair, Nithin Gopalakrishnan and Cherian, Anoop and Lohit, Suhas and Wang, Ye and Koike-Akino, Toshiaki and Patel, Vishal M. and Marks, Tim K.},
- title = {{Steered Diffusion: A Generalized Framework for Plug-and-Play Conditional Image Synthesis}},
- booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision},
- year = 2023,
- pages = {20850--20860},
- month = oct,
- publisher = {IEEE/CVF},
- url = {https://www.merl.com/publications/TR2023-126}
- }
Liu, X., Paul, S., Chatterjee, M., Cherian, A., "Active Sparse Conversations for Improved Audio-Visual Embodied Navigation", arXiv, June 2023.
BibTeX arXiv
- @article{Liu2023jun,
- author = {Liu, Xiulong and Paul, Sudipta and Chatterjee, Moitreya and Cherian, Anoop},
- title = {{Active Sparse Conversations for Improved Audio-Visual Embodied Navigation}},
- journal = {arXiv},
- year = 2023,
- month = jun,
- url = {https://arxiv.org/abs/2306.04047}
- }
Cherian, A., Jain, S., Marks, T.K., Sullivan, A., "Discriminative 3D Shape Modeling for Few-Shot Instance Segmentation", IEEE International Conference on Robotics and Automation (ICRA), DOI: 10.1109/ICRA48891.2023.10160644, May 2023, pp. 9296-9302.
BibTeX TR2023-010 PDF Presentation
- @inproceedings{Cherian2023may,
- author = {Cherian, Anoop and Jain, Siddarth and Marks, Tim K. and Sullivan, Alan},
- title = {{Discriminative 3D Shape Modeling for Few-Shot Instance Segmentation}},
- booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
- year = 2023,
- pages = {9296--9302},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICRA48891.2023.10160644},
- url = {https://www.merl.com/publications/TR2023-010}
- }
Ota, K., Tung, H.-Y., Smith, K., Cherian, A., Marks, T.K., Sullivan, A., Kanezaki, A., Tenenbaum, J.B., "H-SAUR: Hypothesize, Simulate, Act, Update, and Repeat for Understanding Object Articulations from Interactions", IEEE International Conference on Robotics and Automation (ICRA), DOI: 10.1109/ICRA48891.2023.10160575, May 2023, pp. 7272-7278.
BibTeX TR2023-009 PDF
- @inproceedings{Ota2023may,
- author = {Ota, Kei and Tung, Hsiao-Yu and Smith, Kevin and Cherian, Anoop and Marks, Tim K. and Sullivan, Alan and Kanezaki, Asako and Tenenbaum, Joshua B.},
- title = {{H-SAUR: Hypothesize, Simulate, Act, Update, and Repeat for Understanding Object Articulations from Interactions}},
- booktitle = {IEEE International Conference on Robotics and Automation (ICRA)},
- year = 2023,
- pages = {7272--7278},
- month = may,
- publisher = {IEEE},
- doi = {10.1109/ICRA48891.2023.10160575},
- url = {https://www.merl.com/publications/TR2023-009}
- }
Shah, A., Roy, A., Shah, K., Mishra, S.K., Jacobs, D., Cherian, A., Chellappa, R., "HaLP: Hallucinating Latent Positives for Skeleton-based Self-Supervised Learning of Actions", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), May 2023, pp. 18846-18856.
BibTeX TR2023-035 PDF
- @inproceedings{Shah2023may,
- author = {Shah, Anshul and Roy, Aniket and Shah, Ketul and Mishra, Shlok Kumar and Jacobs, David and Cherian, Anoop and Chellappa, Rama},
- title = {{HaLP: Hallucinating Latent Positives for Skeleton-based Self-Supervised Learning of Actions}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2023,
- pages = {18846--18856},
- month = may,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2023-035}
- }
Zhang, J., Cherian, A., Liu, Y., Shabat, I.B., Rodriguez, C., Gould, S., "Aligning Step-by-Step Instructional Diagrams to Video Demonstrations", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), May 2023, pp. 2483-2492.
BibTeX TR2023-034 PDF
- @inproceedings{Zhang2023may,
- author = {Zhang, Jiahao and Cherian, Anoop and Liu, Yanbin and Shabat, Itzik Ben and Rodriguez, Cristian and Gould, Stephen},
- title = {{Aligning Step-by-Step Instructional Diagrams to Video Demonstrations}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2023,
- pages = {2483--2492},
- month = may,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2023-034}
- }
Cherian, A., Peng, K.-C., Lohit, S., Smith, K., Tenenbaum, J.B., "Are Deep Neural Networks SMARTer than Second Graders?", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), March 2023, pp. 10834-10844.
BibTeX TR2023-014 PDF Video Data Software Presentation
- @inproceedings{Cherian2023mar,
- author = {Cherian, Anoop and Peng, Kuan-Chuan and Lohit, Suhas and Smith, Kevin and Tenenbaum, Joshua B.},
- title = {{Are Deep Neural Networks SMARTer than Second Graders?}},
- booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
- year = 2023,
- pages = {10834--10844},
- month = mar,
- publisher = {CVF},
- url = {https://www.merl.com/publications/TR2023-014}
- }
Liu, T., Cherian, A., "Learning a Constrained Optimizer: A Primal Method", AAAI Bridge on Constraint Programming and Machine Learning, January 2023.
BibTeX TR2023-003 PDF
- @inproceedings{Liu2023jan,
- author = {Liu, Tao and Cherian, Anoop},
- title = {{Learning a Constrained Optimizer: A Primal Method}},
- booktitle = {AAAI Bridge on Constraint Programming and Machine Learning},
- year = 2023,
- month = jan,
- url = {https://www.merl.com/publications/TR2023-003}
- }