Publications

Araki, S., Ito, N., Haeb-Umbach, R., Wichern, G., Wang, Z.-Q., Mitsufuji, Y., "30+ Years of Source Separation Research: Achievements and Future Challenges", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-036 PDF
- @inproceedings{Araki2025mar,
- author    = {Araki, Shoko and Ito, Nobutaka and Haeb-Umbach, Reinhold and Wichern, Gordon and Wang, Zhong-Qiu and Mitsufuji, Yuki},
- title     = {{30+ Years of Source Separation Research: Achievements and Future Challenges}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year      = {2025},
- month     = mar,
- url       = {https://www.merl.com/publications/TR2025-036},
- }
Ebbers, J., Germain, F.G., Wilkinghoff, K., Wichern, G., Le Roux, J., "No Class Left Behind: A Closer Look at Class Balancing for Audio Tagging", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-037 PDF
- @inproceedings{Ebbers2025mar,
- author = {Ebbers, Janek and Germain, François G. and Wilkinghoff, Kevin and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{No Class Left Behind: A Closer Look at Class Balancing for Audio Tagging}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = mar,
- url = {https://www.merl.com/publications/TR2025-037}
- }
Gruttadauria, E., Fontaine, M., Le Roux, J., Essid, S., "O-EENC-SD: Efficient Online End-to-End Neural Clustering for Speaker Diarization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-031 PDF
- @inproceedings{Gruttadauria2025mar,
- author    = {Gruttadauria, Elio and Fontaine, Mathieu and {Le Roux}, Jonathan and Essid, Slim},
- title     = {{O-EENC-SD: Efficient Online End-to-End Neural Clustering for Speaker Diarization}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year      = {2025},
- month     = mar,
- url       = {https://www.merl.com/publications/TR2025-031},
- }
Hori, C., Kambara, M., Sugiura, K., Ota, K., Khurana, S., Jain, S., Corcodel, R., Jha, D.K., Romeres, D., Le Roux, J., "Interactive Robot Action Replanning using Multimodal LLM Trained from Human Demonstration Videos", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-034 PDF
- @inproceedings{Hori2025mar,
- author    = {Hori, Chiori and Kambara, Motonari and Sugiura, Komei and Ota, Kei and Khurana, Sameer and Jain, Siddarth and Corcodel, Radu and Jha, Devesh K. and Romeres, Diego and {Le Roux}, Jonathan},
- title     = {{Interactive Robot Action Replanning using Multimodal LLM Trained from Human Demonstration Videos}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year      = {2025},
- month     = mar,
- url       = {https://www.merl.com/publications/TR2025-034},
- }
Masuyama, Y., Wichern, G., Germain, F.G., Ick, C., Le Roux, J., "Retrieval-Augmented Neural Field for HRTF Upsampling and Personalization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-029 PDF Software
- @inproceedings{Masuyama2025mar,
- author = {Masuyama, Yoshiki and Wichern, Gordon and Germain, François G. and Ick, Christopher and {Le Roux}, Jonathan},
- title = {{Retrieval-Augmented Neural Field for HRTF Upsampling and Personalization}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = mar,
- url = {https://www.merl.com/publications/TR2025-029}
- }
Saijo, K., Ebbers, J., Germain, F.G., Khurana, S., Wichern, G., Le Roux, J., "Leveraging Audio-Only Data for Text-Queried Target Sound Extraction", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-033 PDF
- @inproceedings{Saijo2025mar2,
- author = {Saijo, Kohei and Ebbers, Janek and Germain, François G. and Khurana, Sameer and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Leveraging Audio-Only Data for Text-Queried Target Sound Extraction}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = mar,
- url = {https://www.merl.com/publications/TR2025-033}
- }
Saijo, K., Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Task-Aware Unified Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-032 PDF
- @inproceedings{Saijo2025mar,
- author = {Saijo, Kohei and Ebbers, Janek and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Task-Aware Unified Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = mar,
- url = {https://www.merl.com/publications/TR2025-032}
- }
Wilkinghoff, K., Yang, H., Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Keeping the Balance: Anomaly Score Calculation for Domain Generalization", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), March 2025.
BibTeX TR2025-030 PDF
- @inproceedings{Wilkinghoff2025mar,
- author = {Wilkinghoff, Kevin and Yang, Haici and Ebbers, Janek and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Keeping the Balance: Anomaly Score Calculation for Domain Generalization}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2025,
- month = mar,
- url = {https://www.merl.com/publications/TR2025-030}
- }
Tian, J., Shi, J., Chen, W., Arora, S., Masuyama, Y., Maekaku, T., Wu, Y., Peng, J., Bharadwaj, S., Zhao, Y., Cornell, S., Peng, Y., Yue, X., Yang, C.-H.H., Neubig, G., Watanabe, S., "ESPnet-SpeechLM: An Open Speech Language Model Toolkit", NAACL-HLT (the system demonstration track), March 2025.
BibTeX TR2025-038 PDF
- @inproceedings{Tian2025mar,
- author = {Tian, Jinchuan and Shi, Jiatong and Chen, William and Arora, Siddhant and Masuyama, Yoshiki and Maekaku, Takashi and Wu, Yihan and Peng, Junyi and Bharadwaj, Shikhar and Zhao, Yiwen and Cornell, Samuele and Peng, Yifan and Yue, Xiang and Yang, Chao-Han H. and Neubig, Graham and Watanabe, Shinji},
- title = {{ESPnet-SpeechLM: An Open Speech Language Model Toolkit}},
- booktitle = {NAACL-HLT (the system demonstration track)},
- year = 2025,
- month = mar,
- url = {https://www.merl.com/publications/TR2025-038}
- }
Koo, J., Wichern, G., Germain, F.G., Khurana, S., Le Roux, J., "SMITIN: Self-Monitored Inference-Time INtervention for Generative Music Transformers", IEEE Open Journal of Signal Processing, DOI: 10.1109/OJSP.2025.3534686, Vol. 6, pp. 266-275, January 2025.
BibTeX TR2025-012 PDF Software
- @article{Koo2025jan,
- author = {Koo, Junghyun and Wichern, Gordon and Germain, François G. and Khurana, Sameer and {Le Roux}, Jonathan},
- title = {{SMITIN: Self-Monitored Inference-Time INtervention for Generative Music Transformers}},
- journal = {IEEE Open Journal of Signal Processing},
- year = 2025,
- volume = 6,
- pages = {266--275},
- month = jan,
- doi = {10.1109/OJSP.2025.3534686},
- issn = {2644-1322},
- url = {https://www.merl.com/publications/TR2025-012}
- }
He, Y., Shin, S., Cherian, A., Trigoni, N., Markham, A., "SoundLoc3D: Invisible 3D Sound Source Localization and Classification Using a Multimodal RGB-D Acoustic Camera", IEEE Winter Conference on Applications of Computer Vision (WACV), December 2024, pp. 5408-5418.
BibTeX TR2025-003 PDF
- @inproceedings{He2024dec2,
- author    = {He, Yuhang and Shin, Sangyun and Cherian, Anoop and Trigoni, Niki and Markham, Andrew},
- title     = {{SoundLoc3D: Invisible 3D Sound Source Localization and Classification Using a Multimodal RGB-D Acoustic Camera}},
- booktitle = {IEEE Winter Conference on Applications of Computer Vision (WACV)},
- year      = {2024},
- pages     = {5408--5418},
- month     = dec,
- publisher = {CVF},
- url       = {https://www.merl.com/publications/TR2025-003},
- }
Ick, C., Wichern, G., Masuyama, Y., Germain, F.G., Le Roux, J., "Spatially-Aware Losses for Enhanced Neural Acoustic Fields", NeurIPS 2024 Audio Imagination Workshop, December 2024.
BibTeX TR2024-169 PDF
- @inproceedings{Ick2024dec,
- author = {Ick, Christopher and Wichern, Gordon and Masuyama, Yoshiki and Germain, François G. and {Le Roux}, Jonathan},
- title = {{Spatially-Aware Losses for Enhanced Neural Acoustic Fields}},
- booktitle = {NeurIPS 2024 Audio Imagination Workshop},
- year = 2024,
- month = dec,
- url = {https://www.merl.com/publications/TR2024-169}
- }
Cornell, S., Ebbers, J., Douwes, C., Martin-Morato, I., Harju, M., Mesaros, A., Serizel, R., "DCASE 2024 Task 4: Sound Event Detection with Heterogeneous Data and Missing Labels", Detection and Classification of Acoustic Scenes and Events (DCASE) Workshop, October 2024, pp. 31-35.
BibTeX TR2024-146 PDF
- @inproceedings{Cornell2024oct,
- author    = {Cornell, Samuele and Ebbers, Janek and Douwes, Constance and Martin-Morato, Irene and Harju, Manu and Mesaros, Annamaria and Serizel, Romain},
- title     = {{DCASE 2024 Task 4: Sound Event Detection with Heterogeneous Data and Missing Labels}},
- booktitle = {Detection and Classification of Acoustic Scenes and Events (DCASE) Workshop},
- year      = {2024},
- pages     = {31--35},
- month     = oct,
- url       = {https://www.merl.com/publications/TR2024-146},
- }
Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/IWAENC61483.2024.10694313, September 2024, pp. 205-209.
BibTeX TR2024-126 PDF Software
- @inproceedings{Saijo2024sep2,
- author = {Saijo, Kohei and Wichern, Gordon and Germain, François G. and Pan, Zexu and {Le Roux}, Jonathan},
- title = {{TF-Locoformer: Transformer with Local Modeling by Convolution for Speech Separation and Enhancement}},
- booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
- year = 2024,
- pages = {205--209},
- month = sep,
- doi = {10.1109/IWAENC61483.2024.10694313},
- issn = {2835-3439},
- isbn = {979-8-3503-6185-8},
- url = {https://www.merl.com/publications/TR2024-126}
- }
Yin, J., Luo, A., Du, Y., Cherian, A., Marks, T.K., Le Roux, J., Gan, C., "Disentangled Acoustic Fields For Multimodal Physical Scene Understanding", IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS), September 2024, pp. 557-564.
BibTeX TR2024-125 PDF
- @inproceedings{Yin2024sep,
- author = {Yin, Jie and Luo, Andrew and Du, Yilun and Cherian, Anoop and Marks, Tim K. and {Le Roux}, Jonathan and Gan, Chuang},
- title = {{Disentangled Acoustic Fields For Multimodal Physical Scene Understanding}},
- booktitle = {IEEE/RSJ International Conference on Intelligent Robots and Systems (IROS)},
- year = 2024,
- pages = {557--564},
- month = sep,
- publisher = {IEEE},
- url = {https://www.merl.com/publications/TR2024-125}
- }
Bahrman, L., Fontaine, M., Le Roux, J., Richard, G., "Speech Dereverberation Constrained on Room Impulse Response Characteristics", Interspeech, DOI: 10.21437/Interspeech.2024-1173, September 2024, pp. 622-626.
BibTeX TR2024-121 PDF
- @inproceedings{Bahrman2024sep,
- author    = {Bahrman, Louis and Fontaine, Mathieu and {Le Roux}, Jonathan and Richard, Gaël},
- title     = {{Speech Dereverberation Constrained on Room Impulse Response Characteristics}},
- booktitle = {Interspeech},
- year      = {2024},
- pages     = {622--626},
- month     = sep,
- doi       = {10.21437/Interspeech.2024-1173},
- issn      = {2958-1796},
- url       = {https://www.merl.com/publications/TR2024-121},
- }
Ebbers, J., Germain, F.G., Wichern, G., Le Roux, J., "Sound Event Bounding Boxes", Interspeech, DOI: 10.21437/Interspeech.2024-2075, September 2024, pp. 562-566.
BibTeX TR2024-118 PDF Software
- @inproceedings{Ebbers2024sep,
- author = {Ebbers, Janek and Germain, François G. and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Sound Event Bounding Boxes}},
- booktitle = {Interspeech},
- year = 2024,
- pages = {562--566},
- month = sep,
- doi = {10.21437/Interspeech.2024-2075},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-118}
- }
Khurana, S., Hori, C., Laurent, A., Wichern, G., Le Roux, J., "ZeroST: Zero-Shot Speech Translation", Interspeech, DOI: 10.21437/Interspeech.2024-1088, September 2024, pp. 392-396.
BibTeX TR2024-122 PDF
- @inproceedings{Khurana2024sep,
- author    = {Khurana, Sameer and Hori, Chiori and Laurent, Antoine and Wichern, Gordon and {Le Roux}, Jonathan},
- title     = {{ZeroST: Zero-Shot Speech Translation}},
- booktitle = {Interspeech},
- year      = {2024},
- pages     = {392--396},
- month     = sep,
- doi       = {10.21437/Interspeech.2024-1088},
- issn      = {2958-1796},
- url       = {https://www.merl.com/publications/TR2024-122},
- }
Pan, Z., Wichern, G., Germain, F.G., Saijo, K., Le Roux, J., "PARIS: Pseudo-AutoRegressIve Siamese Training for Online Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2024-1066, September 2024, pp. 582-586.
BibTeX TR2024-124 PDF
- @inproceedings{Pan2024sep,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Saijo, Kohei and {Le Roux}, Jonathan},
- title = {{PARIS: Pseudo-AutoRegressIve Siamese Training for Online Speech Separation}},
- booktitle = {Interspeech},
- year = 2024,
- pages = {582--586},
- month = sep,
- doi = {10.21437/Interspeech.2024-1066},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-124}
- }
Saijo, K., Wichern, G., Germain, F.G., Pan, Z., Le Roux, J., "Enhanced Reverberation as Supervision for Unsupervised Speech Separation", Interspeech, DOI: 10.21437/Interspeech.2024-1241, September 2024, pp. 607-611.
BibTeX TR2024-116 PDF Software
- @inproceedings{Saijo2024sep,
- author = {Saijo, Kohei and Wichern, Gordon and Germain, François G. and Pan, Zexu and {Le Roux}, Jonathan},
- title = {{Enhanced Reverberation as Supervision for Unsupervised Speech Separation}},
- booktitle = {Interspeech},
- year = 2024,
- pages = {607--611},
- month = sep,
- doi = {10.21437/Interspeech.2024-1241},
- issn = {2958-1796},
- url = {https://www.merl.com/publications/TR2024-116}
- }
Mitsui, Y., Aihara, R., Hori, T., Le Roux, J., Taguchi, S., "Exploring Keyword Enrollment for Japanese End-to-End Automatic Speech Recognition using Contextual Biasing", OTOGAKU Symposium, June 2024.
BibTeX TR2024-073 PDF
- @inproceedings{Mitsui2024jun,
- author    = {Mitsui, Yoshiki and Aihara, Ryo and Hori, Takaaki and {Le Roux}, Jonathan and Taguchi, Shinya},
- title     = {{Exploring Keyword Enrollment for Japanese End-to-End Automatic Speech Recognition using Contextual Biasing}},
- booktitle = {OTOGAKU Symposium},
- year      = {2024},
- month     = jun,
- publisher = {Information Processing Society of Japan},
- issn      = {2188-8663},
- url       = {https://www.merl.com/publications/TR2024-073},
- }
He, Y., Cherian, A., Wichern, G., Markham, A., "Deep Neural Room Acoustics Primitive", International Conference on Machine Learning (ICML), June 2024, pp. 17842-17857.
BibTeX TR2024-072 PDF
- @inproceedings{He2024jun,
- author    = {He, Yuhang and Cherian, Anoop and Wichern, Gordon and Markham, Andrew},
- title     = {{Deep Neural Room Acoustics Primitive}},
- booktitle = {International Conference on Machine Learning (ICML)},
- year      = {2024},
- pages     = {17842--17857},
- month     = jun,
- url       = {https://www.merl.com/publications/TR2024-072},
- }
Kambara, M., Hori, C., Sugiura, K., Ota, K., Jha, D.K., Khurana, S., Jain, S., Corcodel, R., Romeres, D., Le Roux, J., "Human Action Understanding-based Robot Planning using Multimodal LLM", IEEE International Conference on Robotics and Automation (ICRA) Workshop, June 2024.
BibTeX TR2024-066 PDF
- @inproceedings{Kambara2024jun,
- author    = {Kambara, Motonari and Hori, Chiori and Sugiura, Komei and Ota, Kei and Jha, Devesh K. and Khurana, Sameer and Jain, Siddarth and Corcodel, Radu and Romeres, Diego and {Le Roux}, Jonathan},
- title     = {{Human Action Understanding-based Robot Planning using Multimodal LLM}},
- booktitle = {IEEE International Conference on Robotics and Automation (ICRA) Workshop},
- year      = {2024},
- month     = jun,
- url       = {https://www.merl.com/publications/TR2024-066},
- }
Uhlich, S., Fabbro, G., Hirano, M., Takahashi, S., Wichern, G., Le Roux, J., Chakraborty, D., Mohanty, S., Li, K., Luo, Y., Yu, J., Gu, R., Solovyev, R., Stempkovskiy, A., Habruseva, T., Sukhovei, M., Mitsufuji, Y., "The Sound Demixing Challenge 2023 – Cinematic Demixing Track", Transactions of the International Society for Music Information Retrieval, DOI: 10.5334/tismir.172, Vol. 7, No. 1, pp. 44-62, May 2024.
BibTeX TR2024-047 PDF
- @article{Uhlich2024may,
- author = {Uhlich, Stefan and Fabbro, Giorgio and Hirano, Masato and Takahashi, Shusuke and Wichern, Gordon and {Le Roux}, Jonathan and Chakraborty, Dipam and Mohanty, Sharada and Li, Kai and Luo, Yi and Yu, Jianwei and Gu, Rongzhi and Solovyev, Roman and Stempkovskiy, Alexander and Habruseva, Tatiana and Sukhovei, Mikhail and Mitsufuji, Yuki},
- title = {{The Sound Demixing Challenge 2023 -- Cinematic Demixing Track}},
- journal = {Transactions of the International Society for Music Information Retrieval},
- year = 2024,
- volume = 7,
- number = 1,
- pages = {44--62},
- month = may,
- doi = {10.5334/tismir.172},
- url = {https://www.merl.com/publications/TR2024-047}
- }
Pan, Z., Wichern, G., Germain, F.G., Subramanian, A., Le Roux, J., "Late Audio-Visual Fusion for In-The-Wild Speaker Diarization", Hands-free Speech Communication and Microphone Arrays (HSCMA), DOI: 10.1109/ICASSPW62465.2024.10626914, April 2024, pp. 174-178.
BibTeX TR2024-029 PDF
- @inproceedings{Pan2024apr,
- author = {Pan, Zexu and Wichern, Gordon and Germain, François G. and Subramanian, Aswin and {Le Roux}, Jonathan},
- title = {{Late Audio-Visual Fusion for In-The-Wild Speaker Diarization}},
- booktitle = {Hands-free Speech Communication and Microphone Arrays (HSCMA)},
- year = 2024,
- pages = {174--178},
- month = apr,
- publisher = {IEEE},
- doi = {10.1109/ICASSPW62465.2024.10626914},
- isbn = {979-8-3503-7451-3},
- url = {https://www.merl.com/publications/TR2024-029}
- }