Publications

Manilow, E., Wichern, G., Seetharaman, P., Le Roux, J., "Cutting Music Source Separation Some Slakh: A Dataset to Study the Impact of Training Data Quality and Quantity", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2019.8937170, October 2019, pp. 45-49.
BibTeX TR2019-124 PDF
- @inproceedings{Manilow2019oct,
- author = {Manilow, Ethan and Wichern, Gordon and Seetharaman, Prem and {Le Roux}, Jonathan},
- title = {{Cutting Music Source Separation Some Slakh: A Dataset to Study the Impact of Training Data Quality and Quantity}},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- year = 2019,
- pages = {45--49},
- month = oct,
- doi = {10.1109/WASPAA.2019.8937170},
- issn = {1947-1629},
- isbn = {978-1-7281-1123-0},
- url = {https://www.merl.com/publications/TR2019-124}
- }
Moritz, N., Hori, T., Le Roux, J., "Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2837, September 2019, pp. 76-80.
BibTeX TR2019-098 PDF
- @inproceedings{Moritz2019sep,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {76--80},
- month = sep,
- doi = {10.21437/Interspeech.2019-2837},
- url = {https://www.merl.com/publications/TR2019-098}
- }
Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "End-to-End Multilingual Multi-Speaker Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-3038, September 2019, pp. 3755-3759.
BibTeX TR2019-101 PDF
- @inproceedings{Seki2019sep,
- author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan and Hershey, John},
- title = {{End-to-End Multilingual Multi-Speaker Speech Recognition}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {3755--3759},
- month = sep,
- doi = {10.21437/Interspeech.2019-3038},
- url = {https://www.merl.com/publications/TR2019-101}
- }
Seki, H., Hori, T., Watanabe, S., Moritz, N., Le Roux, J., "Vectorized Beam Search for CTC-Attention-based Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2860, September 2019, pp. 3825-3829.
BibTeX TR2019-102 PDF
- @inproceedings{Seki2019sep2,
- author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Moritz, Niko and {Le Roux}, Jonathan},
- title = {{Vectorized Beam Search for CTC-Attention-based Speech Recognition}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {3825--3829},
- month = sep,
- doi = {10.21437/Interspeech.2019-2860},
- url = {https://www.merl.com/publications/TR2019-102}
- }
Wichern, G., McQuinn, E., Antognini, J., Flynn, M., Zhu, R., Crow, D., Manilow, E., Le Roux, J., "WHAM!: Extending Speech Separation to Noisy Environments", Interspeech, DOI: 10.21437/Interspeech.2019-2821, September 2019, pp. 1368-1372.
BibTeX TR2019-099 PDF
- @inproceedings{Wichern2019sep,
- author = {Wichern, Gordon and McQuinn, Emmett and Antognini, Joe and Flynn, Michael and Zhu, Richard and Crow, Dwight and Manilow, Ethan and {Le Roux}, Jonathan},
- title = {{WHAM!: Extending Speech Separation to Noisy Environments}},
- booktitle = {Interspeech},
- year = 2019,
- pages = {1368--1372},
- month = sep,
- doi = {10.21437/Interspeech.2019-2821},
- url = {https://www.merl.com/publications/TR2019-099}
- }
Aihara, R., Hanazawa, T., Okato, Y., Wichern, G., Le Roux, J., "Teacher-Student Deep Clustering For Low-Delay Single Channel Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682695, May 2019.
BibTeX TR2019-003 PDF
- @inproceedings{Aihara2019may,
- author = {Aihara, Ryo and Hanazawa, Toshiyuki and Okato, Yohei and Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Teacher-Student Deep Clustering For Low-Delay Single Channel Speech Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8682695},
- url = {https://www.merl.com/publications/TR2019-003}
- }
Hori, T., Astudillo, R., Hayashi, T., Zhang, Y., Watanabe, S., Le Roux, J., "Cycle-Consistency Training for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683307, May 2019.
BibTeX TR2019-002 PDF
- @inproceedings{Hori2019may,
- author = {Hori, Takaaki and Astudillo, Ramon and Hayashi, Tomoki and Zhang, Yu and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Cycle-Consistency Training for End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683307},
- url = {https://www.merl.com/publications/TR2019-002}
- }
Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "The Phasebook: Building Complex Masks via Discrete Representations for Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682587, May 2019.
BibTeX TR2019-008 PDF
- @inproceedings{LeRoux2019may2,
- author = {{Le Roux}, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
- title = {{The Phasebook: Building Complex Masks via Discrete Representations for Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8682587},
- url = {https://www.merl.com/publications/TR2019-008}
- }
Le Roux, J., Wisdom, S., Erdogan, H., Hershey, J., "SDR -- Half-Baked or Well Done?", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683855, May 2019.
BibTeX TR2019-013 PDF
- @inproceedings{LeRoux2019may,
- author = {{Le Roux}, Jonathan and Wisdom, Scott and Erdogan, Hakan and Hershey, John},
- title = {{SDR -- Half-Baked or Well Done?}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683855},
- url = {https://www.merl.com/publications/TR2019-013}
- }
Moritz, N., Hori, T., Le Roux, J., "Triggered Attention for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683510, May 2019.
BibTeX TR2019-015 PDF
- @inproceedings{Moritz2019may,
- author = {Moritz, Niko and Hori, Takaaki and {Le Roux}, Jonathan},
- title = {{Triggered Attention for End-to-End Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683510},
- url = {https://www.merl.com/publications/TR2019-015}
- }
Seetharaman, P., Wichern, G., Le Roux, J., Pardo, B., "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial Clustering on Stereo Mixtures", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683198, May 2019.
BibTeX TR2019-014 PDF
- @inproceedings{Seetharaman2019may2,
- author = {Seetharaman, Prem and Wichern, Gordon and {Le Roux}, Jonathan and Pardo, Bryan},
- title = {{Bootstrapping Single-Channel Source Separation via Unsupervised Spatial Clustering on Stereo Mixtures}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683198},
- url = {https://www.merl.com/publications/TR2019-014}
- }
Seetharaman, P., Wichern, G., Venkataramani, S., Le Roux, J., "Class-Conditional Embeddings for Music Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683007, May 2019.
BibTeX TR2019-004 PDF
- @inproceedings{Seetharaman2019may,
- author = {Seetharaman, Prem and Wichern, Gordon and Venkataramani, Shrikant and {Le Roux}, Jonathan},
- title = {{Class-Conditional Embeddings for Music Source Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2019,
- month = may,
- doi = {10.1109/ICASSP.2019.8683007},
- url = {https://www.merl.com/publications/TR2019-004}
- }
Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "Phasebook and Friends: Leveraging discrete representations for source separation", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2019.2904183, Vol. 13, No. 2, pp. 370-382, March 2019.
BibTeX TR2018-199 PDF
- @article{LeRoux2019mar,
- author = {{Le Roux}, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
- title = {{Phasebook and Friends: Leveraging discrete representations for source separation}},
- journal = {IEEE Journal of Selected Topics in Signal Processing},
- year = 2019,
- volume = 13,
- number = 2,
- pages = {370--382},
- month = mar,
- doi = {10.1109/JSTSP.2019.2904183},
- url = {https://www.merl.com/publications/TR2018-199}
- }
Wichern, G., Le Roux, J., "Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/IWAENC.2018.8521243, September 2018.
BibTeX TR2018-146 PDF
- @inproceedings{Wichern2018sep,
- author = {Wichern, Gordon and {Le Roux}, Jonathan},
- title = {{Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation}},
- booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
- year = 2018,
- month = sep,
- doi = {10.1109/IWAENC.2018.8521243},
- url = {https://www.merl.com/publications/TR2018-146}
- }
Wang, Z.-Q., Le Roux, J., Wang, D., Hershey, J., "End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction", Interspeech, September 2018.
BibTeX TR2018-135 PDF
- @inproceedings{Wang2018sep,
- author = {Wang, Zhong-Qiu and {Le Roux}, Jonathan and Wang, DeLiang and Hershey, John},
- title = {{End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction}},
- booktitle = {Interspeech},
- year = 2018,
- month = sep,
- url = {https://www.merl.com/publications/TR2018-135}
- }
Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "A Purely End-to-end System for Multi-speaker Speech Recognition", Annual Meeting of the Association for Computational Linguistics (ACL), July 2018, pp. 2620-2630.
BibTeX TR2018-104 PDF Video
- @inproceedings{Seki2018jul,
- author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and {Le Roux}, Jonathan and Hershey, John},
- title = {{A Purely End-to-end System for Multi-speaker Speech Recognition}},
- booktitle = {Annual Meeting of the Association for Computational Linguistics (ACL)},
- year = 2018,
- pages = {2620--2630},
- month = jul,
- publisher = {Association for Computational Linguistics},
- url = {https://www.merl.com/publications/TR2018-104}
- }
Erdogan, H., Hershey, J., Watanabe, S., Le Roux, J., "Deep recurrent networks for separation and recognition of single-channel speech in non-stationary background audio" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J.R., Eds., chapter 7, Springer, July 2018.
BibTeX
- @incollection{Erdogan2018jul,
- author = {Erdogan, Hakan and Hershey, John and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Deep recurrent networks for separation and recognition of single-channel speech in non-stationary background audio}},
- booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
- year = 2018,
- editor = {Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J. R.},
- chapter = 7,
- month = jul,
- publisher = {Springer},
- isbn = {978-3-319-64680-0}
- }
Hershey, J., Le Roux, J., Watanabe, S., Wisdom, S., Chen, Z., Isik, Y., "Novel deep architectures in speech processing" in New Era for Robust Speech Recognition: Exploiting Deep Learning, Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J.R., Eds., chapter 6, Springer, July 9, 2018.
BibTeX
- @incollection{Hershey2018jul,
- author = {Hershey, John and {Le Roux}, Jonathan and Watanabe, Shinji and Wisdom, Scott and Chen, Zhuo and Isik, Yusuf},
- title = {{Novel deep architectures in speech processing}},
- booktitle = {New Era for Robust Speech Recognition: Exploiting Deep Learning},
- year = 2018,
- editor = {Watanabe, S. and Delcroix, M. and Metze, F. and Hershey, J. R.},
- chapter = 6,
- month = jul,
- publisher = {Springer},
- isbn = {978-3-319-64680-0}
- }
Seki, H., Watanabe, S., Hori, T., Le Roux, J., Hershey, J.R., "An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462180, April 2018, pp. 4919-4923.
BibTeX TR2018-002 PDF Video
- @inproceedings{Seki2018apr,
- author = {Seki, Hiroshi and Watanabe, Shinji and Hori, Takaaki and {Le Roux}, Jonathan and Hershey, John R.},
- title = {{An End-to-End Language-Tracking Speech Recognizer for Mixed-Language Speech}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {4919--4923},
- month = apr,
- doi = {10.1109/ICASSP.2018.8462180},
- url = {https://www.merl.com/publications/TR2018-002}
- }
Settle, S., Le Roux, J., Hori, T., Watanabe, S., Hershey, J.R., "End-to-End Multi-Speaker Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461893, April 2018, pp. 4819-4823.
BibTeX TR2018-001 PDF Video
- @inproceedings{Settle2018apr,
- author = {Settle, Shane and {Le Roux}, Jonathan and Hori, Takaaki and Watanabe, Shinji and Hershey, John R.},
- title = {{End-to-End Multi-Speaker Speech Recognition}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {4819--4823},
- month = apr,
- doi = {10.1109/ICASSP.2018.8461893},
- url = {https://www.merl.com/publications/TR2018-001}
- }
Wang, Z.-Q., Le Roux, J., Hershey, J.R., "Multi-Channel Deep Clustering: Discriminative Spectral and Spatial Embeddings for Speaker-Independent Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8461639, April 2018, pp. 1-5.
BibTeX TR2018-007 PDF
- @inproceedings{Wang2018apr2,
- author = {Wang, Zhong-Qiu and {Le Roux}, Jonathan and Hershey, John R.},
- title = {{Multi-Channel Deep Clustering: Discriminative Spectral and Spatial Embeddings for Speaker-Independent Speech Separation}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {1--5},
- month = apr,
- doi = {10.1109/ICASSP.2018.8461639},
- url = {https://www.merl.com/publications/TR2018-007}
- }
Wang, Z.-Q., Le Roux, J., Hershey, J.R., "Alternative Objective Functions for Deep Clustering", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2018.8462507, April 2018, pp. 686-690.
BibTeX TR2018-005 PDF
- @inproceedings{Wang2018apr,
- author = {Wang, Zhong-Qiu and {Le Roux}, Jonathan and Hershey, John R.},
- title = {{Alternative Objective Functions for Deep Clustering}},
- booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
- year = 2018,
- pages = {686--690},
- month = apr,
- doi = {10.1109/ICASSP.2018.8462507},
- url = {https://www.merl.com/publications/TR2018-005}
- }
Magron, P., Le Roux, J., Virtanen, T., "Consistent Anisotropic Wiener Filtering for Audio Source Separation", IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA), DOI: 10.1109/WASPAA.2017.8170037, October 2017.
BibTeX TR2017-151 PDF
- @inproceedings{Magron2017oct,
- author = {Magron, Paul and {Le Roux}, Jonathan and Virtanen, Tuomas},
- title = {{Consistent Anisotropic Wiener Filtering for Audio Source Separation}},
- booktitle = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics (WASPAA)},
- year = 2017,
- month = oct,
- doi = {10.1109/WASPAA.2017.8170037},
- url = {https://www.merl.com/publications/TR2017-151}
- }
Tachioka, Y., Narita, T., Miura, I., Uramoto, T., Monta, N., Uenohara, S., Furuya, K., Watanabe, S., Le Roux, J., "Coupled initialization of multi-channel non-negative matrix factorization based on spatial and spectral information", Interspeech, August 2017.
BibTeX TR2017-134 PDF
- @inproceedings{Tachioka2017aug,
- author = {Tachioka, Yuuki and Narita, Tomohiro and Miura, Iori and Uramoto, Takanobu and Monta, Natsuki and Uenohara, Shingo and Furuya, Kenichi and Watanabe, Shinji and {Le Roux}, Jonathan},
- title = {{Coupled initialization of multi-channel non-negative matrix factorization based on spatial and spectral information}},
- booktitle = {Interspeech},
- year = 2017,
- month = aug,
- url = {https://www.merl.com/publications/TR2017-134}
- }
Hayashi, T., Watanabe, S., Toda, T., Hori, T., Le Roux, J., Takeda, K., "Duration-Controlled LSTM for Polyphonic Sound Event Detection", IEEE/ACM Transactions on Audio, Speech, and Language Processing, DOI: 10.1109/TASLP.2017.2740002, Vol. 25, No. 11, August 2017.
BibTeX TR2017-150 PDF
- @article{Hayashi2017aug,
- author = {Hayashi, Tomoki and Watanabe, Shinji and Toda, Tomoki and Hori, Takaaki and {Le Roux}, Jonathan and Takeda, Kazuya},
- title = {{Duration-Controlled LSTM for Polyphonic Sound Event Detection}},
- journal = {IEEE/ACM Transactions on Audio, Speech, and Language Processing},
- year = 2017,
- volume = 25,
- number = 11,
- month = aug,
- doi = {10.1109/TASLP.2017.2740002},
- issn = {2329-9304},
- url = {https://www.merl.com/publications/TR2017-150}
- }