Publications

327 / 3,734 publications found.


  •  Moritz, N., Hori, T., Le Roux, J., "Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2837, September 2019, pp. 76-80.
    BibTeX TR2019-098 PDF
    • @inproceedings{Moritz2019sep,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Unidirectional Neural Network Architectures for End-to-End Automatic Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {76--80},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2837},
    • url = {https://www.merl.com/publications/TR2019-098}
    • }
  •  Seki, H., Hori, T., Watanabe, S., Le Roux, J., Hershey, J., "End-to-End Multilingual Multi-Speaker Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-3038, September 2019, pp. 3755-3759.
    BibTeX TR2019-101 PDF
    • @inproceedings{Seki2019sep,
    • author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Le Roux, Jonathan and Hershey, John},
    • title = {End-to-End Multilingual Multi-Speaker Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {3755--3759},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-3038},
    • url = {https://www.merl.com/publications/TR2019-101}
    • }
  •  Seki, H., Hori, T., Watanabe, S., Moritz, N., Le Roux, J., "Vectorized Beam Search for CTC-Attention-based Speech Recognition", Interspeech, DOI: 10.21437/Interspeech.2019-2860, September 2019, pp. 3825-3829.
    BibTeX TR2019-102 PDF
    • @inproceedings{Seki2019sep2,
    • author = {Seki, Hiroshi and Hori, Takaaki and Watanabe, Shinji and Moritz, Niko and Le Roux, Jonathan},
    • title = {Vectorized Beam Search for CTC-Attention-based Speech Recognition},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {3825--3829},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2860},
    • url = {https://www.merl.com/publications/TR2019-102}
    • }
  •  Wichern, G., McQuinn, E., Antognini, J., Flynn, M., Zhu, R., Crow, D., Manilow, E., Le Roux, J., "WHAM!: Extending Speech Separation to Noisy Environments", Interspeech, DOI: 10.21437/Interspeech.2019-2821, September 2019, pp. 1368-1372.
    BibTeX TR2019-099 PDF
    • @inproceedings{Wichern2019sep,
    • author = {Wichern, Gordon and McQuinn, Emmett and Antognini, Joe and Flynn, Michael and Zhu, Richard and Crow, Dwight and Manilow, Ethan and Le Roux, Jonathan},
    • title = {WHAM!: Extending Speech Separation to Noisy Environments},
    • booktitle = {Interspeech},
    • year = 2019,
    • pages = {1368--1372},
    • month = sep,
    • doi = {10.21437/Interspeech.2019-2821},
    • url = {https://www.merl.com/publications/TR2019-099}
    • }
  •  Yalta, N., Watanabe, S., Hori, T., Nakadai, K., Ogata, T., "CNN-based Multichannel End-to-End Speech Recognition for Everyday Home Environments", European Signal Processing Conference (EUSIPCO), DOI: 10.23919/EUSIPCO.2019.8902524, September 2019, pp. 1-5.
    BibTeX TR2019-094 PDF
    • @inproceedings{Yalta2019sep,
    • author = {Yalta, Nelson and Watanabe, Shinji and Hori, Takaaki and Nakadai, Kazuhiro and Ogata, Tetsuya},
    • title = {CNN-based Multichannel End-to-End Speech Recognition for Everyday Home Environments},
    • booktitle = {European Signal Processing Conference (EUSIPCO)},
    • year = 2019,
    • pages = {1--5},
    • month = sep,
    • doi = {10.23919/EUSIPCO.2019.8902524},
    • url = {https://www.merl.com/publications/TR2019-094}
    • }
  •  Alamri, H., Cartillier, V., Das, A., Wang, J., Lee, S., Anderson, P., Essa, I., Parikh, D., Batra, D., Cherian, A., Marks, T.K., Hori, C., "Audio-Visual Scene-Aware Dialog", IEEE Conference on Computer Vision and Pattern Recognition (CVPR), DOI: 10.1109/CVPR.2019.00774, June 2019, pp. 7550-7559.
    BibTeX TR2019-048 PDF
    • @inproceedings{Alamri2019jun,
    • author = {Alamri, Huda and Cartillier, Vincent and Das, Abhishek and Wang, Jue and Lee, Stefan and Anderson, Peter and Essa, Irfan and Parikh, Devi and Batra, Dhruv and Cherian, Anoop and Marks, Tim K. and Hori, Chiori},
    • title = {Audio-Visual Scene-Aware Dialog},
    • booktitle = {IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
    • year = 2019,
    • pages = {7550--7559},
    • month = jun,
    • doi = {10.1109/CVPR.2019.00774},
    • url = {https://www.merl.com/publications/TR2019-048}
    • }
  •  Aihara, R., Hanazawa, T., Okato, Y., Wichern, G., Le Roux, J., "Teacher-Student Deep Clustering For Low-Delay Channel Speech Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682695, May 2019.
    BibTeX TR2019-003 PDF
    • @inproceedings{Aihara2019may,
    • author = {Aihara, Ryo and Hanazawa, Toshiyuki and Okato, Yohei and Wichern, Gordon and Le Roux, Jonathan},
    • title = {Teacher-Student Deep Clustering For Low-Delay Channel Speech Separation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682695},
    • url = {https://www.merl.com/publications/TR2019-003}
    • }
  •  Baskar, M.K., Burget, L., Watanabe, S., Karafiat, M., Hori, T., Cernocky, J.H., "Promising Accurate Prefix Boosting for Sequence-to-Sequence ASR", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682782, May 2019, pp. 5646-5650.
    BibTeX TR2019-006 PDF
    • @inproceedings{Baskar2019may,
    • author = {Baskar, Murali Karthick and Burget, Lukas and Watanabe, Shinji and Karafiat, Martin and Hori, Takaaki and Cernocky, Jan Honza},
    • title = {Promising Accurate Prefix Boosting for Sequence-to-Sequence ASR},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • pages = {5646--5650},
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682782},
    • issn = {2379-190X},
    • isbn = {978-1-4799-8131-1},
    • url = {https://www.merl.com/publications/TR2019-006}
    • }
  •  Cho, J., Watanabe, S., Hori, T., Baskar, M.K., Inaguma, H., Villalba, J., Dehak, N., "Language Model Integration Based on Memory Control for Sequence to Sequence Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683380, May 2019.
    BibTeX TR2019-007 PDF
    • @inproceedings{Cho2019may,
    • author = {Cho, Jaejin and Watanabe, Shinji and Hori, Takaaki and Baskar, Murali Karthick and Inaguma, Hirofumi and Villalba, Jesus and Dehak, Najim},
    • title = {Language Model Integration Based on Memory Control for Sequence to Sequence Speech Recognition},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683380},
    • url = {https://www.merl.com/publications/TR2019-007}
    • }
  •  Hori, C., Alamri, H., Wang, J., Wichern, G., Hori, T., Cherian, A., Marks, T.K., Cartillier, V., Lopes, R., Das, A., Essa, I., Batra, D., Parikh, D., "End-to-End Audio Visual Scene-Aware Dialog Using Multimodal Attention-Based Video Features", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682583, May 2019.
    BibTeX TR2019-016 PDF
    • @inproceedings{Hori2019may2,
    • author = {Hori, Chiori and Alamri, Huda and Wang, Jue and Wichern, Gordon and Hori, Takaaki and Cherian, Anoop and Marks, Tim K. and Cartillier, Vincent and Lopes, Raphael and Das, Abhishek and Essa, Irfan and Batra, Dhruv and Parikh, Devi},
    • title = {End-to-End Audio Visual Scene-Aware Dialog Using Multimodal Attention-Based Video Features},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682583},
    • url = {https://www.merl.com/publications/TR2019-016}
    • }
  •  Hori, T., Astudillo, R., Hayashi, T., Zhang, Y., Watanabe, S., Le Roux, J., "Cycle-Consistency Training for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683307, May 2019.
    BibTeX TR2019-002 PDF
    • @inproceedings{Hori2019may,
    • author = {Hori, Takaaki and Astudillo, Ramon and Hayashi, Tomoki and Zhang, Yu and Watanabe, Shinji and Le Roux, Jonathan},
    • title = {Cycle-Consistency Training for End-to-End Speech Recognition},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683307},
    • url = {https://www.merl.com/publications/TR2019-002}
    • }
  •  Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "The Phasebook: Building Complex Masks via Discrete Representations for Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682587, May 2019.
    BibTeX TR2019-008 PDF
    • @inproceedings{LeRoux2019may2,
    • author = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
    • title = {The Phasebook: Building Complex Masks via Discrete Representations for Source Separation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682587},
    • url = {https://www.merl.com/publications/TR2019-008}
    • }
  •  Le Roux, J., Wisdom, S., Erdogan, H., Hershey, J., "SDR -- Half-Baked or Well Done?", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683855, May 2019.
    BibTeX TR2019-013 PDF
    • @inproceedings{LeRoux2019may,
    • author = {Le Roux, Jonathan and Wisdom, Scott and Erdogan, Hakan and Hershey, John},
    • title = {SDR -- Half-Baked or Well Done?},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683855},
    • url = {https://www.merl.com/publications/TR2019-013}
    • }
  •  Moritz, N., Hori, T., Le Roux, J., "Triggered Attention for End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683510, May 2019.
    BibTeX TR2019-015 PDF
    • @inproceedings{Moritz2019may,
    • author = {Moritz, Niko and Hori, Takaaki and Le Roux, Jonathan},
    • title = {Triggered Attention for End-to-End Speech Recognition},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683510},
    • url = {https://www.merl.com/publications/TR2019-015}
    • }
  •  Seetharaman, P., Wichern, G., Le Roux, J., Pardo, B., "Bootstrapping Single-Channel Source Separation via Unsupervised Spatial Clustering on Stereo Mixtures", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683198, May 2019.
    BibTeX TR2019-014 PDF
    • @inproceedings{Seetharaman2019may2,
    • author = {Seetharaman, Prem and Wichern, Gordon and Le Roux, Jonathan and Pardo, Bryan},
    • title = {Bootstrapping Single-Channel Source Separation via Unsupervised Spatial Clustering on Stereo Mixtures},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683198},
    • url = {https://www.merl.com/publications/TR2019-014}
    • }
  •  Seetharaman, P., Wichern, G., Venkataramani, S., Le Roux, J., "Class-Conditional Embeddings for Music Source Separation", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8683007, May 2019.
    BibTeX TR2019-004 PDF
    • @inproceedings{Seetharaman2019may,
    • author = {Seetharaman, Prem and Wichern, Gordon and Venkataramani, Shrikant and Le Roux, Jonathan},
    • title = {Class-Conditional Embeddings for Music Source Separation},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8683007},
    • url = {https://www.merl.com/publications/TR2019-004}
    • }
  •  Wang, X., Li, R., Mallidi, S.H., Hori, T., Watanabe, S., Hermansky, H., "Stream Attention-Based Multi-Array End-to-End Speech Recognition", IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), DOI: 10.1109/ICASSP.2019.8682650, May 2019.
    BibTeX TR2019-005 PDF
    • @inproceedings{Wang2019may,
    • author = {Wang, Xiaofei and Li, Ruizhi and Mallidi, Sri Harish and Hori, Takaaki and Watanabe, Shinji and Hermansky, Hynek},
    • title = {Stream Attention-Based Multi-Array End-to-End Speech Recognition},
    • booktitle = {IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP)},
    • year = 2019,
    • month = may,
    • doi = {10.1109/ICASSP.2019.8682650},
    • url = {https://www.merl.com/publications/TR2019-005}
    • }
  •  Le Roux, J., Wichern, G., Watanabe, S., Sarroff, A., Hershey, J., "Phasebook and Friends: Leveraging discrete representations for source separation", IEEE Journal of Selected Topics in Signal Processing, DOI: 10.1109/JSTSP.2019.2904183, Vol. 13, No. 2, pp. 370-382, March 2019.
    BibTeX TR2018-199 PDF
    • @article{LeRoux2019mar,
    • author = {Le Roux, Jonathan and Wichern, Gordon and Watanabe, Shinji and Sarroff, Andy and Hershey, John},
    • title = {Phasebook and Friends: Leveraging discrete representations for source separation},
    • journal = {IEEE Journal of Selected Topics in Signal Processing},
    • year = 2019,
    • volume = 13,
    • number = 2,
    • pages = {370--382},
    • month = mar,
    • doi = {10.1109/JSTSP.2019.2904183},
    • url = {https://www.merl.com/publications/TR2018-199}
    • }
  •  d’Haro, L.F., Banchs, R., Hori, C., Li, H., "Automatic Evaluation of End-to-End Dialog Systems with Adequacy-Fluency Metrics", Special issue on DSTC6 in Computer Speech and Language, DOI: 10.1016/j.csl.2018.12.004, Vol. 55, pp. 200-215, March 2019.
    BibTeX TR2018-195 PDF
    • @article{dHaro2019mar,
    • author = {d'Haro, Luis Fernando and Banchs, Rafael and Hori, Chiori and Li, Haizhou},
    • title = {Automatic Evaluation of End-to-End Dialog Systems with Adequacy-Fluency Metrics},
    • journal = {Special issue on DSTC6 in Computer Speech and Language},
    • year = 2019,
    • volume = 55,
    • pages = {200--215},
    • month = mar,
    • publisher = {Elsevier},
    • doi = {10.1016/j.csl.2018.12.004},
    • url = {https://www.merl.com/publications/TR2018-195}
    • }
  •  Cho, J., Baskar, M.K., Li, R., Wiesner, M., Mallidi, S.H., Yalta, N., Karafiat, M., Watanabe, S., Hori, T., "Multilingual Sequence-to-Sequence Speech Recognition: Architecture, Transfer Learning, and Language Modeling", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639655, December 2018.
    BibTeX TR2018-175 PDF
    • @inproceedings{Cho2018dec,
    • author = {Cho, Jaejin and Baskar, Murali Karthick and Li, Ruizhi and Wiesner, Matthew and Mallidi, Sri Harish and Yalta, Nelson and Karafiat, Martin and Watanabe, Shinji and Hori, Takaaki},
    • title = {Multilingual Sequence-to-Sequence Speech Recognition: Architecture, Transfer Learning, and Language Modeling},
    • booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
    • year = 2018,
    • month = dec,
    • doi = {10.1109/SLT.2018.8639655},
    • url = {https://www.merl.com/publications/TR2018-175}
    • }
  •  Hayashi, T., Watanabe, S., Zhang, Y., Toda, T., Hori, T., Astudillo, R., Takeda, K., "Back-Translation-Style Data Augmentation for End-to-End ASR", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639619, December 2018.
    BibTeX TR2018-174 PDF
    • @inproceedings{Hayashi2018dec,
    • author = {Hayashi, Tomoki and Watanabe, Shinji and Zhang, Yu and Toda, Tomoki and Hori, Takaaki and Astudillo, Ramon and Takeda, Kazuya},
    • title = {Back-Translation-Style Data Augmentation for End-to-End ASR},
    • booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
    • year = 2018,
    • month = dec,
    • doi = {10.1109/SLT.2018.8639619},
    • url = {https://www.merl.com/publications/TR2018-174}
    • }
  •  Hori, T., Cho, J., Watanabe, S., "End-to-End Speech Recognition with Word-Based RNN Language Models", IEEE Spoken Language Technology Workshop (SLT), DOI: 10.1109/SLT.2018.8639693, December 2018.
    BibTeX TR2018-176 PDF
    • @inproceedings{Hori2018dec,
    • author = {Hori, Takaaki and Cho, Jaejin and Watanabe, Shinji},
    • title = {End-to-End Speech Recognition with Word-Based RNN Language Models},
    • booktitle = {IEEE Spoken Language Technology Workshop (SLT)},
    • year = 2018,
    • month = dec,
    • doi = {10.1109/SLT.2018.8639693},
    • url = {https://www.merl.com/publications/TR2018-176}
    • }
  •  Hori, T., Wang, W., Koji, Y., Hori, C., Harsham, B.A., Hershey, J., "Adversarial Training and Decoding Strategies for End-to-end Neural Conversation Models", Computer Speech and Language, DOI: 10.1016/j.csl.2018.08.006, Vol. 54, pp. 122-139, December 2018.
    BibTeX TR2018-161 PDF
    • @article{Hori2018dec2,
    • author = {Hori, Takaaki and Wang, Wen and Koji, Yusuke and Hori, Chiori and Harsham, Bret A. and Hershey, John},
    • title = {Adversarial Training and Decoding Strategies for End-to-end Neural Conversation Models},
    • journal = {Computer Speech and Language},
    • year = 2018,
    • volume = 54,
    • pages = {122--139},
    • month = dec,
    • publisher = {Elsevier},
    • doi = {10.1016/j.csl.2018.08.006},
    • url = {https://www.merl.com/publications/TR2018-161}
    • }
  •  Wichern, G., Le Roux, J., "Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation", International Workshop on Acoustic Signal Enhancement (IWAENC), DOI: 10.1109/IWAENC.2018.8521243, September 2018.
    BibTeX TR2018-146 PDF
    • @inproceedings{Wichern2018sep,
    • author = {Wichern, Gordon and Le Roux, Jonathan},
    • title = {Phase Reconstruction with Learned Time-Frequency Representations for Single-Channel Speech Separation},
    • booktitle = {International Workshop on Acoustic Signal Enhancement (IWAENC)},
    • year = 2018,
    • month = sep,
    • doi = {10.1109/IWAENC.2018.8521243},
    • url = {https://www.merl.com/publications/TR2018-146}
    • }
  •  Wang, Z.-Q., Le Roux, J., Wang, D., Hershey, J., "End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction", Interspeech, September 2018.
    BibTeX TR2018-135 PDF
    • @inproceedings{Wang2018sep,
    • author = {Wang, Zhong-Qiu and Le Roux, Jonathan and Wang, DeLiang and Hershey, John},
    • title = {End-to-End Speech Separation with Unfolded Iterative Phase Reconstruction},
    • booktitle = {Interspeech},
    • year = 2018,
    • month = sep,
    • url = {https://www.merl.com/publications/TR2018-135}
    • }