% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@ARTICLE{Lemercier:601120,
author = {Lemercier, Jean-Marie and Richter, Julius and Welker, Simon
and Gerkmann, Timo},
title = {{S}to{RM}: {A} {D}iffusion-{B}ased {S}tochastic
{R}egeneration {M}odel for {S}peech {E}nhancement and
{D}ereverberation},
      journal      = {IEEE/ACM Transactions on Audio, Speech, and
                      Language Processing},
volume = {31},
issn = {2329-9290},
address = {New York, NY},
publisher = {IEEE},
reportid = {PUBDB-2024-00134, arXiv:2212.11851},
      pages        = {2724--2737},
year = {2023},
      abstract     = {Diffusion models have shown a great ability to bridge the
performance gap between predictive and generative approaches
for speech enhancement. We have shown that they may even
outperform their predictive counterparts for non-additive
corruption types or when they are evaluated on mismatched
conditions. However, diffusion models suffer from a high
                      computational burden, mainly because they require running a neural
network for each reverse diffusion step, whereas predictive
approaches only require one pass. As diffusion models are
                      generative approaches, they may also produce vocalizing and
breathing artifacts in adverse conditions. In comparison, in
such difficult scenarios, predictive models typically do not
produce such artifacts but tend to distort the target speech
instead, thereby degrading the speech quality. In this work,
we present a stochastic regeneration approach where an
estimate given by a predictive model is provided as a guide
for further diffusion. We show that the proposed approach
uses the predictive model to remove the vocalizing and
breathing artifacts while producing very high quality
samples thanks to the diffusion model, even in adverse
                      conditions. We further show that this approach enables the use of
                      lighter sampling schemes with fewer diffusion steps
without sacrificing quality, thus lifting the computational
burden by an order of magnitude. Source code and audio
examples are available online.},
cin = {CFEL-I / FS-CFEL-1-CFEL},
ddc = {400},
cid = {I:(DE-H253)CFEL-I-20161114 /
I:(DE-H253)FS-CFEL-1-CFEL-20210408},
pnm = {633 - Life Sciences – Building Blocks of Life: Structure
and Function (POF4-633) / HIDSS-0002 - DASHH: Data Science
in Hamburg - Helmholtz Graduate School for the Structure of
                      Matter (2019\_IVF-HIDSS-0002)},
      pid          = {G:(DE-HGF)POF4-633 / G:(DE-HGF)2019\_IVF-HIDSS-0002},
experiment = {EXP:(DE-MLZ)NOSPEC-20140101},
typ = {PUB:(DE-HGF)16},
eprint = {2212.11851},
howpublished = {arXiv:2212.11851},
archivePrefix = {arXiv},
      SLACcitation = {\%\%CITATION = arXiv:2212.11851;\%\%},
UT = {WOS:001037791600002},
doi = {10.1109/TASLP.2023.3294692},
url = {https://bib-pubdb1.desy.de/record/601120},
}