% IMPORTANT: The following is UTF-8 encoded.  This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
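%
% For example, a minimal LaTeX preamble pairing this file with biber through
% biblatex might look like the following sketch (the file name
% "references.bib" and the numeric style are illustrative assumptions, not
% requirements of this record):
%
%   \usepackage[backend=biber, style=numeric]{biblatex}
%   \addbibresource{references.bib}
%   ...
%   \printbibliography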

@ARTICLE{Lemercier:601120,
      author       = {Lemercier, Jean-Marie and Richter, Julius and Welker, Simon
                      and Gerkmann, Timo},
      title        = {{S}to{RM}: {A} {D}iffusion-{B}ased {S}tochastic
                      {R}egeneration {M}odel for {S}peech {E}nhancement and
                      {D}ereverberation},
      journal      = {IEEE/ACM Transactions on Audio, Speech, and Language
                      Processing},
      volume       = {31},
      issn         = {2329-9290},
      address      = {New York, NY},
      publisher    = {IEEE},
      reportid     = {PUBDB-2024-00134, arXiv:2212.11851},
      pages        = {2724--2737},
      year         = {2023},
      abstract     = {Diffusion models have shown a great ability to bridge
                      the performance gap between predictive and generative
                      approaches for speech enhancement. We have shown that
                      they may even outperform their predictive counterparts
                      for non-additive corruption types or when they are
                      evaluated on mismatched conditions. However, diffusion
                      models suffer from a high computational burden, mainly
                      because they require running a neural network for each
                      reverse diffusion step, whereas predictive approaches
                      only require one pass. As diffusion models are
                      generative approaches, they may also produce vocalizing
                      and breathing artifacts in adverse conditions. By
                      comparison, in such difficult scenarios, predictive
                      models typically do not produce such artifacts but tend
                      to distort the target speech instead, thereby degrading
                      the speech quality. In this work, we present a
                      stochastic regeneration approach in which an estimate
                      given by a predictive model is provided as a guide for
                      further diffusion. We show that the proposed approach
                      uses the predictive model to remove the vocalizing and
                      breathing artifacts while producing samples of very
                      high quality thanks to the diffusion model, even in
                      adverse conditions. We further show that this approach
                      enables the use of lighter sampling schemes with fewer
                      diffusion steps without sacrificing quality, thus
                      reducing the computational burden by an order of
                      magnitude. Source code and audio examples are available
                      online.},
      cin          = {CFEL-I / FS-CFEL-1-CFEL},
      ddc          = {400},
      cid          = {I:(DE-H253)CFEL-I-20161114 /
                      I:(DE-H253)FS-CFEL-1-CFEL-20210408},
      pnm          = {633 - Life Sciences – Building Blocks of Life: Structure
                      and Function (POF4-633) / HIDSS-0002 - DASHH: Data Science
                      in Hamburg - Helmholtz Graduate School for the Structure of
                      Matter (2019\_IVF-HIDSS-0002)},
      pid          = {G:(DE-HGF)POF4-633 / G:(DE-HGF)2019\_IVF-HIDSS-0002},
      experiment   = {EXP:(DE-MLZ)NOSPEC-20140101},
      typ          = {PUB:(DE-HGF)16},
      eprint       = {2212.11851},
      howpublished = {arXiv:2212.11851},
      archivePrefix = {arXiv},
      UT           = {WOS:001037791600002},
      doi          = {10.1109/TASLP.2023.3294692},
      url          = {https://bib-pubdb1.desy.de/record/601120},
}
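
% For orientation, the stochastic regeneration scheme summarized in the
% abstract above can be sketched as a two-stage process. The notation below
% is ours, not the paper's: D and G stand for the predictive and diffusion
% networks, y for the noisy input, tau for an intermediate diffusion time,
% and sigma(tau) for the noise level at that time.
%
%   \hat{s} = D_\theta(y)                                  (predictive stage)
%   x_\tau  = \hat{s} + \sigma(\tau)\,z, \quad z \sim \mathcal{N}(0, I)
%   \hat{x} = \text{reverse diffusion with } G_\phi \text{ from } x_\tau
%             \text{ to } x_0, \text{ conditioned on } y
%
% That is, the predictive estimate is partially re-noised and only the tail
% of the reverse diffusion process is run, which is what permits lighter
% sampling schemes with fewer diffusion steps than sampling from pure noise.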