% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
@article{Burger:626051,
      author       = {Burger, Martin and Kabri, Samira and Korolev, Yury and
                      Roith, Tim and Weigand, Lukas},
      title        = {Analysis of Mean-Field Models Arising from Self-Attention
                      Dynamics in Transformer Architectures with Layer
                      Normalization},
      journal      = {Philosophical Transactions of the Royal Society A:
                      Mathematical, Physical and Engineering Sciences},
      volume       = {383},
      number       = {2298},
      issn         = {1364-503X},
      address      = {London},
      publisher    = {The Royal Society},
      reportid     = {PUBDB-2025-01273},
      pages        = {20240233},
      year         = {2025},
      note         = {ISSN 1471-2962 not unique: **2 hits**.},
      abstract     = {The aim of this paper is to provide a mathematical
                      analysis of transformer architectures using a
                      self-attention mechanism with layer normalization. In
                      particular, observed patterns in such architectures
                      resembling either clusters or uniform distributions pose
                      a number of challenging mathematical questions. We focus
                      on a special case that admits a gradient flow formulation
                      in the spaces of probability measures on the unit sphere
                      under a special metric, which allows us to give at least
                      partial answers in a rigorous way. The arising
                      mathematical problems resemble those recently studied in
                      aggregation equations, but with additional challenges
                      emerging from restricting the dynamics to the sphere and
                      the particular form of the interaction energy. We provide
                      a rigorous framework for studying the gradient flow,
                      which also suggests a possible metric geometry to study
                      the general case (i.e. one that is not described by a
                      gradient flow). We further analyze the stationary points
                      of the induced self-attention dynamics. The latter are
                      related to stationary points of the interaction energy in
                      the Wasserstein geometry, and we further discuss energy
                      minimizers and maximizers in different parameter
                      settings.},
      cin          = {FS-CI},
      ddc          = {510},
      cid          = {I:(DE-H253)FS-CI-20230420},
      pnm          = {623 - Data Management and Analysis (POF4-623) / DFG project
                      G:(GEPRIS)464101359 - Deep-Learning basierte Regularisierung
                      inverser Probleme (464101359) / DFG project
                      G:(GEPRIS)464101190 - Theoretischer Grundlagen des
                      Unsicherheits-robusten Deep Learning für Inverse Probleme
                      (464101190)},
      pid          = {G:(DE-HGF)POF4-623 / G:(GEPRIS)464101359 /
                      G:(GEPRIS)464101190},
      experiment   = {EXP:(DE-MLZ)NOSPEC-20140101},
      typ          = {PUB:(DE-HGF)16},
      doi          = {10.1098/rsta.2024.0233},
      url          = {https://bib-pubdb1.desy.de/record/626051},
}