% IMPORTANT: The following is UTF-8 encoded. This means that in the presence
% of non-ASCII characters, it will not work with BibTeX 0.99 or older.
% Instead, you should use an up-to-date BibTeX implementation like “bibtex8” or
% “biber”.
% Doctoral thesis record (report id PUBDB-2024-07888; see the doi and url
% fields). Field values are UTF-8, so process with bibtex8 or biber.
% Besides the standard thesis fields, the entry carries repository-specific
% fields (othercontributors, reportid, cin, cid, pnm, pid, experiment, typ);
% bibliography styles ignore field names they do not know.
% NOTE(review): the abstract holds an English text followed by a German one.
% The German text ends mid-sentence ("... der komplizierten.") and is kept
% exactly as delivered -- confirm against the repository record before
% correcting it.
% NOTE(review): pages = 141 is presumably the total page count rather than a
% page range -- confirm; biblatex offers a separate pagetotal field for that.
@phdthesis{Kotobi:619753,
  author            = {Kotobi, Amir},
  othercontributors = {Meissner, Robert and Bari, Sadia and Huber, Patrick},
  title             = {Dynamic structure investigation and spectra prediction of
    biomolecules using machine learning techniques},
  school            = {Technische Universität Hamburg},
  type              = {Dissertation},
  publisher         = {TUHH Universitätsbibliothek},
  reportid          = {PUBDB-2024-07888},
  pages             = {141},
  year              = {2024},
  note              = {Dissertation, Technische Universität Hamburg, 2024},
  abstract          = {The investigation of biomolecular structures and the
    prediction of their spectra using experimental and
    theoretical studies in the gas phase represent fundamental
    steps in comprehending their intrinsic properties and
    biological functions. Nonetheless, the complexity of the
    potential energy surface of biomolecules, combined with
    limitations in computational resources, limits the
    interpretation of experimental observations. Integrating
    supervised and unsupervised machine learning (ML) techniques
    into theoretical calculations is considered as an effective
    way to address these challenges. Infrared (IR) and X-ray
    absorption spectroscopy (XAS) has proven to be powerful
    experimental techniques to study the electronic and spatial
    structure of biomolecules such as peptides and proteins.
    Reproducing and validating the features observed in spectra
    resulting from these experiments often requires the use of
    sophisticated ab initio calculations and comprehensive
    understanding of biomolecules’ configurational space. In
    this thesis, I introduced a novel approach in interpretation
    of IR experimental spectrum of a peptide which aims
    enhancing the exploratory power of searching configurational
    space by combining REMD simulations, unsupervised machine
    learning, and ab initio calculations. This scheme relies on
    a set of structural descriptors and data-driven clustering
    technique which accounts for canonical ensemble of real
    experimental condition to obtain an accurate computed
    spectrum. We show that by partitioning the configurational
    space into subensembles of similar conformations i.e.
    clusters, an accurate IR spectrum can be calculated by
    averaging the IR contribution of each representative
    conformer in each cluster, weighted according to the
    population of each cluster. While this approach unravels
    important fingerprints of experimental spectroscopic data,
    the calculation of IR and particularly XAS spectra, due to
    its inherently expensive theoretical computation, is often
    computationally prohibitive task for even medium-sized
    molecules. To remedy the computational obstacles associated
    with spectra prediction, we develop a data-driven
    supervised ML frameworks, i.e. graph neural networks which
    are trained on a custom-generated XAS dataset to find a
    mapping between structures and spectroscopic signals, thus
    bypassing the need for expensive ab initio quantum chemistry
    calculations. To insure the interpretability of GNN
    models’ predictions, we employ feature attribution to
    determine the respective contributions of various atoms in
    the molecules to the peaks observed in the XAS spectrum.
    Within this approach, we show that it is possible to link
    the peaks observed in the spectra to certain core and
    virtual orbitals from the quantum chemical calculations and
    obtain an in-depth understanding of the ML predicted XAS
    spectrum. The results presented in this thesis show that the
    integration of supervised and unsupervised ML techniques can
    effectively enhance the interpretation of spectroscopic data
    and make efficient use of the expensive ab initio
    calculations. Die Infrarot- und
    Röntgenabsorptionsspektroskopie haben sich als
    leistungsfähige experimentelle Instrumente zur die
    elektronischen und strukturellen Feinheiten von
    Biomolekülen, insbesondere Peptiden und Proteinen,
    aufzuklären. Parallel dazu haben die bemerkenswerten
    Fortschritte bei den Rechenkapazitäten die Fähigkeit
    beschleunigt die Fähigkeit, Chemie, Physik und maschinelles
    Lernen in einer echten Symbiose zu kombinieren, wodurch die
    präzise Modellierung und Verständnis komplexer
    biomolekularer Prozesse auf atomarer Ebene und die
    Validierung von experimentell beobachteten
    Spektralmerkmalen. Doch die inhärente Komplexität von
    Peptiden und Proteinen, gekoppelt mit den
    Rechenanforderungen quantenmechanischer Methoden für große
    Systeme stellen jedoch eine große Herausforderung dar, wenn
    es darum geht, die inhärenten Eigenschaften dieser
    Biomolekülen. Um diese Herausforderungen zu bewältigen,
    ist die Einbeziehung von überwachten und unüberwachten
    Techniken des maschinellen Lernens in die
    Molekulardynamik-Simulations-Toolbox erleichtert die das
    komplexe Zusammenspiel interatomarer und intermolekularer
    Wechselwirkungen zu entschlüsseln und den Weg für die den
    Weg für die Vorhersage verschiedener Eigenschaften dieser
    Systeme. Diese Dissertation befasst sich mit Feature und
    Techniken des unüberwachten maschinellen Lernens (z. B.
    Clustering und Dimensionality-Reduction), die auf
    atomistische Datensätze angewandt werden, um zu
    untersuchen, wie diese Techniken die komplexe
    Strukturlandschaft eines Modellpeptids beleuchten können.
    Darüber hinaus werden in dieser Arbeit Graph neuronale
    Netze als leistungsstarker und effizienter Ansatz zur
    Entschlüsselung der komplizierten.},
  keywords          = {Machine learning (Other) / Infrared (IR) (Other) / X-ray
    absorption spectroscopy (XAS) (Other) / Graph neural
    networks (GNN) (Other) / Explainability AI (Other) / Natural
    Sciences and Mathematics::540: Chemistry (Other) / Natural
    Sciences and Mathematics::570: Life Sciences, Biology
    (Other) / Natural Sciences and Mathematics::510: Mathematics
    (Other)},
  cin               = {FS-BIG},
  cid               = {I:(DE-H253)FS-BIG-20220318},
  pnm               = {633 - Life Sciences – Building Blocks of Life: Structure
    and Function (POF4-633) / HIDSS-0002 - DASHH: Data Science
    in Hamburg - Helmholtz Graduate School for the Structure of
    Matter $(2019_IVF-HIDSS-0002)$ / PHGS, VH-GS-500 - PIER
    Helmholtz Graduate School $(2015_IFV-VH-GS-500)$},
  pid               = {G:(DE-HGF)POF4-633 / $G:(DE-HGF)2019_IVF-HIDSS-0002$ /
    $G:(DE-HGF)2015_IFV-VH-GS-500$},
  experiment        = {EXP:(DE-MLZ)NOSPEC-20140101 /
    EXP:(DE-MLZ)External-20140101},
  typ               = {PUB:(DE-HGF)11},
  doi               = {10.15480/882.9689},
  url               = {https://bib-pubdb1.desy.de/record/619753},
}